In [1]:
import pandas as pd
import numpy as np
import os

# Move into the project folder so that the relative paths used further down
# resolve from a known location; the directory is printed before and after.
project_dir = '/share/data/analyses/christa/colopaint3D_fork/spher_colo52_v1'
print(os.getcwd())
os.chdir(project_dir)
print(os.getcwd())

/share/data/analyses/christa/colopaint3D_fork/2D_features
/share/data/analyses/christa/colopaint3D_fork/spher_colo52_v1


In [2]:
# Some function definitions

def list_features(df):
    """Split the column names of ``df`` into feature and metadata columns.

    A column counts as metadata when its name contains the substring
    ``"Metadata_"``; every other column is treated as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    tuple[list, list]
        ``(list_of_selected_features, list_of_metadata)``. Both lists follow
        the column order of ``df``; repeated feature names are listed once.
    """
    is_metadata = df.columns.str.contains("Metadata_")
    list_of_metadata = list(df.columns[is_metadata])
    # dict.fromkeys de-duplicates like a set would, but keeps the original
    # column order so downstream column selections are reproducible.
    list_of_selected_features = list(dict.fromkeys(df.columns[~is_metadata]))
    return list_of_selected_features, list_of_metadata


# Mapping used to agree on Metadata column names: keys are the column names as
# they appear in the input tables, values are the names used from here on.
rename_metadata_dict = {
    'Metadata_Barcode': 'Metadata_barcode', 
    'Metadata_Plate': 'Metadata_Plate',  # identity entry: renaming leaves this column name as is
    'Metadata_well_id': 'Metadata_Well',
    'Metadata_cmpd_plate_well':'Metadata_PlateWell',
    'Metadata_cmpd_pert_type':'Metadata_pert_type',
    'Metadata_cmpd_cmpdname':'Metadata_cmpdname',
    'Metadata_cmpd_target':'Metadata_target',
    'Metadata_cmpd_pathway':'Metadata_pathway',
}

# Metadata columns that every combined profile table is expected to carry.
minimal_metadata = ['Metadata_Well', 'Metadata_barcode', 'Metadata_PlateWell', 
                    'Metadata_cell_line', 'Metadata_cmpd_conc', 'Metadata_cmpdname', 
                    'Metadata_name', 'Metadata_pert_type', 'Metadata_pathway', 'Metadata_target']

# Backward-compatible alias: the list was originally defined under this
# misspelled name, which existing cells still reference.
miminal_metadata = minimal_metadata

def rename_metadata_columns(df, rename_metadata_dict):
    """Return a copy of ``df`` with its columns renamed and de-duplicated.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is left untouched; the renaming is done on a new
        frame rather than in place.
    rename_metadata_dict : dict
        Mapping from existing column names to the new column names.

    Returns
    -------
    pandas.DataFrame
        New frame with renamed columns. When renaming produces several
        columns with the same name, only the first occurrence is kept.
    """
    renamed = df.rename(columns=rename_metadata_dict)
    # duplicated() flags every repeat after the first occurrence of a name
    first_occurrence = ~renamed.columns.duplicated()
    return renamed.loc[:, first_occurrence].copy()


# A function to check if metadata columns are present
def check_metadata_columns(df, minimal_metadata):
    """Report which required metadata columns are absent from ``df``.

    Only columns of ``df`` whose name contains ``"Metadata_"`` count as
    present. Returns the set of entries of ``minimal_metadata`` that are not
    among them (an empty set when nothing is missing).
    """
    is_metadata = df.columns.str.contains("Metadata_")
    present = set(df.columns[is_metadata])
    return {required for required in minimal_metadata if required not in present}


### Combine all profiles

In [3]:
# Load 2D data
file1 = '../2D_features/selected_data_HT29.csv'
file2 = '../2D_features/selected_data_HCT116.csv'

# Read each cell line's table once, tag its rows with the cell line, and
# concatenate in a single call. This avoids growing an initially empty
# DataFrame, a pattern whose concat behaviour is deprecated in recent pandas.
profiles_by_cell_line = [
    pd.read_csv(path, index_col=0).assign(Metadata_cell_line=cell_line)
    for cell_line, path in (('HT29', file1), ('HCT116', file2))
]
data2D = pd.concat(profiles_by_cell_line, axis = 0)

# Organize metadata columns
data2D = rename_metadata_columns(data2D, rename_metadata_dict)
# Print which of the required metadata columns are still missing
print(check_metadata_columns(data2D, miminal_metadata))

# Add the missing metadata column: well and barcode joined with '_'
data2D['Metadata_PlateWell'] = data2D['Metadata_Well'].astype(str) + '_' + data2D['Metadata_barcode'].astype(str)

# Keep only the minimal metadata columns, followed by the feature columns
data2D = data2D[miminal_metadata + list_features(data2D)[0]]


{'Metadata_PlateWell'}


In [4]:
output_dir = "../spher_colo52_v1/1_Data/results/"

# Make sure the destination exists so the save does not fail on a fresh checkout
os.makedirs(output_dir, exist_ok=True)

# Save one parquet file per cell line (same file names and order as before)
for cell_line in ('HCT116', 'HT29'):
    cell_line_rows = data2D[data2D['Metadata_cell_line'] == cell_line]
    cell_line_rows.to_parquet(('{}selected_data_{}_{}.parquet').format(output_dir, '2D', cell_line))