In [1]:
import pandas as pd
import numpy as np
import os

# Set current working directory
print(os.getcwd())
os.chdir('/share/data/analyses/christa/colopaint3D/spher_colo52_v1')
print(os.getcwd())

/share/data/analyses/christa/colopaint3D/spher_colo52_v1
/share/data/analyses/christa/colopaint3D/spher_colo52_v1


In [2]:
# Some function definitions

def list_features(df):
    # List features
    list_of_selected_features = list(df.columns.values)
    list_of_metadata = list(df.columns[df.columns.str.contains("Metadata_")])
    list_of_selected_features = list(set(list_of_selected_features) - set(list_of_metadata))
    return list_of_selected_features, list_of_metadata


# A function to agree on Metadata columns
rename_metadata_dict = {
    'Metadata_Barcode': 'Metadata_barcode', 
    'Metadata_Plate': 'Metadata_Plate',
    'Metadata_well_id': 'Metadata_Well',
    'Metadata_cmpd_plate_well':'Metadata_PlateWell',
    'Metadata_cmpd_pert_type':'Metadata_pert_type',
    'Metadata_cmpd_cmpdname':'Metadata_cmpdname',
    'Metadata_cmpd_target':'Metadata_target',
    'Metadata_cmpd_pathway':'Metadata_pathway',
}

miminal_metadata = ['Metadata_Well', 'Metadata_barcode', 'Metadata_PlateWell', 
                    'Metadata_cell_line', 'Metadata_cmpd_conc', 'Metadata_cmpdname', 
                    'Metadata_name', 'Metadata_pert_type', 'Metadata_pathway', 'Metadata_target']

def rename_metadata_columns(df, rename_metadata_dict):
    df.rename(columns = rename_metadata_dict, inplace = True)
    df = df.loc[:,~df.columns.duplicated()].copy()
    return df


# A function to check if metadata columns are present
def check_metadata_columns(df, minimal_metadata):
    metadata_columns = list(df.columns[df.columns.str.contains("Metadata_")])
    diff = set(minimal_metadata) - set(metadata_columns)
    return diff


### Combine all profiles

In [3]:
# Load MIP data

file = '1_FeaturesMIP/selected_data_MIP.csv' #

# Load the parquet file and select the cell line of interest
dataMIP = pd.read_csv(file, sep = ',', index_col = 0)  

# Organize metadata columns
dataMIP = rename_metadata_columns(dataMIP, rename_metadata_dict)
print(check_metadata_columns(dataMIP, miminal_metadata))

# Add the missing metadata columns
dataMIP['Metadata_PlateWell'] = dataMIP['Metadata_Well'].astype(str) + '_' + dataMIP['Metadata_barcode'].astype(str)

# Keep only the minimal metadata columns
dataMIP = dataMIP[miminal_metadata + list_features(dataMIP)[0]]

# Add which datatype 
dataMIP['Metadata_data_type'] = 'MIP'


{'Metadata_PlateWell'}


In [4]:
# Load the aggregated data

# List files
dir = '1_FeaturesImages_none'
files = os.listdir('1_FeaturesImages_none' )

data = pd.DataFrame()
for f in files: 
    if f.endswith('.csv'):
        print(f)
        tmp = pd.read_csv(dir + '/' + f, sep = ',')  
        tmp['Metadata_cell_line'] = f.split('.')[0].split('_',)[2]
        data = pd.concat([data, tmp], axis = 0)

# Organize metadata columns
data = rename_metadata_columns(data, rename_metadata_dict)
print(check_metadata_columns(data, miminal_metadata))

# Keep only the minimal metadata columns
data = data[miminal_metadata + list_features(data)[0]]

# Add which datatype 
data['Metadata_data_type'] = 'aggregates'

selected_data_HT29.csv
selected_data_HCT116.csv
set()


In [5]:
# # Load 2D data
# dir_2D = '../../colopaint2D/colo52-analysis-full/results-medmean/'
# files_2D = os.listdir(dir_2D)

# data2D = pd.DataFrame()
# for f in files_2D: 
#     if f.endswith('.csv'): 
#         if 'selected_data_HT29' in f:
#             tmp = pd.read_csv(dir_2D + '/' + f, sep = ',')  
#             tmp['Metadata_cell_line'] = f.split('.')[0].split('_',)[2]
#             print(f)
#             data2D = pd.concat([data2D, tmp], axis = 0)
#         elif 'selected_data_HCT116' in f:
#             tmp = pd.read_csv(dir_2D + '/' + f, sep = ',')  
#             tmp['Metadata_cell_line'] = f.split('.')[0].split('_',)[2]
#             print(f)
#             data2D = pd.concat([data2D, tmp], axis = 0)

# # # Organize metadata columns
# # data2D = rename_metadata_columns(data2D, rename_metadata_dict)
# # print(check_metadata_columns(data2D, miminal_metadata))

# # # Add the missing metadata columns
# # data2D['Metadata_PlateWell'] = data2D['Metadata_Well'].astype(str) + '_' + data2D['Metadata_barcode'].astype(str)

# # # Keep only the minimal metadata columns
# # data2D = data2D[miminal_metadata + list_features(data2D)[0]]

# # # Add which datatype 
# # data2D['Metadata_data_type'] = '2D'

# print(data.shape, 
# data2D.shape, 
# dataMIP.shape)



In [6]:
# Load 2D data
file1 = '../../colopaint2D/colo52-analysis-full/results-medmean/selected_data_HT29.csv'
file2 = '../../colopaint2D/colo52-analysis-full/results-medmean/selected_data_HCT116.csv'


data2D = pd.DataFrame()
tmp = pd.read_csv(file1, index_col=0)
tmp['Metadata_cell_line'] = 'HT29'
data2D = pd.concat([data2D, tmp], axis = 0)

tmp = pd.read_csv(file2, index_col=0)
tmp['Metadata_cell_line'] = 'HCT116'
data2D = pd.concat([data2D, tmp], axis = 0)

# Organize metadata columns
data2D = rename_metadata_columns(data2D, rename_metadata_dict)
print(check_metadata_columns(data2D, miminal_metadata))

# Add the missing metadata columns
data2D['Metadata_PlateWell'] = data2D['Metadata_Well'].astype(str) + '_' + data2D['Metadata_barcode'].astype(str)

# Keep only the minimal metadata columns
data2D = data2D[miminal_metadata + list_features(data2D)[0]]

# Add which datatype 
data2D['Metadata_data_type'] = '2D'


print(data.shape, 
data2D.shape, 
dataMIP.shape)

{'Metadata_PlateWell'}
(1607, 790) (2391, 1104) (1460, 485)


In [7]:
# Merge the data
dataset = pd.concat([data, data2D, dataMIP], axis = 0, join='outer', ignore_index=True)

dataset.shape

(5458, 2357)

In [8]:
# Avoid some headaches later on. Replace NaN with np.nan
dataset  = dataset.fillna(np.nan)

In [9]:
# Save the dataset

# Save the data
dir_save = '../'
dataset.to_csv(('selected_data_all.csv').format(dir_save), index=False)

In [10]:
#### Which compounds are in the missing wells? Make a table

In [11]:
# Load metadata (I am missing the concentrations)
metadata = pd.read_csv('spher_colo52-metadata.csv')

In [20]:

metadata['PlateWell'] = metadata.well_id.astype(str) + '_' + metadata.barcode

missing_wells = list(set(metadata.PlateWell.unique()) - set(data.Metadata_PlateWell.unique()))
missing_data = metadata[metadata.PlateWell.isin(missing_wells)]

list_missing = missing_data.groupby(['cell_line','cmpdname','cmpd_conc'])['PlateWell'].value_counts()

In [13]:
print(len(metadata.plate_well))
print(len(data.Metadata_PlateWell.unique()))
print(len(data.Metadata_PlateWell))
print(len(missing_wells))
print(len(missing_data))

1848
1607
1607
241
241


In [33]:
missing_data.query('cell_line == "HCT116"').value_counts('cmpdname').head(20)

cmpdname
Bortezomib (PS-341)                            9
Cobimetinib (GDC-0973, RG7420)                 8
5Z-7-OXOZEAENOL                                5
SN-38                                          5
Crizotinib (PF-02341066)                       5
Palbociclib (PD-0332991) HCl                   5
Vorinostat (SAHA, MK0683)                      4
AZD7762                                        4
Binimetinib (MEK162, ARRY-162, ARRY-438162)    3
Paclitaxel                                     3
PD0325901                                      3
Oxaliplatin                                    3
abemaciclib (LY2835219)                        2
Trametinib (GSK1120212)                        2
Linsitinib (OSI-906)                           2
Vinorelbine Tartrate                           2
Encorafenib (LGX818)                           2
stau                                           2
dmso                                           2
Adavosertib （MK-1775）                          2
dtype: int6

In [34]:
missing_data.query('cell_line == "HT29"').value_counts('cmpdname').head(20)

cmpdname
Vinorelbine Tartrate                           12
Adavosertib （MK-1775）                          11
Bortezomib (PS-341)                            10
Cobimetinib (GDC-0973, RG7420)                  9
Paclitaxel                                      8
5Z-7-OXOZEAENOL                                 6
Tanespimycin (17-AAG)                           6
MK-2206 2HCl                                    6
Crizotinib (PF-02341066)                        6
PD0325901                                       6
AZD7762                                         5
Vorinostat (SAHA, MK0683)                       5
BMS-754807                                      5
Afatinib (BIBW2992)                             4
Encorafenib (LGX818)                            4
dmso                                            4
fenb                                            3
Vemurafenib (PLX4032, RG7204)                   3
Rabusertib (LY2603618)                          3
Binimetinib (MEK162, ARRY-162, ARRY-43816

In [None]:
list_missing.to_csv('missing_table.csv')

In [38]:

a = missing_data.value_counts('barcode')

In [42]:
np.mean(100 - (a / 308) *100)
np.std(100 - (a / 308) *100)

3.982698687155238