In [1]:
import os
import pandas as pd

In [2]:
# All inputs and outputs live under one project directory; build the
# individual paths from it so the prefix is written only once.
PROJECT_ROOT = "/data/pinello/PROJECTS/2023_08_ZL/kaggle_scp"
DATA_DIR = f"{PROJECT_ROOT}/data"

# Input files.
SAMPLE_SUBMISSION = f"{DATA_DIR}/sample_submission.csv"
DE_TRAIN = f"{DATA_DIR}/de_train.parquet"
ID_MAP_CSV = f"{DATA_DIR}/id_map.csv"

# Output directory for the CSV files written by this notebook.
RESULTS_DIR = f"{PROJECT_ROOT}/results/deep_tensor_factorization"

In [3]:
# Create the output directory (including missing parents). exist_ok=True
# replaces the separate existence check, which could race with another
# process creating the directory, and keeps the cell safe to re-run.
os.makedirs(RESULTS_DIR, exist_ok=True)

In [4]:
def convert_to_long_df(df):
    """Reshape a wide expression table into long format.

    The 'sm_lincs_id', 'SMILES' and 'control' columns are discarded. Of the
    remaining columns, 'cell_type' and 'sm_name' are kept as identifiers and
    every other column is unpivoted into a ('gene', 'target') pair, giving
    one output row per (input row, remaining column).

    Parameters
    ----------
    df : pandas.DataFrame
        Wide table; must contain 'cell_type', 'sm_name', 'sm_lincs_id',
        'SMILES' and 'control' columns.

    Returns
    -------
    pandas.DataFrame
        New frame with columns 'cell_type', 'sm_name', 'gene', 'target'.
        The input frame is not modified.
    """
    unused_columns = ['sm_lincs_id', 'SMILES', 'control']
    return (
        df.drop(columns=unused_columns)
          .melt(id_vars=['cell_type', 'sm_name'],
                var_name='gene',
                value_name='target')
    )

In [5]:
# Inputs for the training / validation splits.
df = pd.read_parquet(DE_TRAIN)

# The compounds listed in the id map are the "test compounds". For each of
# the cell types below (NK cells, T CD4+, T CD8+, T reg), the rows for these
# compounds are used as that cell type's validation dataset.
df_id_map = pd.read_csv(ID_MAP_CSV)
# NOTE: the name is misspelled ("compunds") but the split loop refers to it,
# so it is kept as is.
test_compunds = df_id_map['sm_name'].unique()

# Cell-type label as it appears in the data -> short tag used in the names
# of the output files.
cell_type_names = {
    'NK cells': 'nk',
    'T cells CD4+': 't_cd4',
    'T cells CD8+': 't_cd8',
    'T regulatory cells': 't_reg',
}

In [6]:
# Whether a row's compound is a test compound does not depend on the cell
# type being held out, so compute this mask once rather than twice per
# loop iteration.
is_test_compound = df['sm_name'].isin(test_compunds)

# One train/validation split per cell type in cell_type_names.
for key, value in cell_type_names.items():
    print(key)

    # Validation rows: the current cell type restricted to test compounds.
    # Training rows: everything else (the exact complement of the mask).
    is_valid = (df['cell_type'] == key) & is_test_compound
    df_train = df[~is_valid]
    df_valid = df[is_valid]

    # Fix the row order before reshaping so the long tables are written in
    # a consistent (cell_type, sm_name) order.
    df_train = df_train.sort_values(['cell_type', 'sm_name'])
    df_valid = df_valid.sort_values('sm_name')

    df_train = convert_to_long_df(df_train)
    df_valid = convert_to_long_df(df_valid)

    # Files are named with the short tag, e.g. df_train_nk.csv.
    df_train.to_csv(os.path.join(RESULTS_DIR, f'df_train_{value}.csv'))
    df_valid.to_csv(os.path.join(RESULTS_DIR, f'df_valid_{value}.csv'))

NK cells
T cells CD4+
T cells CD8+
T regulatory cells


In [9]:
# Build the long-format test table.
# Both files are read with their first column as the index, so the column
# assignments below are aligned on that index.
df_sample = pd.read_csv(SAMPLE_SUBMISSION, index_col=0)
id_map = pd.read_csv(ID_MAP_CSV, index_col=0)

# Attach the identifying columns from the id map to the sample submission.
for column in ('cell_type', 'sm_name'):
    df_sample[column] = id_map[column]

# Unpivot every remaining column into (gene, predict) rows; the 'predict'
# values are whatever the sample submission file contains.
df_test = df_sample.melt(
    id_vars=['cell_type', 'sm_name'],
    var_name='gene',
    value_name='predict',
)

df_test.to_csv(os.path.join(RESULTS_DIR, 'test.csv'))