In [1]:
import os
import GEOparse as Geo
# Move the working directory two levels up; the relative paths used later
# ('raw/...', 'data/...') are resolved against it (presumably the repository root).
# NOTE(review): the path is relative, so re-running this cell without restarting
# the kernel moves up two more levels.
os.chdir('../../')
# Show the resulting working directory as the cell output.
os.getcwd()

'/home/max/mcrc-cetuximab-analysis'

In [2]:
import pandas as pd

def decouple(s):
    """Parse the string form of a list of 'key: value' items into a dict.

    The expected input looks like
    "['tissue: primary tumor', 'time point: post-treatment']",
    i.e. ``str()`` of a list of strings.

    Parameters
    ----------
    s : str
        String representation of a list whose items have the form 'key: value'.

    Returns
    -------
    dict
        Mapping of each key to its value (both as strings).

    Raises
    ------
    IndexError
        If an item does not contain the ': ' separator.
    """
    items = s.split("', '")
    # Remove the list wrapper ("['" in front, "']" at the end) whatever the
    # number of items is; with a single item both ends are on the same element.
    if items[0].startswith("['"):
        items[0] = items[0][2:]
    if items[-1].endswith("']"):
        items[-1] = items[-1][:-2]

    dc = {}
    for item in items:
        # Split on the first ': ' only, so a value that itself contains ': '
        # is kept whole.
        parts = item.split(': ', 1)
        dc[parts[0]] = parts[1]
    return dc

def extend_id(id):
    """Left-pad the part after the underscore of a '<prefix>_<suffix>' id with zeros to 4 characters.

    The id must contain exactly one underscore; otherwise unpacking the split
    result raises ValueError. Suffixes of 4 or more characters are left as-is.
    """
    prefix, suffix = id.split('_')
    return '_'.join((prefix, suffix.rjust(4, '0')))

def pad_zeros(s, n):
    """Return `s` left-padded with '0' characters up to a length of `n`.

    Strings that are already `n` characters or longer are returned unchanged.
    """
    return s.rjust(n, '0')

# Read the GEO series from the locally stored SOFT family file.
gse = Geo.get_GEO(filepath="raw/GSE183984_family.soft", silent=True)
print('Platform: ', gse.gpls)

# One single-row frame per sample (GSM), holding that sample's metadata cast to
# strings and indexed by the GSM id. The frames are concatenated in one call
# instead of growing `ann` inside the loop, which avoids the quadratic copying
# and the concat-with-an-empty-frame that the incremental version relied on.
sample_frames = [
    pd.DataFrame([gsm.metadata], index=[gsm_id], dtype=str)
    for gsm_id, gsm in gse.gsms.items()
]
ann = pd.concat(sample_frames)

print('Annotation size after collecting it from .SOFT: ', len(ann))

# Columns the cleaned annotation always has, in this order; any further keys
# found in 'characteristics_ch1' are appended after them.
base_columns = ['sample_id', 'tissue', 'time point', 'treatment response', 'patient_id', 'sample_title']

# Collect one record per sample and build the frame once at the end, rather
# than concatenating a one-row frame onto an (initially empty) frame per sample.
records = []
for raw_characteristics, raw_title in zip(ann['characteristics_ch1'], ann['title']):
    # 'key: value' items of the characteristics field become columns.
    new_row = decouple(str(raw_characteristics))
    title = str(raw_title)
    # The sample id is the text inside the square brackets of the title; the
    # search starts at position 1 to skip the '[' that opens the list repr.
    new_row['sample_id'] = extend_id(title[title.find('[', 1) + 1 : title.find(']')])
    # The sample title is the text between the leading "['" and the first space.
    new_row['sample_title'] = title[2:title.find(' ')]
    # The patient id is every digit of the sample title, zero-padded to 4 characters.
    digits = ''.join(ch for ch in new_row['sample_title'] if ch.isdigit())
    new_row['patient_id'] = pad_zeros(digits, 4)
    records.append(new_row)

new_ann = pd.DataFrame(records, index=ann.index)
extra_columns = [col for col in new_ann.columns if col not in base_columns]
new_ann = new_ann.reindex(columns=base_columns + extra_columns)

ann = new_ann
# Keep the GSM accession as a regular column before re-indexing by sample id.
ann['gsm_id'] = ann.index
ann = ann.set_index('sample_id')

print('Annotation size after creating new columns: ', len(ann))

# Exclude every sample of patient 0005.
# NOTE(review): the reason for excluding this patient is not recorded here — confirm and document.
print('Patient 5 indexes: ', ann.loc[ann['patient_id'] == '0005'].index)
ann = ann.loc[ann['patient_id'] != '0005']
print('Annotation size after deleting patient 0005: ', len(ann))

# Drop one further sample by id. 18R697_0005 needs no separate drop: it belongs
# to patient 0005 and was already removed by the filter above.
ann = ann.drop(['18R294_0001'])
print('Annotation size after deleting 18R697_0005, 18R294_0001: ', len(ann))

# Keep only samples whose tissue does not mention metastases. The explicit
# .copy() makes `ann` an independent frame, so the column assignments that
# follow do not write into a slice of the previous frame (SettingWithCopyWarning).
ann = ann.loc[~ann["tissue"].str.contains("metastases", case=False, na=False)].copy()
print('Annotation size after deleting metastatic samples: ', len(ann))

# A missing time point is set to pre-treatment because that is known from
# another column (treatment response).
ann.loc[ann['time point'].isna(), 'time point'] = 'pre-treatment'

def convert_treatment_response(s):
    """Collapse a raw treatment-response annotation to 'non-PD', 'PD' or 'unknown'.

    Any string containing 'non-PD' maps to 'non-PD', any other string
    containing 'PD' maps to 'PD', and the exact value 'pre-Tx' maps to
    'unknown'. Anything else raises an Exception.
    """
    # 'non-PD' has to be tested first because 'PD' is a substring of it.
    for label in ('non-PD', 'PD'):
        if label in s:
            return label
    if s == 'pre-Tx':
        return 'unknown'
    raise Exception('unexpected treatment response in annotation row')

# Normalise the raw response labels to 'non-PD' / 'PD' / 'unknown'.
ann['treatment response'] = ann['treatment response'].apply(convert_treatment_response)

# patient_id -> known ('non-PD' or 'PD') response of that patient.
treatment_response_for_patient = dict()

# Validate the annotation row by row and collect the known response per patient.
for patient_id, response, time_point in zip(
    ann['patient_id'], ann['treatment response'], ann['time point']
):
    if time_point == 'pre-treatment' and response != 'unknown':
        raise Exception('known response in pre-treatment')

    if time_point == 'post-treatment' and response == 'unknown':
        raise Exception('unknown response in post-treatment')

    if response != 'unknown':
        # All known responses of one patient must agree.
        if patient_id in treatment_response_for_patient and response != treatment_response_for_patient[patient_id]:
            raise Exception('contradictive post-treatment response for patient')

        treatment_response_for_patient[patient_id] = response

# Give every 'unknown' sample the known response of its patient, when there is
# one; patients without a known response map to NaN and stay 'unknown'.
patient_response = ann['patient_id'].map(treatment_response_for_patient)
fill_mask = (ann['treatment response'] == 'unknown') & patient_response.notna()
ann.loc[fill_mask, 'treatment response'] = patient_response[fill_mask].to_numpy()


Platform:  {'GPL16791': <d: GPL16791>}
Annotation size after collecting it from .SOFT:  113
Annotation size after creating new columns:  113
Patient 5 indexes:  Index(['18R689_0022', '18R690_0023', '18R697_0005', '18R176_0020'], dtype='object', name='sample_id')
Annotation size after deleting patient 0005:  109
Annotation size after deleting 18R697_0005, 18R294_0001:  108
Annotation size after deleting metastatic samples:  80


In [3]:
# Display the cleaned annotation table, indexed by sample_id.
ann

Unnamed: 0_level_0,tissue,time point,treatment response,patient_id,sample_title,gsm_id
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
18R649_0010,primary tumor (colorectum),post-treatment,non-PD,0001,POST-R-001,GSM5575333
18R654_0015,primary tumor (colorectum),pre-treatment,non-PD,0001,PAIR-R-001-pre,GSM5575334
18R669_0001,primary tumor (colorectum),post-treatment,non-PD,0002,POST-R-002,GSM5575336
18R670_0002,primary tumor (colorectum),pre-treatment,non-PD,0002,PAIR-R-002,GSM5575337
18R674_0006,primary tumor (colorectum),pre-treatment,unknown,0003,PAIR-R-003,GSM5575339
...,...,...,...,...,...,...
18R454_0027,primary tumor (colorectum),pre-treatment,unknown,0051,PRE-R-051,GSM5575441
18R477_0019,primary tumor (colorectum),pre-treatment,unknown,0052,PRE-R-052,GSM5575442
18R507_0004,primary tumor (colorectum),pre-treatment,non-PD,0010,PRE-10,GSM5575443
18R428_0008,primary tumor (colorectum),pre-treatment,unknown,0054,PRE-R-054,GSM5575444


In [4]:
# Number of samples per tissue value; a sanity check that the metastases filter left only the expected tissue.
ann['tissue'].value_counts()

tissue
primary tumor (colorectum)    80
Name: count, dtype: int64

In [5]:
# Number of samples per time point (missing values were set to 'pre-treatment' earlier).
ann['time point'].value_counts()

time point
pre-treatment     67
post-treatment    13
Name: count, dtype: int64

In [6]:
# Number of samples per treatment response after propagating known responses within each patient.
ann['treatment response'].value_counts()

treatment response
unknown    42
non-PD     27
PD         11
Name: count, dtype: int64

In [7]:
# Number of remaining samples per patient, ordered by patient id.
ann['patient_id'].value_counts().sort_index()

patient_id
0001    3
0002    3
0003    1
0004    3
0006    3
0007    1
0008    3
0009    1
0010    3
0011    2
0012    3
0013    3
0014    3
0015    4
0016    2
0017    1
0018    1
0019    1
0020    2
0021    3
0022    1
0023    1
0024    1
0026    1
0027    1
0028    1
0029    2
0030    1
0031    1
0032    2
0033    1
0034    1
0035    1
0036    1
0037    1
0038    1
0040    1
0041    1
0042    1
0043    1
0044    1
0045    1
0046    1
0047    1
0048    1
0049    1
0050    1
0051    1
0052    1
0054    1
0055    1
0061    1
Name: count, dtype: int64

In [8]:
# Treatment response distribution among post-treatment samples.
ann.loc[ann['time point'] == 'post-treatment', 'treatment response'].value_counts()

treatment response
non-PD    9
PD        4
Name: count, dtype: int64

In [9]:
# Treatment response distribution among pre-treatment samples.
ann.loc[ann['time point'] == 'pre-treatment', 'treatment response'].value_counts()

treatment response
unknown    42
non-PD     18
PD          7
Name: count, dtype: int64

In [10]:
# Write the cleaned annotation to CSV, relative to the working directory set in the first cell.
ann.to_csv('data/ann_maxim.csv')