In [1]:
import pandas as pd
from collections import defaultdict

gene_expression_df = pd.read_excel("../data/raw/sample_gene_expression_neuroblastoma.xlsx")
gene_expression_desc = pd.read_excel("../data/raw/sample_gene_expression_neuroblastoma_README.xlsx")

In [2]:
gene_expression_df.columns

Index(['Unnamed: 0', 'Sample Type', 'Tumor', 'Tumor.1', 'Tumor.2', 'Tumor.3',
       'Tumor.4', 'Tumor.5', 'Tumor.6', 'Tumor.7',
       ...
       'NS.137', 'NS.138', 'NS.139', 'NS.140', 'NS.141', 'NS.142', 'NS.143',
       'NS.144', 'NS.145', 'NS.146'],
      dtype='object', length=395)

In [3]:
gene_expression_df

Unnamed: 0.1,Unnamed: 0,Sample Type,Tumor,Tumor.1,Tumor.2,Tumor.3,Tumor.4,Tumor.5,Tumor.6,Tumor.7,...,NS.137,NS.138,NS.139,NS.140,NS.141,NS.142,NS.143,NS.144,NS.145,NS.146
0,Ensembl ID,Gene Symbol,NB.MYCN.A.Tumor.PALEVG_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PALKUC_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PALZZV_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PAMZGT_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PAPBJE_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PAPUNH_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PAPZYP_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PAPZYZ_Alive_High.Risk_Amplified,...,NS.ureter.NS171,NS.ureter.NS172,NS.ureter.NS173,NS.uterus.NS185,NS.uterus.NS177,NS.uterus.NS178,NS.uterus.NS180,NS.uterus.NS186,NS.uterus.NS184,NS.uterus.NS181
1,ENSG00000121410,A1BG,0.987836,1.458878,0.879737,1.759617,2.410172,1.917251,1.440319,1.785339,...,0.665534,0.879104,0.421156,0.660868,0.553003,0.490898,0.550141,0.569498,0.438346,0.407911
2,ENSG00000175899,A2M,4.771729,6.10215,3.064619,6.210056,6.710592,2.215601,6.152289,7.6245,...,6.363013,6.806819,6.139101,7.107079,6.54423,7.305137,6.714097,6.78923,6.761142,7.126564
3,ENSG00000166535,A2ML1,0,0,0.006537,0,0.003568,0.01766,0,0,...,0.091691,0.131466,0.150304,0.007645,0.058151,0.091356,0.068717,0.356167,0.094883,0.042002
4,ENSG00000128274,A4GALT,0.940922,2.478532,0.332491,2.228315,2.819369,0.35549,2.173267,3.358186,...,2.850943,4.178136,4.003085,3.406599,1.636608,2.337835,1.67405,4.443948,1.905303,2.331096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6127,ENSG00000204375,XAGE1E,0.015002,0,0,0,0.146121,0.153385,0.080812,0.139717,...,0,0,0,0,0,0,0,0,0,0
6128,ENSG00000185751,XAGE2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6129,ENSG00000171402,XAGE3,0,0,0,0,0,0.033376,0.03838,0,...,0,0,0,0,0,0,0,0,0,0
6130,ENSG00000171405,XAGE5,0,0,0,0,0,0.299215,0,0.045085,...,0,0,0,0,0,0,0,0,0,0


In [4]:
temp_df = pd.DataFrame(gene_expression_df.iloc[0][2:]).reset_index().rename(columns={"index": "Sample Type", 0: "Patient"})
temp_df["Sample Type"] = temp_df["Sample Type"].apply(lambda x: x.split(".")[0])
temp_df

Unnamed: 0,Sample Type,Patient
0,Tumor,NB.MYCN.A.Tumor.PALEVG_Alive_High.Risk_Amplified
1,Tumor,NB.MYCN.A.Tumor.PALKUC_Alive_High.Risk_Amplified
2,Tumor,NB.MYCN.A.Tumor.PALZZV_Alive_High.Risk_Amplified
3,Tumor,NB.MYCN.A.Tumor.PAMZGT_Alive_High.Risk_Amplified
4,Tumor,NB.MYCN.A.Tumor.PAPBJE_Alive_High.Risk_Amplified
...,...,...
388,NS,NS.uterus.NS178
389,NS,NS.uterus.NS180
390,NS,NS.uterus.NS186
391,NS,NS.uterus.NS184


In [5]:
s = "NB.MYCN.A.Tumor.PALZZV_Alive_High.Risk_Amplified"
s.split(".")


['NB', 'MYCN', 'A', 'Tumor', 'PALZZV_Alive_High', 'Risk_Amplified']

In [6]:
# Schema
## Tumor
# Cancer.MYCN gene.Not amplified/Amplified.Sample Type.Patient ID.Status.Risk Level.Nonamplified/Amplified

## CellLine
# Cancer. MYCN gene. Amplified/Not-amplified.Sample Type.Patient ID)

## NS
# Normal sample. Location of tissue. Identification


def process_risk_level(x):         
    if x[-1][:4] == "Risk":
        x[-2] = x[-2] + x[-1]
        return x[:-1]
    if x[-2][:4] == "Risk":
        x[-2] = x[-2] + x[-1]
        x[-3] = x[-3] + x[-2]
        return x[:-2]
    if "Amplified" in x[-1]:
        if x[-2] != "Tumor":
            x[-2] = x[-2] + x[-1]
            return x[:-1]
    return x



In [7]:
temp_df["Patient_info_parsed"] = temp_df["Patient"].apply(lambda x: x.split("."))
temp_df["Patient_info_parsed"] = temp_df["Patient_info_parsed"].apply(lambda x: process_risk_level(x))


In [8]:
for id, row in temp_df.iterrows():
    print(row["Patient"], row['Patient_info_parsed'])

NB.MYCN.A.Tumor.PALEVG_Alive_High.Risk_Amplified ['NB', 'MYCN', 'A', 'Tumor', 'PALEVG_Alive_HighRisk_Amplified']
NB.MYCN.A.Tumor.PALKUC_Alive_High.Risk_Amplified ['NB', 'MYCN', 'A', 'Tumor', 'PALKUC_Alive_HighRisk_Amplified']
NB.MYCN.A.Tumor.PALZZV_Alive_High.Risk_Amplified ['NB', 'MYCN', 'A', 'Tumor', 'PALZZV_Alive_HighRisk_Amplified']
NB.MYCN.A.Tumor.PAMZGT_Alive_High.Risk_Amplified ['NB', 'MYCN', 'A', 'Tumor', 'PAMZGT_Alive_HighRisk_Amplified']
NB.MYCN.A.Tumor.PAPBJE_Alive_High.Risk_Amplified ['NB', 'MYCN', 'A', 'Tumor', 'PAPBJE_Alive_HighRisk_Amplified']
NB.MYCN.A.Tumor.PAPUNH_Alive_High.Risk_Amplified ['NB', 'MYCN', 'A', 'Tumor', 'PAPUNH_Alive_HighRisk_Amplified']
NB.MYCN.A.Tumor.PAPZYP_Alive_High.Risk_Amplified ['NB', 'MYCN', 'A', 'Tumor', 'PAPZYP_Alive_HighRisk_Amplified']
NB.MYCN.A.Tumor.PAPZYZ_Alive_High.Risk_Amplified ['NB', 'MYCN', 'A', 'Tumor', 'PAPZYZ_Alive_HighRisk_Amplified']
NB.MYCN.A.Tumor.PASSRN_Alive_High.Risk_Amplified ['NB', 'MYCN', 'A', 'Tumor', 'PASSRN_Alive_High

In [9]:
temp_df["Patient_info_parsed_length"] = temp_df["Patient_info_parsed"].apply(lambda x: len(x))
temp_df

Unnamed: 0,Sample Type,Patient,Patient_info_parsed,Patient_info_parsed_length
0,Tumor,NB.MYCN.A.Tumor.PALEVG_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALEVG_Alive_HighRisk_Amp...",5
1,Tumor,NB.MYCN.A.Tumor.PALKUC_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALKUC_Alive_HighRisk_Amp...",5
2,Tumor,NB.MYCN.A.Tumor.PALZZV_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALZZV_Alive_HighRisk_Amp...",5
3,Tumor,NB.MYCN.A.Tumor.PAMZGT_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PAMZGT_Alive_HighRisk_Amp...",5
4,Tumor,NB.MYCN.A.Tumor.PAPBJE_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PAPBJE_Alive_HighRisk_Amp...",5
...,...,...,...,...
388,NS,NS.uterus.NS178,"[NS, uterus, NS178]",3
389,NS,NS.uterus.NS180,"[NS, uterus, NS180]",3
390,NS,NS.uterus.NS186,"[NS, uterus, NS186]",3
391,NS,NS.uterus.NS184,"[NS, uterus, NS184]",3


In [10]:
temp_df["Patient_info_parsed_length"].value_counts()

Patient_info_parsed_length
5    233
3    147
4     10
6      3
Name: count, dtype: int64

In [11]:
tumor_patients_df = temp_df[temp_df["Sample Type"] == "Tumor"]
tumor_patients_df

Unnamed: 0,Sample Type,Patient,Patient_info_parsed,Patient_info_parsed_length
0,Tumor,NB.MYCN.A.Tumor.PALEVG_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALEVG_Alive_HighRisk_Amp...",5
1,Tumor,NB.MYCN.A.Tumor.PALKUC_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALKUC_Alive_HighRisk_Amp...",5
2,Tumor,NB.MYCN.A.Tumor.PALZZV_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALZZV_Alive_HighRisk_Amp...",5
3,Tumor,NB.MYCN.A.Tumor.PAMZGT_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PAMZGT_Alive_HighRisk_Amp...",5
4,Tumor,NB.MYCN.A.Tumor.PAPBJE_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PAPBJE_Alive_HighRisk_Amp...",5
...,...,...,...,...
201,Tumor,NB.Unknown.Tumor.NCI0017.T3,"[NB, Unknown, Tumor, NCI0017, T3]",5
202,Tumor,NB.Unknown.Tumor.NCI0017.T4,"[NB, Unknown, Tumor, NCI0017, T4]",5
203,Tumor,NB.Unknown.Tumor.NCI0108,"[NB, Unknown, Tumor, NCI0108]",4
204,Tumor,NB.Unknown.Tumor.NCI0117,"[NB, Unknown, Tumor, NCI0117]",4


In [12]:
tumor_patients_df["Patient_info_parsed_length"].value_counts()

# 

Patient_info_parsed_length
5    193
4     10
6      3
Name: count, dtype: int64

In [13]:
tumor_patients_df[tumor_patients_df["Patient_info_parsed_length"] == 5]["Patient_info_parsed"].iloc[65]

['NB', 'MYCN', 'NA', 'Tumor', 'PARACM_Alive_HighRisk_NotAmplified']

In [14]:
tumor_patients_df

Unnamed: 0,Sample Type,Patient,Patient_info_parsed,Patient_info_parsed_length
0,Tumor,NB.MYCN.A.Tumor.PALEVG_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALEVG_Alive_HighRisk_Amp...",5
1,Tumor,NB.MYCN.A.Tumor.PALKUC_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALKUC_Alive_HighRisk_Amp...",5
2,Tumor,NB.MYCN.A.Tumor.PALZZV_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALZZV_Alive_HighRisk_Amp...",5
3,Tumor,NB.MYCN.A.Tumor.PAMZGT_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PAMZGT_Alive_HighRisk_Amp...",5
4,Tumor,NB.MYCN.A.Tumor.PAPBJE_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PAPBJE_Alive_HighRisk_Amp...",5
...,...,...,...,...
201,Tumor,NB.Unknown.Tumor.NCI0017.T3,"[NB, Unknown, Tumor, NCI0017, T3]",5
202,Tumor,NB.Unknown.Tumor.NCI0017.T4,"[NB, Unknown, Tumor, NCI0017, T4]",5
203,Tumor,NB.Unknown.Tumor.NCI0108,"[NB, Unknown, Tumor, NCI0108]",4
204,Tumor,NB.Unknown.Tumor.NCI0117,"[NB, Unknown, Tumor, NCI0117]",4


In [15]:
patient_info_list = []
patient_info_split_dict = defaultdict(int)
patient_info_split_dict_unknown = defaultdict(int)
for index, row in tumor_patients_df.iterrows():
    t_level = None
    d_or_r = None

    if row["Patient_info_parsed_length"] == 5: # This follows the schema for tumors
        cancer, mycn_gene, amp_or_non_amp_v1, sample_type, patient_info = row["Patient_info_parsed"]
        patient_info_split = patient_info.split("_")
        patient_info_split_dict[len(patient_info_split)] += 1 # To count the distribution
        
        if mycn_gene == "Unknown":
            # ['NB', 'Unknown', 'Tumor', 'NCI0017', 'T3']
            cancer, mycn_gene, sample_type, patient_info, t_level = row["Patient_info_parsed"]
            print("T level", t_level)
            amp_or_non_amp_v1 = None

        
        # if "Tumor" in amp_or_non_amp_v1:
        #     print(row["Patient"])
        #     print(row["Patient_info_parsed"])
        #     print("Found Tumor in amp_or_non_amp_v1")

        # print(row["Patient"])
        # print(row["Patient_info_parsed"])        
        # print("patient_info_split", patient_info_split)
    
        # List of patients to skip
        if row["Patient"] == "NB.MYCN.A.Tumor.PASHFA_D_Dead_Amplified":
            print("Skiping this patient [NB.MYCN.A.Tumor.PASHFA_D_Dead_Amplified] for further discussion") ## TODO: Discuss with the team on this patient
            continue
        if row["Patient"] == "NB.MYCN.A.Tumor.PASJZC_Alive_Amplified":
            print("Skiping this patient [NB.MYCN.A.Tumor.PASJZC_Alive_Amplified] for further discussion. This has patient_info length of 3. Missing Risk level") ## TODO: Discuss with the team on this patient
            continue
        
        # Patient Information may be of length 4 or 3 or 1
        if len(patient_info_split) == 4:
            print("Patient identification length is 4")
            patient_identification, status, risk_level, amp_or_non_amp = patient_info.split("_")
            if status in ["D", "R"]:
                print("Found D or R in risk_level")
                patient_identification, d_or_r, status, amp_or_non_amp = patient_info.split("_")
                risk_level = None
        elif len(patient_info_split) == 3:
            # print("Patient identification length is 3")
            patient_identification, status, amp_or_non_amp = patient_info.split("_") # Since missing rish level: For this patient specifically: NB.MYCN.A.Tumor.PASJZC_Alive_Amplified
            risk_level = None
        elif len(patient_info_split) == 1:
            # print("Patient identification length is 1")
            patient_identification = patient_info.split("_")[0]
            status, risk_level, amp_or_non_amp = None, None, None
        
        item = {
            "Patient": row["Patient"],
            "Patient_parsed": row["Patient_info_parsed"],
            "Cancer": cancer,
            "Tissue Location": None,
            "gene": mycn_gene,
            "Amplified/Not-amplified": amp_or_non_amp_v1,
            "Sample Type": sample_type,
            "Patient ID": patient_identification,
            "Status": status,
            "d_or_r": d_or_r,
            "Risk Level": risk_level,
            "Nonamplified/Amplified": amp_or_non_amp,
            "T_level": t_level if t_level is not None else None
        }
        patient_info_list.append(item)
    
    if row["Patient_info_parsed_length"] == 6:
        cancer, mycn_gene, amp_or_non_amp_v1, sample_type, patient_identification, t_level = row["Patient_info_parsed"]
        print(row["Patient"])
        print(row["Patient_info_parsed"])
        print("T level", t_level)
        if "Tumor" in amp_or_non_amp_v1:
            print("Found Tumor in amp_or_non_amp_v1")
        item = {
            "Patient": row["Patient"],
            "Patient_parsed": row["Patient_info_parsed"],
            "Cancer": cancer,
            "Tissue Location": None,
            "gene": mycn_gene,
            "Amplified/Not-amplified": amp_or_non_amp_v1,
            "Sample Type": sample_type,
            "Patient ID": patient_identification,
            "Status": status,
            "d_or_r": d_or_r,
            "Risk Level": risk_level,
            "Nonamplified/Amplified": amp_or_non_amp,
            "T_level": t_level
        }
        patient_info_list.append(item)
        
    if row["Patient_info_parsed_length"] == 4:
        cancer, unknown, sample_type, patient_info = row["Patient_info_parsed"]
        patient_info_split = patient_info.split("_")
        patient_info_split_dict_unknown[len(patient_info_split)] += 1 # To count the distribution
        
        ## TODO: Discuss this patient len(patient_info_split) = 4  # NB.Unknown.Tumor.PAPTCR_Dead_High.Risk_Unknown
        if len(patient_info_split) == 4:
            # print("Patient identification length is 4")
            # Eg. ['NB', 'Unknown', 'Tumor', 'PAPTCR_Dead_HighRisk_Unknown']
            patient_identification, status, risk_level, amp_or_non_amp = patient_info.split("_")
            item = {
                "Patient": row["Patient"],
                "Patient_parsed": row["Patient_info_parsed"],
                "Cancer": cancer,
                "Tissue Location": None,
                "gene": unknown,
                "Amplified/Not-amplified": None,
                "Sample Type": sample_type,
                "Patient ID": patient_identification,
                "Status": status,
                "d_or_r": d_or_r,
                "Risk Level": risk_level,
                "Nonamplified/Amplified": amp_or_non_amp,
                "T_level": None
            }
            patient_info_list.append(item)
        
        ## TODO: Discuss why some patients don't have MYCN as the gene # Eg. NB.Unknown.Tumor.NCI0017
        # print("Patient", row["Patient"])
        # print("patient_info_split", patient_info_split)
        if len(patient_info_split) == 1:
            # print("Patient identification length is 1")
            patient_identification = patient_info_split[0]
            item = {
                "Patient": row["Patient"],
                "Patient_parsed": row["Patient_info_parsed"],
                "Cancer": cancer,
                "Tissue Location": None,
                "gene": unknown,
                "Amplified/Not-amplified": None,
                "Sample Type": sample_type,
                "Patient ID": patient_identification,
                "Status": None,
                "d_or_r": d_or_r,
                "Risk Level": None,
                "Nonamplified/Amplified": None,
                "T_level": None
            }
            patient_info_list.append(item)
            



Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Skiping this patient [NB.MYCN.A.Tumor.PASJZC_Alive_Amplified] for further discussion. This has patient_info length of 3. Missing Risk level
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identification length is 4
Patient identificati

In [16]:
print("Distrituion of Patient_info_parsed_length for length = 6", patient_info_split_dict)
print("Distribution of Patient_info_parsed_length for length = 4", patient_info_split_dict_unknown)

Distrituion of Patient_info_parsed_length for length = 6 defaultdict(<class 'int'>, {4: 143, 3: 17, 1: 33})
Distribution of Patient_info_parsed_length for length = 4 defaultdict(<class 'int'>, {4: 1, 1: 9})


In [17]:
tumor_patient_info_list_df = pd.DataFrame(patient_info_list)
tumor_patient_info_list_df

Unnamed: 0,Patient,Patient_parsed,Cancer,Tissue Location,gene,Amplified/Not-amplified,Sample Type,Patient ID,Status,d_or_r,Risk Level,Nonamplified/Amplified,T_level
0,NB.MYCN.A.Tumor.PALEVG_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALEVG_Alive_HighRisk_Amp...",NB,,MYCN,A,Tumor,PALEVG,Alive,,HighRisk,Amplified,
1,NB.MYCN.A.Tumor.PALKUC_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALKUC_Alive_HighRisk_Amp...",NB,,MYCN,A,Tumor,PALKUC,Alive,,HighRisk,Amplified,
2,NB.MYCN.A.Tumor.PALZZV_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALZZV_Alive_HighRisk_Amp...",NB,,MYCN,A,Tumor,PALZZV,Alive,,HighRisk,Amplified,
3,NB.MYCN.A.Tumor.PAMZGT_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PAMZGT_Alive_HighRisk_Amp...",NB,,MYCN,A,Tumor,PAMZGT,Alive,,HighRisk,Amplified,
4,NB.MYCN.A.Tumor.PAPBJE_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PAPBJE_Alive_HighRisk_Amp...",NB,,MYCN,A,Tumor,PAPBJE,Alive,,HighRisk,Amplified,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,NB.Unknown.Tumor.NCI0017.T3,"[NB, Unknown, Tumor, NCI0017, T3]",NB,,Unknown,,Tumor,NCI0017,,,,,T3
200,NB.Unknown.Tumor.NCI0017.T4,"[NB, Unknown, Tumor, NCI0017, T4]",NB,,Unknown,,Tumor,NCI0017,,,,,T4
201,NB.Unknown.Tumor.NCI0108,"[NB, Unknown, Tumor, NCI0108]",NB,,Unknown,,Tumor,NCI0108,,,,,
202,NB.Unknown.Tumor.NCI0117,"[NB, Unknown, Tumor, NCI0117]",NB,,Unknown,,Tumor,NCI0117,,,,,


In [18]:
# patient_info_list_df["Cancer"].value_counts()
# patient_info_list_df["gene"].value_counts()
# patient_info_list_df["Amplified/Not-amplified"].value_counts()
# patient_info_list_df["Sample Type"].value_counts()
# patient_info_list_df["Patient ID"].value_counts()
# patient_info_list_df["Status"].value_counts()
# patient_info_list_df["d_or_r"].value_counts()
# patient_info_list_df["Risk Level"].value_counts()
# patient_info_list_df["Nonamplified/Amplified"].value_counts()
# patient_info_list_df["T_level"].value_counts()
## TODO: Identified some patients duplicates. Eg. PASNPG, PAUDDK etc. 198 unique patients with turmor found
## TODO: Discuss some patients with Status value as Alive, Dead, R, D etc.



In [19]:
# Schema
## Tumor
# Cancer.MYCN gene.Not amplified/Amplified.Sample Type.Patient ID.Status.Risk Level.Nonamplified/Amplified

## CellLine
# Cancer. MYCN gene. Amplified/Not-amplified.Sample Type.Patient ID)

## NS
# Normal sample. Location of tissue. Identification

# "Patient": row["Patient"],
# "Patient_parsed": row["Patient_info_parsed"],
# "Cancer": cancer,
# "gene": unknown,
# "Amplified/Not-amplified": None,
# "Sample Type": sample_type,
# "Patient ID": patient_identification,
# "Status": None,
# "d_or_r": d_or_r,
# "Risk Level": None,
# "Nonamplified/Amplified": None,
# "T_level": None

In [20]:
cellline_patients_df = temp_df[temp_df["Sample Type"] == "CellLine"]
cellline_patients_df

cellline_patient_info_list_df = None
cellline_patient_info_list = []

for id, row in cellline_patients_df.iterrows():
    cancer, mycn_gene, amp_or_non_amp_v1, sample_type, patient_identification = row["Patient_info_parsed"]
    item = {
        "Patient": row["Patient"],
        "Patient_parsed": row["Patient_info_parsed"],
        "Cancer": cancer,
        "Tissue Location": None,
        "gene": mycn_gene,
        "Amplified/Not-amplified": amp_or_non_amp_v1,
        "Sample Type": sample_type,
        "Patient ID": patient_identification,
        "Status": None,
        "d_or_r": None,
        "Risk Level": None,
        "Nonamplified/Amplified": None,
        "T_level": None
    }
    cellline_patient_info_list.append(item)

cellline_patient_info_list_df = pd.DataFrame(cellline_patient_info_list)

In [21]:
cellline_patient_info_list_df

Unnamed: 0,Patient,Patient_parsed,Cancer,Tissue Location,gene,Amplified/Not-amplified,Sample Type,Patient ID,Status,d_or_r,Risk Level,Nonamplified/Amplified,T_level
0,NB.MYCN.A.CellLine.CHP134,"[NB, MYCN, A, CellLine, CHP134]",NB,,MYCN,A,CellLine,CHP134,,,,,
1,NB.MYCN.A.CellLine.CHP-212,"[NB, MYCN, A, CellLine, CHP-212]",NB,,MYCN,A,CellLine,CHP-212,,,,,
2,NB.MYCN.A.CellLine.COGN415,"[NB, MYCN, A, CellLine, COGN415]",NB,,MYCN,A,CellLine,COGN415,,,,,
3,NB.MYCN.A.CellLine.COGN440,"[NB, MYCN, A, CellLine, COGN440]",NB,,MYCN,A,CellLine,COGN440,,,,,
4,NB.MYCN.A.CellLine.COGN453,"[NB, MYCN, A, CellLine, COGN453]",NB,,MYCN,A,CellLine,COGN453,,,,,
5,NB.MYCN.A.CellLine.COGN471,"[NB, MYCN, A, CellLine, COGN471]",NB,,MYCN,A,CellLine,COGN471,,,,,
6,NB.MYCN.A.CellLine.COGN496,"[NB, MYCN, A, CellLine, COGN496]",NB,,MYCN,A,CellLine,COGN496,,,,,
7,NB.MYCN.A.CellLine.COGN519,"[NB, MYCN, A, CellLine, COGN519]",NB,,MYCN,A,CellLine,COGN519,,,,,
8,NB.MYCN.A.CellLine.COGN561,"[NB, MYCN, A, CellLine, COGN561]",NB,,MYCN,A,CellLine,COGN561,,,,,
9,NB.MYCN.A.CellLine.COGN573,"[NB, MYCN, A, CellLine, COGN573]",NB,,MYCN,A,CellLine,COGN573,,,,,


In [22]:
# Schema
## Tumor
# Cancer.MYCN gene.Not amplified/Amplified.Sample Type.Patient ID.Status.Risk Level.Nonamplified/Amplified

## CellLine
# Cancer. MYCN gene. Amplified/Not-amplified.Sample Type.Patient ID)

## NS
# Normal sample. Location of tissue. Identification

In [23]:
ns_patients_df = temp_df[temp_df["Sample Type"] == "NS"]
ns_patients_df
ns_patient_info_list_df = None
ns_patient_info_list = []

for id, row in ns_patients_df.iterrows():
    cancer, tissue_location, patient_identification = row["Patient_info_parsed"]
    item = {
        "Patient": row["Patient"],
        "Patient_parsed": row["Patient_info_parsed"],
        "Cancer": cancer,
        "Tissue Location": tissue_location,
        "gene": None,
        "Amplified/Not-amplified": None,
        "Sample Type": row["Sample Type"],
        "Patient ID": patient_identification,
        "Status": None,
        "d_or_r": None,
        "Risk Level": None,
        "Nonamplified/Amplified": None,
        "T_level": None
    }
    ns_patient_info_list.append(item)

ns_patient_info_list_df = pd.DataFrame(ns_patient_info_list)
    

In [24]:
print(ns_patient_info_list_df.columns)
print(cellline_patient_info_list_df.columns)
print(tumor_patient_info_list_df.columns)

Index(['Patient', 'Patient_parsed', 'Cancer', 'Tissue Location', 'gene',
       'Amplified/Not-amplified', 'Sample Type', 'Patient ID', 'Status',
       'd_or_r', 'Risk Level', 'Nonamplified/Amplified', 'T_level'],
      dtype='object')
Index(['Patient', 'Patient_parsed', 'Cancer', 'Tissue Location', 'gene',
       'Amplified/Not-amplified', 'Sample Type', 'Patient ID', 'Status',
       'd_or_r', 'Risk Level', 'Nonamplified/Amplified', 'T_level'],
      dtype='object')
Index(['Patient', 'Patient_parsed', 'Cancer', 'Tissue Location', 'gene',
       'Amplified/Not-amplified', 'Sample Type', 'Patient ID', 'Status',
       'd_or_r', 'Risk Level', 'Nonamplified/Amplified', 'T_level'],
      dtype='object')


In [25]:
final_df = pd.concat([tumor_patient_info_list_df, cellline_patient_info_list_df, ns_patient_info_list_df])
final_df

Unnamed: 0,Patient,Patient_parsed,Cancer,Tissue Location,gene,Amplified/Not-amplified,Sample Type,Patient ID,Status,d_or_r,Risk Level,Nonamplified/Amplified,T_level
0,NB.MYCN.A.Tumor.PALEVG_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALEVG_Alive_HighRisk_Amp...",NB,,MYCN,A,Tumor,PALEVG,Alive,,HighRisk,Amplified,
1,NB.MYCN.A.Tumor.PALKUC_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALKUC_Alive_HighRisk_Amp...",NB,,MYCN,A,Tumor,PALKUC,Alive,,HighRisk,Amplified,
2,NB.MYCN.A.Tumor.PALZZV_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PALZZV_Alive_HighRisk_Amp...",NB,,MYCN,A,Tumor,PALZZV,Alive,,HighRisk,Amplified,
3,NB.MYCN.A.Tumor.PAMZGT_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PAMZGT_Alive_HighRisk_Amp...",NB,,MYCN,A,Tumor,PAMZGT,Alive,,HighRisk,Amplified,
4,NB.MYCN.A.Tumor.PAPBJE_Alive_High.Risk_Amplified,"[NB, MYCN, A, Tumor, PAPBJE_Alive_HighRisk_Amp...",NB,,MYCN,A,Tumor,PAPBJE,Alive,,HighRisk,Amplified,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,NS.uterus.NS178,"[NS, uterus, NS178]",NS,uterus,,,NS,NS178,,,,,
143,NS.uterus.NS180,"[NS, uterus, NS180]",NS,uterus,,,NS,NS180,,,,,
144,NS.uterus.NS186,"[NS, uterus, NS186]",NS,uterus,,,NS,NS186,,,,,
145,NS.uterus.NS184,"[NS, uterus, NS184]",NS,uterus,,,NS,NS184,,,,,


In [29]:
# uncomment to save to data/preprocessed folder
# final_df.to_csv("../data/preprocessed/patient_info_metadata.csv", index=False)

In [36]:
gene_expression_df

Unnamed: 0.1,Unnamed: 0,Sample Type,Tumor,Tumor.1,Tumor.2,Tumor.3,Tumor.4,Tumor.5,Tumor.6,Tumor.7,...,NS.137,NS.138,NS.139,NS.140,NS.141,NS.142,NS.143,NS.144,NS.145,NS.146
0,Ensembl ID,Gene Symbol,NB.MYCN.A.Tumor.PALEVG_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PALKUC_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PALZZV_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PAMZGT_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PAPBJE_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PAPUNH_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PAPZYP_Alive_High.Risk_Amplified,NB.MYCN.A.Tumor.PAPZYZ_Alive_High.Risk_Amplified,...,NS.ureter.NS171,NS.ureter.NS172,NS.ureter.NS173,NS.uterus.NS185,NS.uterus.NS177,NS.uterus.NS178,NS.uterus.NS180,NS.uterus.NS186,NS.uterus.NS184,NS.uterus.NS181
1,ENSG00000121410,A1BG,0.987836,1.458878,0.879737,1.759617,2.410172,1.917251,1.440319,1.785339,...,0.665534,0.879104,0.421156,0.660868,0.553003,0.490898,0.550141,0.569498,0.438346,0.407911
2,ENSG00000175899,A2M,4.771729,6.10215,3.064619,6.210056,6.710592,2.215601,6.152289,7.6245,...,6.363013,6.806819,6.139101,7.107079,6.54423,7.305137,6.714097,6.78923,6.761142,7.126564
3,ENSG00000166535,A2ML1,0,0,0.006537,0,0.003568,0.01766,0,0,...,0.091691,0.131466,0.150304,0.007645,0.058151,0.091356,0.068717,0.356167,0.094883,0.042002
4,ENSG00000128274,A4GALT,0.940922,2.478532,0.332491,2.228315,2.819369,0.35549,2.173267,3.358186,...,2.850943,4.178136,4.003085,3.406599,1.636608,2.337835,1.67405,4.443948,1.905303,2.331096
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6127,ENSG00000204375,XAGE1E,0.015002,0,0,0,0.146121,0.153385,0.080812,0.139717,...,0,0,0,0,0,0,0,0,0,0
6128,ENSG00000185751,XAGE2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6129,ENSG00000171402,XAGE3,0,0,0,0,0,0.033376,0.03838,0,...,0,0,0,0,0,0,0,0,0,0
6130,ENSG00000171405,XAGE5,0,0,0,0,0,0.299215,0,0.045085,...,0,0,0,0,0,0,0,0,0,0


In [37]:
ensembl_id_list = None
gene_list = None
patient_list = []
for id, row in gene_expression_df.T.reset_index().iterrows():
    if id == 0:
        ensembl_id_list = row[2:].tolist()
    if id == 1: 
        gene_list = row[2:].tolist()
    if id >= 2:
        patient_name = row[0]
        gene_expression = list(row[2:])        
        patient_list.append({patient_name: gene_expression})


In [38]:
patient_names = []
patient_gene_expression = []
for x in patient_list:
    for k, v in x.items():
        patient_names.append(k)
        patient_gene_expression.append(v)

In [40]:
gene_expression_df_per_patient =pd.DataFrame(patient_gene_expression)
gene_expression_df_per_patient.columns = gene_list
gene_expression_df_per_patient["Patient"] = patient_names
gene_expression_df_per_patient.columns
gene_expression_df_per_patient = gene_expression_df_per_patient[["Patient"] + [x for x in list(gene_expression_df_per_patient.columns) if x != "Patient"]]

In [41]:
# Uncomment to save the file
# gene_expression_df_per_patient.to_csv("../data/preprocessed/gene_expression_per_patient.csv", index=False)

In [42]:
output_df = pd.merge(final_df, gene_expression_df_per_patient, left_on="Patient", right_on="Patient", how="left")

In [44]:
# Uncomment to save the file
# output_df.to_csv("../data/preprocessed/patient_info_metadata_gene_expression.csv", index=False)