In [None]:
import pandas as pd
import numpy as np
import os
import sys

import scipy.stats as stats
import pickle

%pip install combat
%pip install combat --upgrade
from combat.pycombat import pycombat

from pyod.models.pca import PCA
from pyod.models.ecod import ECOD

%load_ext autoreload
%autoreload 2

module_path = os.path.abspath(os.path.join('../src'))
sys.path.insert(0, module_path)

from breastcancerfiles import *
from utils import aggregate_df, plotPCA

# The results shown here are in whole or part based upon data generated by the TCGA Research Network: https://www.cancer.gov/tcga.

In [None]:
# Build the combined TCGA-BRCA expression table, or reuse the pickled copy.
#
# When either cache file is missing, every "*.tsv" file found directly inside
# the folders listed in `allfolders` is read, its `tpm_unstranded` column is
# renamed to `tpm_unstranded_<n>` (n counts files in processing order, starting
# at 1) and the tables are inner-joined on `gene_id`. `gene_name` is taken from
# the first file only. The table and the list of per-patient column names are
# then pickled so later runs can skip the parsing step.
if not (os.path.exists(allpatientdfpath) and os.path.exists(patientcolumnspath)):
    print("Reading from file")
    allpatientsdf = pd.DataFrame()
    patientcolumns = []
    i = 1

    for folderpath in allfolders:
        print(folderpath)

        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # tpm_unstranded_<n> numbering reproducible between runs and machines.
        for f in sorted(os.listdir(folderpath)):
            patientfile = os.path.join(folderpath, f)
            if not (os.path.isfile(patientfile) and f.endswith(".tsv")):
                continue

            patientcolumn = "tpm_unstranded_" + str(i)
            wantedcolumns = ["gene_id", "tpm_unstranded"]
            if i == 1:
                # gene names are identical across files, so keep them once
                wantedcolumns.append("gene_name")

            # NOTE(review): skiprows presumably drops the leading comment line
            # and the four summary rows that follow the header -- confirm
            # against the input file format.
            patientdf = pd.read_csv(
                patientfile,
                delimiter="\t",
                skiprows=[0, 2, 3, 4, 5],
                usecols=wantedcolumns,
            )
            patientdf = patientdf.rename(columns={"tpm_unstranded": patientcolumn})

            if i == 1:
                allpatientsdf = patientdf
            else:
                allpatientsdf = allpatientsdf.merge(patientdf, on="gene_id", how="inner")

            patientcolumns.append(patientcolumn)
            i = i + 1

    # `with` guarantees the handles are closed even if pickling fails.
    with open(allpatientdfpath, "wb") as cachefile:
        pickle.dump(allpatientsdf, cachefile)

    with open(patientcolumnspath, "wb") as cachefile:
        pickle.dump(patientcolumns, cachefile)

# Always read back from the cache so both code paths yield the same objects.
# NOTE: pickle.load executes arbitrary code -- only load caches you created.
with open(allpatientdfpath, "rb") as cachefile:
    allpatientsdf = pickle.load(cachefile)

with open(patientcolumnspath, "rb") as cachefile:
    patientcolumns = pickle.load(cachefile)


In [None]:
# Keep only the part of each gene identifier before the first "." so that the
# two tables can be joined on matching identifiers.
for frame, id_column in ((allpatientsdf, "gene_id"), (ourdataoriginal, "Geneid")):
    frame[id_column] = frame[id_column].map(lambda identifier: identifier.split(".")[0])

In [None]:
# Use ECOD and PCA to remove outlier samples from the TCGA-BRCA dataset.
#
# ECOD flags outliers first; the PCA detector is then fitted on the samples
# ECOD accepted. A sample is kept only when BOTH detectors label it as an
# inlier (label 0). The resulting list of column names is cached as a pickle.
# Delete the cache file to recompute the list after changing this cell.
if not os.path.exists(breastcancerpatientscleanedpath):
    # transpose so that every row is one patient sample
    df = allpatientsdf[patientcolumns].T

    ecod = ECOD(contamination=0.01)
    ecod.fit(df)
    ecod_inlier = np.asarray(ecod.predict(df)) == 0
    clean_df = df[ecod_inlier]

    pca = PCA(contamination=0.02)
    pca.fit(clean_df)
    pca_inlier = np.asarray(pca.predict(df)) == 0

    # Intersect both masks: using the PCA labels alone would let samples that
    # ECOD rejected slip back into the "cleaned" list.
    cleaner_df = df[ecod_inlier & pca_inlier]
    breastcancerpatientscleaned = list(cleaner_df.index)  # the new list of breast cancer patients to use

    with open(breastcancerpatientscleanedpath, "wb") as cachefile:
        pickle.dump(breastcancerpatientscleaned, cachefile)

# NOTE: pickle.load executes arbitrary code -- only load caches you created.
with open(breastcancerpatientscleanedpath, "rb") as cachefile:
    breastcancerpatientscleaned = pickle.load(cachefile)


In [None]:
# Keep only the genes present in both the TCGA table and our own data.
mergeddf = allpatientsdf.merge(
    ourdataoriginal,
    left_on="gene_id",
    right_on="Geneid",
    how="inner",
)

# Report how many genes survive the join and the mean column total afterwards.
print(
    f"Library size of overlapping genes: {len(mergeddf)}"
    f"\nLibrary size of our data: {len(ourdataoriginal)}"
    f"\nLibrary size of the clinical TCGA data: {len(allpatientsdf)}"
)
per_sample_totals = mergeddf.drop(columns=["gene_id", "gene_name", "Geneid"]).sum(axis=0)
print(f"Average transcript count remaining per sample: {per_sample_totals.mean()}")

# there are batch effects present in the comparison
# Column names of our own cell-culture samples, three replicates per condition.
cellculturecolumns = [
    'D-Bewo-CT-M1', 'D-Bewo-CT-M2', 'D-Bewo-CT-M3',
    'D-Bewo-CT-S1', 'D-Bewo-CT-S2', 'D-Bewo-CT-S3',
    'D-Bewo-M1', 'D-Bewo-M2', 'D-Bewo-M3',
    'D-Bewo-S1', 'D-Bewo-S2', 'D-Bewo-S3',
    'D-MCF7-CT-M1', 'D-MCF7-CT-M2', 'D-MCF7-CT-M3',
    'D-MCF7-CT-S1', 'D-MCF7-CT-S2', 'D-MCF7-CT-S3',
    'D-MCF7-M1', 'D-MCF7-M2', 'D-MCF7-M3',
    'D-MCF7-S1', 'D-MCF7-S2', 'D-MCF7-S3',
    'S-Bewo-CT-M1', 'S-Bewo-CT-M2', 'S-Bewo-CT-M3',
    'S-Bewo-CT-S1', 'S-Bewo-CT-S2', 'S-Bewo-CT-S3',
    'S-MCF7-CT-M1', 'S-MCF7-CT-M2', 'S-MCF7-CT-M3',
    'S-MCF7-CT-S1', 'S-MCF7-CT-S2', 'S-MCF7-CT-S3',
]

mergeddf

In [None]:
# Split our cell-culture samples by cell line; a sample belongs to a line when
# the line's name is one of the "-"-separated tokens of its column name.
bewocolumns = [sample for sample in cellculturecolumns if "Bewo" in sample.split("-")]
mcf7columns = [
    sample
    for sample in cellculturecolumns
    if "MCF7" in sample.split("-") and sample not in bewocolumns
]

TPMcolumns = bewocolumns + mcf7columns + patientcolumns

# Build the log-transformed expression table once and cache it as a pickle.
if not os.path.exists(bcdfpath):
    print('bcdf generating')
    # From the merged TCGA + cell-culture table keep our samples and ALL TCGA
    # patient columns; the outlier-cleaned patient subset is selected later.
    bcdf = mergeddf[TPMcolumns + ["gene_name", "gene_id"]].copy()

    # log2(x + 1) transform of every expression value
    bcdf[TPMcolumns] = np.log2(mergeddf[TPMcolumns] + 1)

    # `with` guarantees the handle is closed even if pickling fails.
    with open(bcdfpath, "wb") as cachefile:
        pickle.dump(bcdf, cachefile)

# NOTE: pickle.load executes arbitrary code -- only load caches you created.
with open(bcdfpath, "rb") as cachefile:
    bcdf = pickle.load(cachefile)


In [None]:
# Batch correction with pyComBat (adapted from the pyCombat documentation).
# Columns are ordered: cleaned TCGA patients, then MCF7 samples, then Bewo samples.
expression_columns = breastcancerpatientscleaned + mcf7columns + bewocolumns
df_expression = (
    bcdf[expression_columns + ['gene_id', 'gene_name']]
    .copy()
    .set_index(['gene_id', 'gene_name'])
)

# Remember the row order so it can be restored after the split/concat below.
original_index = df_expression.index

# The two source datasets are only used to count how many columns each batch has.
dataset1 = allpatientsdf[breastcancerpatientscleaned]
dataset2 = ourdataoriginal[mcf7columns + bewocolumns].copy()
datasets = [dataset1, dataset2]

# One batch label per column: 0 for every TCGA column, 1 for every cell-culture column.
batch = [batch_id for batch_id, dataset in enumerate(datasets) for _ in dataset.columns]

# Rows with zero variance across all samples are set aside and not passed to pyComBat.
zero_var_mask = df_expression.var(axis=1) == 0
df_zero_var = df_expression.loc[zero_var_mask]
df_variable = df_expression.loc[~zero_var_mask]

# Correct only the variable rows.
df_corrected_var = pycombat(df_variable, batch)

# Re-attach the untouched zero-variance rows, restore the original row order,
# and turn gene_id / gene_name back into ordinary columns.
df_corrected = (
    pd.concat([df_corrected_var, df_zero_var], axis=0)
    .loc[original_index]
    .reset_index()
)

plotPCA(df_corrected, "PCA of TCGA BRCA vs MCF7 TPM Expression Data after pyCombat correction")

In [None]:
# Sample groups: display label -> list of column names belonging to the group.
samplesdict = {
    "TCGA BRCA samples": breastcancerpatientscleaned,
    "D-MCF7-M": ["D-MCF7-M1", "D-MCF7-M2", "D-MCF7-M3"],
    "D-MCF7-S": ["D-MCF7-S1", "D-MCF7-S2", "D-MCF7-S3"],
    "D-MCF7-CT-S": ["D-MCF7-CT-S1", "D-MCF7-CT-S2", "D-MCF7-CT-S3"],
    "D-MCF7-CT-M": ["D-MCF7-CT-M1", "D-MCF7-CT-M2", "D-MCF7-CT-M3"],
}


def significance(row, populationcol, popmeancol):
    """Return the one-sample t-test p-value for one row.

    The values found under ``populationcol`` form the sample; they are tested
    against the mean of the values found under ``popmeancol``.

    Parameters
    ----------
    row : pandas.Series
        A single row, indexable by the labels in both column lists.
    populationcol : list
        Labels of the values that make up the tested sample.
    popmeancol : list
        Labels of the values whose mean is used as the hypothesised mean.

    Returns
    -------
    float
        The p-value reported by ``scipy.stats.ttest_1samp``.
    """
    sample_values = row[populationcol].astype(float).values
    reference_mean = row[popmeancol].astype(float).mean()
    _, p_value = stats.ttest_1samp(a=sample_values, popmean=reference_mean)
    return p_value


# Column groups for each comparison: the TCGA patients form the tested sample,
# the three cell-culture replicates supply the reference mean.
Mphasecomparison = dict(
    patients=breastcancerpatientscleaned,
    cellcondition=["D-MCF7-M1", "D-MCF7-M2", "D-MCF7-M3"],
)
Sphasecomparison = dict(
    patients=breastcancerpatientscleaned,
    cellcondition=["D-MCF7-S1", "D-MCF7-S2", "D-MCF7-S3"],
)

# Restrict to the genes listed in `breastcancergenes` before doing any work.
# Filtering yields a new frame, so df_corrected itself is left unchanged.
# Rows sharing a gene_name are collapsed to their column-wise maximum.
is_selected_gene = df_corrected['gene_name'].isin(breastcancergenes)
bcdf_look = df_corrected[is_selected_gene].copy().groupby('gene_name').max()

# One p-value column per comparison, computed row by row.
for pvalue_column, comparison in (
    ('p-value for TCGA vs D-MCF7-M', Mphasecomparison),
    ('p-value for TCGA vs D-MCF7-S', Sphasecomparison),
):
    bcdf_look[pvalue_column] = bcdf_look.apply(
        significance,
        axis=1,
        args=(comparison["patients"], comparison["cellcondition"]),
    )


#breastcancerdf.to_excel(breastcancerdfresultspath)

# necessary when you re-run the above on ALL the genes. the duplicates are mostly non-coding genes
#duplicates = pd.concat(g for _, g in resultdf.groupby("gene_name") if len(g) > 1)
#duplicates["gene_name"].unique()



In [None]:
# Statistics to compute per sample group: output label -> aggregation.
functionsdict = dict(mean='mean', STD=np.std)

# p-value columns handed to aggregate_df under their existing names.
pvalue_columns = (
    'p-value for TCGA vs D-MCF7-M',
    'p-value for TCGA vs D-MCF7-S',
)
columnstokeepdict = {column: column for column in pvalue_columns}

resultdf = aggregate_df(bcdf_look, samplesdict, functionsdict, columnstokeepdict)
resultdf

In [None]:
# Export the aggregated statistics to an Excel workbook, one sheet per gene.
# The two p-value columns are left out of the export.
data = resultdf.drop(
    [
        ('p-value for TCGA vs D-MCF7-M', ''),
        ('p-value for TCGA vs D-MCF7-S', ''),
    ],
    axis=1,
)

# TODO: set this to the destination .xlsx file before running the cell.
path = ""
if not path:
    # Fail early with a clear message instead of an obscure writer error.
    raise ValueError("Set `path` to the destination .xlsx file before exporting.")

# The context manager saves and closes the workbook even if a sheet fails.
with pd.ExcelWriter(path, engine='xlsxwriter') as writer:
    for gene in data.index:
        # .loc[[gene]] keeps the result a DataFrame holding only this gene
        data.loc[[gene]].to_excel(writer, sheet_name=gene)