## Mesenchymal State DE Genes, Correlated with ODE Parameters

In [1]:
import os
import csv
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openpyxl
import shutil

In [2]:
# Working directory at notebook start; the data/output paths defined further down are built from it
currStateDir = os.getcwd()

In [3]:
currStateDir

'/Users/meilumcd/Desktop/EMT-in-cancer/Cross-Dataset Comparisons/Mesenchymal State Genes'

In [24]:
# File & folder names (every path is anchored on currStateDir)

currState = "Mesenchymal"

# Input data and output folders for this state's Log2FC analysis
dataFolder = f"{currStateDir}/Data - Log2FC from Conditions & Genes"
outputFolder = f"{currStateDir}/{currState} State Log2FC Analysis, Genes output"
outputHistogramFolder = f"{outputFolder}/_Log2FC Gene Histograms Across All Conditions, Upregulated"
outputGeneExpressionFolder = f"{outputFolder}/Output - Upregulated {currState} State Genes, Log Gene Expression"
outputGenesVsParamsFolder = f"{currStateDir}/{currState} State Log2FC Analysis, Genes v kparams output"

allConditions_fileName = "_Mesenchymal State Marker Genes, All Conditions.csv"

# Folders that sit next to this notebook's folder (same parent directory)
parentDir = os.path.dirname(currStateDir)
# EMT Marker directory
emtMarkerDir = f"{parentDir}/Markers"
# Percent of cells with gene expression directory
nCellsFolder = f"{parentDir}/nCells"

# KParam correlation folders
k2Pos_Folder = f"{outputGenesVsParamsFolder}/_k2 vs Log2FC Gene, Positively Correlated"
k2Neg_Folder = f"{outputGenesVsParamsFolder}/_k2 vs Log2FC Gene, Negatively Correlated"
k2Pos_ParamCorr_Folder = f"{k2Pos_Folder}/_Gene Correlation Graphs, k2 Pos"
k2Neg_ParamCorr_Folder = f"{k2Neg_Folder}/_Gene Correlation Graphs, k2 Neg"

# Names of the Excel workbooks holding the k2-correlated gene lists
k2Pos_File = "Output - k2 Upregulated, I to M, Genes Speeding Transition Time.xlsx"
k2Neg_File = "Output - k2 Downregulated, I to M, Genes Slowing Transition Time.xlsx"

## Import DE Gene Names

In [25]:
# Import data: table with the list of DE genes of every expt/condition

os.chdir(dataFolder)
# Read inside a `with` block so the file handle is closed as soon as pandas is done;
# passing a bare open(...) to read_csv left the handle open for the rest of the session.
with open(allConditions_fileName, 'r') as allConditions_file:
    DE_genes = pd.read_csv(allConditions_file)

In [26]:
# Each expt/condition with its list of DE genes

DE_genes.head(5)
# NaNs appear here because the conditions do not all have the same number of DE genes
# (using an adj-p-val cutoff, from DE genes calculation, cutting off at maximum # genes)

Unnamed: 0,"1-Pastushenko_Pastushenko, M","2-vanDijk_TGFb-day8_rep1, M","2-vanDijk_TGFb-day8_rep2, M","2-vanDijk_TGFb-day10_rep1, M","2-vanDijk_TGFb-day10_rep2, M","2-vanDijk_Zeb1_rep1, M","2-vanDijk_Zeb1_rep2, M","3-Cook_A549-TGFB1, M","3-Cook_DU145-TGFB1, M","3-Cook_OVCA420-EGF, M","3-Cook_OVCA420-TGFB1, M","3-Cook_OVCA420-TNF, M"
0,YAP1,COL1A1,COL1A1,VIM,COL1A1,COL1A1,COL1A1,TGFBI,UBC,LAMC2,LAMC2,TSPAN8
1,ANKRD1,VIM,FBN1,COL1A1,VIM,IGFBP4,IGFBP4,CCDC80,MDH1,TGM2,MT2A,ANXA4
2,9230110C19RIK,SERPINE1,FN1,COL4A1,COL4A1,VIM,VIM,NPC2,CCT5,TFPI2,TGM2,LCN2
3,BIRC2,SPARC,VIM,CDH2,SERPINE1,FN1,FN1,TPM1,HSP90B1,OCIAD2,GADD45A,MPC2
4,TMEM123,THBS1,S100A6,FBN1,SPARC,S100A6,CTSB,IGFBP7,HSPA5,LAMB3,OCIAD2,FTL


In [27]:
# Genes most frequently found in state

# Every cell of the DE gene table as one long Series (NaN padding included)
flattened_DE_genes = pd.Series(DE_genes.to_numpy().flatten())
# Distinct gene names over all conditions
DE_genes_union = set(flattened_DE_genes.dropna())

# Number of occurrences of each gene in the table, most frequent first
DE_genes_frequency_counts = flattened_DE_genes.value_counts()
DE_genes_frequency_counts.iloc[:10]

TIMP1       11
TGFBI       10
THBS1        9
MSN          9
CYR61        9
SERPINE1     9
CD59         9
SERPINE2     9
B2M          8
MYL6         8
dtype: int64

In [28]:
# # For a certain gene, check which conditions it is in
# currGene = DE_genes_frequency_counts.index[0]
# DE_genes_log2FC.isin([currGene]).any(0)

## Import DE Gene Log2FCs

In [29]:
# Import data
# For every single gene, have the Log2FC of information from each expt/condition
# Using Log2FC(I state / all other states), within same experiment, to compare between experiments
# NOTE(review): the line above says "I state", but the files read here are the
# "<currState> State Marker Genes" files -- presumably the ratio is for currState; confirm.

os.chdir(dataFolder)
all_genes_log2FC = pd.DataFrame([]) # Create df

# Fill df with gene Log2FC values from each condition
# (These values are already output from processing datasets)
log2FC_filePrefix = currState+" State Marker Genes - "
log2FC_fileSuffix = ".csv"
inputFiles = sorted(glob.glob(log2FC_filePrefix+"*"+log2FC_fileSuffix))
if not inputFiles:
    # Fail here with a clear message instead of a KeyError when the columns are reordered later
    raise FileNotFoundError("No '"+log2FC_filePrefix+"*"+log2FC_fileSuffix+"' files found in "+dataFolder)
for inputFile in inputFiles:
    currCondition = pd.read_csv(inputFile, header=None, index_col=0)
    currCondition.index = currCondition.index.rename("Gene") # Index must have same name across all expt conditions
    # Column name is expt condition = file name without prefix and extension.
    # The slice is derived from the prefix so it stays correct if currState changes
    # (a fixed offset of 33 only fits an 11-character state name).
    conditionName = inputFile[len(log2FC_filePrefix):-len(log2FC_fileSuffix)]
    currCondition = currCondition.rename(columns={1: conditionName})
    # Merge the new column with imported conditions
    all_genes_log2FC = all_genes_log2FC.join(currCondition, how='outer')

In [30]:
# Put the condition columns in dataset order
# (a KeyError is raised if any of these conditions was not imported)
all_genes_log2FC = all_genes_log2FC.loc[:, [
    '1-Pastushenko_Pastushenko, M',
    '2-vanDijk_TGFb-day8_rep1, M',
    '2-vanDijk_TGFb-day8_rep2, M',
    '2-vanDijk_TGFb-day10_rep1, M',
    '2-vanDijk_TGFb-day10_rep2, M',
    '2-vanDijk_Zeb1_rep1, M',
    '2-vanDijk_Zeb1_rep2, M',
    '3-Cook_A549-TGFB1, M',
    '3-Cook_DU145-TGFB1, M',
    '3-Cook_OVCA420-EGF, M',
    '3-Cook_OVCA420-TGFB1, M',
    '3-Cook_OVCA420-TNF, M',
]]

In [31]:
# Log2FC rows of every gene that appears in the DE gene table.
# The gene names are sorted before indexing so the row order is the same on every run;
# iterating the set directly gives an order that can differ between Python sessions,
# which then changes the order of tied genes in the sorted outputs further down.
DE_genes_log2FC = all_genes_log2FC.loc[sorted(DE_genes_union, key=str)]

## Import % of Cells with Gene Expression

In [33]:
# Import data
# For every single gene, have the %cells information from each expt/condition
os.chdir(nCellsFolder)
all_genes_nCells = pd.DataFrame([]) # Create df

# Fill df with %cells value from each condition
# (These values are already output from processing datasets)
inputFiles = sorted(glob.glob("*- percent of cells with gene expression.csv"))
for inputFile in inputFiles:
    currCondition = pd.read_csv(inputFile, header=None, index_col=0)
    currCondition.index = currCondition.index.rename("Gene") # Index must have same name across all expt conditions
    # Column name is expt condition: the file name without its first character and
    # without its last 44 characters (the length of " - percent of cells with gene expression.csv")
    # NOTE(review): presumably the file names start with a one-character prefix -- confirm
    conditionName = inputFile[1:-44]
    currCondition = currCondition.rename(columns={1: conditionName})
    # Merge the new column with imported conditions
    all_genes_nCells = all_genes_nCells.join(currCondition, how='outer')

# Discard rows without a gene name, then convert every column to numeric
hasGeneName = all_genes_nCells.index.notnull()
all_genes_nCells = all_genes_nCells[hasGeneName].apply(pd.to_numeric)

# Change order of dataframe to be same as dataset order
nCells_datasetOrder = [
    '1-Pastushenko_Pastushenko',
    '2-vanDijk_TGFb-day8_rep1',
    '2-vanDijk_TGFb-day8_rep2',
    '2-vanDijk_TGFb-day10_rep1',
    '2-vanDijk_TGFb-day10_rep2',
    '2-vanDijk_Zeb1_rep1',
    '2-vanDijk_Zeb1_rep2',
    '3-Cook_A549-TGFB1',
    '3-Cook_DU145-TGFB1',
    '3-Cook_OVCA420-EGF',
    '3-Cook_OVCA420-TGFB1',
    '3-Cook_OVCA420-TNF',
]
all_genes_nCells = all_genes_nCells[nCells_datasetOrder]

# Same column names as DE_genes_log2FC
all_genes_nCells = all_genes_nCells.add_suffix(', M')

# The Pastushenko condition is left out of the %cells table
all_genes_nCells = all_genes_nCells.drop(columns=['1-Pastushenko_Pastushenko, M'])

## Filter DE Gene Log2FCs

In [34]:
# Only conditions with 1 int state (11 total): the first column is removed
firstCondition = DE_genes_log2FC.columns[0]
oneIstate_DE_genes_log2FC_filtered = DE_genes_log2FC.drop(columns=firstCondition).round(3)

# Filter - remove genes with too many 0 log2FC

# Per gene: # samples with non-zero and non-NaN counts
num_conditions = oneIstate_DE_genes_log2FC_filtered.shape[1]
# Exact zeros (after rounding to 3 decimals) and missing values are counted separately
numZeroes_per_gene = oneIstate_DE_genes_log2FC_filtered.eq(0).sum(axis=1)
numNAN_per_gene = oneIstate_DE_genes_log2FC_filtered.isna().sum(axis=1)
numGoodSamples_per_gene = num_conditions - numZeroes_per_gene - numNAN_per_gene

# Keep a gene only if enough conditions carry a usable (non-zero, non-NaN) value
minGoodSamples_per_gene = 7
testGoodSamples = numGoodSamples_per_gene.ge(minGoodSamples_per_gene)

filtered_DE_genes_log2FC = oneIstate_DE_genes_log2FC_filtered[testGoodSamples].copy()

In [35]:
# Filter condition 1
# Only keep genes that are *highly* upregulated in at least
# n_upregulated_conditions_1 of the conditions
n_upregulated_conditions_1 = 1
upregulation_cutoff_1 = 0.58 # log2(1.5), i.e. a fold change of 1.5
# Per gene: the n-th largest log2FC must exceed the cutoff.
# .iloc[-1] picks that value by position; a bare [-1] on a label-indexed Series relies on
# the positional fallback of Series.__getitem__, which pandas has deprecated.
gene_filtering_criteria_1 = filtered_DE_genes_log2FC.apply(lambda s, n: s.nlargest(n).iloc[-1] > upregulation_cutoff_1, axis=1, n=n_upregulated_conditions_1).copy()
# code inspo: https://stackoverflow.com/questions/34518634/finding-highest-values-in-each-row-in-a-data-frame-for-python
filtered_DE_genes_log2FC = filtered_DE_genes_log2FC.loc[gene_filtering_criteria_1].copy()



# Filter condition 2
# Only keep genes that have above-mentioned upregulation + expression in more than 5% of cells in those datasets
min_fraction_cells_2 = 0.05 # NOTE(review): assumes the %cells values are stored as fractions (0-1) -- confirm

# Perc cell expression for current list of filtered DE genes
filtered_DE_genes_nCells = all_genes_nCells.filter(filtered_DE_genes_log2FC.index, axis=0)
# Two conditions: log2FC above upregulation_cutoff_1 and percent cell gene expression of 5%.
# Each (gene, condition) gets 5 for the first and 1 for the second, so a sum of 6 means
# "both" and 5 means "upregulated but expressed in too few cells".
# Plain comparisons give the same 0/5 and 0/1 tables as an element-wise applymap
# (NaN compares False -> 0) without calling the deprecated DataFrame.applymap.
condition_log2FC = (filtered_DE_genes_log2FC > upregulation_cutoff_1).astype("int64") * 5
condition_nCells = (filtered_DE_genes_nCells > min_fraction_cells_2).astype("int64")
gene_filtering_criteria_2_df = condition_log2FC + condition_nCells

def all_high_log2FC_above_5percent_expression(currRow):
    """Tell whether every highly upregulated condition of a gene also passes the %cells test.

    ``currRow`` is one row of per-condition scores in which 5 was added for a log2FC
    above the cutoff and 1 for expression in more than 5% of cells, so 6 means both
    tests passed and 5 means only the log2FC test passed.

    Returns True when at least one entry is >= 5 and the smallest of those entries
    equals 6. Returns False when no entry reaches 5 (this includes rows made only of
    NaN); indexing the empty selection used to raise an IndexError in that case.
    """
    high_log2FC_scores = currRow[currRow >= 5.0]
    if high_log2FC_scores.empty:
        return False
    # The smallest "high" score being 6 means none of them is a bare 5
    return bool(high_log2FC_scores.min() == 6.0)

# Per gene: True when every highly upregulated condition also passes the %cells test
gene_filtering_criteria_2 = gene_filtering_criteria_2_df.apply(all_high_log2FC_above_5percent_expression, axis=1).copy()
# code inspo: https://stackoverflow.com/questions/34518634/finding-highest-values-in-each-row-in-a-data-frame-for-python
filtered_DE_genes_log2FC = filtered_DE_genes_log2FC.loc[gene_filtering_criteria_2].copy()



# Filter condition 3
# Only keep genes that are *moderately* upregulated in multiple conditions
# (the two variables below carry a `_5` suffix although this is filter condition 3)
n_upregulated_conditions_5 = 3
upregulation_cutoff_5 = 0.32 # log2(1.25), i.e. a fold change of 1.25
# Per gene: the n-th largest log2FC must exceed the cutoff. .iloc[-1] selects it by
# position instead of relying on the deprecated positional fallback of a bare [-1].
gene_filtering_criteria_3 = filtered_DE_genes_log2FC.apply(lambda s, n: s.nlargest(n).iloc[-1] > upregulation_cutoff_5, axis=1, n=n_upregulated_conditions_5).copy()
filtered_DE_genes_log2FC = filtered_DE_genes_log2FC.loc[gene_filtering_criteria_3].copy()


# Number of genes
len(filtered_DE_genes_log2FC.index)

909

## Import KParams; Correlate with Gene Log2FCs

In [36]:
# Folders which contain k parameters
# NOTE(review): this is a relative path, so it is resolved against whatever the working
# directory is when os.chdir(currFolder) runs -- confirm the intended base directory.

currFolder = "ODE Model/Param Fits"

Import k1 and k2 parameters from each condition

In [37]:
# File names for conditions with 1 int state

# Dataset strings
dataset_strings = ["2-vanDijk", "3-Cook"]
# Condition strings
vanDijk_conditions = [
    "TGFb-day8_rep1", "TGFb-day8_rep2",
    "TGFb-day10_rep1", "TGFb-day10_rep2",
    "Zeb1_rep1", "Zeb1_rep2",
]
cook_conditions = [
    "A549-TGFB1",
    "DU145-TGFB1",
    "OVCA420-EGF", "OVCA420-TGFB1", "OVCA420-TNF",
]

# File strings
param_str = ", param results.csv"

# Every file is named "<dataset> - <condition>, param results.csv"
vanDijk_files = [f"{dataset_strings[0]} - {condition}{param_str}" for condition in vanDijk_conditions]
cook_files_pseudotime = [f"{dataset_strings[1]} - {condition}{param_str}" for condition in cook_conditions]

# vanDijk conditions first, then Cook conditions
oneIstate_kparam_files = vanDijk_files + cook_files_pseudotime

In [38]:
# Import k params from files

# One entry per file of oneIstate_kparam_files, in that order; each entry is the
# "k1" (resp. "k2") column read from that file
oneIstate_k1_array = []
oneIstate_k2_array = []

# NOTE(review): currFolder is a relative path, so this chdir depends on the working
# directory left behind by the cells run before this one -- confirm.
os.chdir(currFolder)
for inputFile in oneIstate_kparam_files:
    currFile = pd.read_csv(inputFile)
    oneIstate_k1_array.append(currFile["k1"])
    oneIstate_k2_array.append(currFile["k2"])

In [41]:
from scipy.stats import gaussian_kde

In [42]:
def get_MAP(data, padding=1.0, n_grid=1000):
    """Return the position of the highest peak of a Gaussian KDE fitted to ``data``.

    The density is evaluated on ``n_grid`` evenly spaced points covering
    ``[min(data) - padding, max(data) + padding]`` and the grid point with the largest
    density is returned, so the result is only as precise as the grid spacing.
    (This notebook calls the value "MAP"; presumably ``data`` holds samples of a fitted
    parameter -- confirm.)

    Parameters
    ----------
    data : 1-D array-like of numbers
        Sample values the density is estimated from.
    padding : float, default 1.0
        Margin added on both sides of the data range. The default keeps the original
        behaviour; choose a value that suits the scale of the data.
    n_grid : int, default 1000
        Number of grid points the density is evaluated on.

    Returns
    -------
    float
        Grid value at which the estimated density is largest.

    Raises
    ------
    ValueError
        If ``data`` contains NaN or infinite values.
    """
    samples = np.asarray(data, dtype=float)
    if not np.all(np.isfinite(samples)):
        # Fail with a clear message rather than deep inside the KDE computation
        raise ValueError("get_MAP: data must not contain NaN or infinite values")

    kde = gaussian_kde(samples)
    # Define a grid of points where the density will be evaluated
    x_grid = np.linspace(samples.min() - padding, samples.max() + padding, n_grid)
    # Evaluate the KDE on the grid
    kde_values = kde(x_grid)
    # The grid point with the maximum density is the returned value
    return x_grid[np.argmax(kde_values)]

In [43]:
# maximum a posteriori values of kparams per condition

# The conditions (columns of filtered_DE_genes_log2FC) are paired with the imported
# k1 / k2 arrays by position
kparam_MAPs_by_condition = {}
for idx, currColumn in enumerate(filtered_DE_genes_log2FC.columns):
    curr_k1_MAP = get_MAP(oneIstate_k1_array[idx])
    curr_k2_MAP = get_MAP(oneIstate_k2_array[idx])
    kparam_MAPs_by_condition[currColumn] = [curr_k1_MAP, curr_k2_MAP]

# Rows are the two parameters, columns are the conditions
oneIstate_kparam_MAPs = pd.DataFrame(kparam_MAPs_by_condition,
                                     index=pd.Index(["k1","k2"]),
                                     columns=filtered_DE_genes_log2FC.columns)
oneIstate_kparam_MAPs

Unnamed: 0,"2-vanDijk_TGFb-day8_rep1, M","2-vanDijk_TGFb-day8_rep2, M","2-vanDijk_TGFb-day10_rep1, M","2-vanDijk_TGFb-day10_rep2, M","2-vanDijk_Zeb1_rep1, M","2-vanDijk_Zeb1_rep2, M","3-Cook_A549-TGFB1, M","3-Cook_DU145-TGFB1, M","3-Cook_OVCA420-EGF, M","3-Cook_OVCA420-TGFB1, M","3-Cook_OVCA420-TNF, M"
k1,4.769408,6.448965,6.524424,2.312731,2.364699,2.597743,2.383154,1.504473,3.645603,5.057298,2.066455
k2,1.11176,1.402725,1.608853,1.874108,5.063793,6.400208,3.157336,1.569689,1.934629,1.859993,1.678451


Calculate correlations with each gene's Log2FC

In [45]:
# Calculate Spearman's correlation and keep pval < .05

from scipy.stats import spearmanr

pval_cutoff = .05

# Per-condition k1 / k2 values that each gene's Log2FC row is correlated against.
# They are taken from oneIstate_kparam_MAPs: the name oneIstate_kparam_medians used here
# before is not defined in any visible cell of this notebook, so a clean top-to-bottom
# run stopped with a NameError.
# NOTE(review): the printed results stored with the notebook may have been produced with
# a differently computed table -- re-run and compare.
k1_per_condition = oneIstate_kparam_MAPs.loc["k1"]
k2_per_condition = oneIstate_kparam_MAPs.loc["k2"]

# One (correlation, p-value) result per gene
spearmanr_k1_raw = filtered_DE_genes_log2FC.apply(lambda x: spearmanr(x, k1_per_condition, nan_policy='omit'), axis=1)
spearmanr_k2_raw = filtered_DE_genes_log2FC.apply(lambda x: spearmanr(x, k2_per_condition, nan_policy='omit'), axis=1)

spearman_k1 = pd.DataFrame.from_records(spearmanr_k1_raw, columns =['corr', 'pval'], index=spearmanr_k1_raw.index)
spearman_k2 = pd.DataFrame.from_records(spearmanr_k2_raw, columns =['corr', 'pval'], index=spearmanr_k2_raw.index)

print(spearman_k1[spearman_k1['pval'] < pval_cutoff].sort_values('pval'))

# NOTE(review): one test is run per gene and the p-values are used uncorrected --
# confirm that no multiple-testing correction is intended.
oneIstate_corr_k1 = spearman_k1[spearman_k1['pval'] < pval_cutoff]['corr']
oneIstate_corr_k2 = spearman_k2[spearman_k2['pval'] < pval_cutoff]['corr']

             corr      pval
Gene                       
FKBP10   0.909091  0.000106
RYBP     0.890909  0.000233
AAED1    0.872727  0.000455
WDR54    0.854545  0.000807
EIF2AK4  0.845455  0.001045
...           ...       ...
TPM2     0.609091  0.046696
FAM65A   0.609091  0.046696
HDAC5    0.609091  0.046696
EMP3     0.609091  0.046696
PEA15    0.609091  0.046696

[111 rows x 2 columns]


In [46]:
# R^2 per gene: the square of the correlation coefficient between kparam and gene log2FC
oneIstate_rSquared_k1 = oneIstate_corr_k1 * oneIstate_corr_k1
oneIstate_rSquared_k2 = oneIstate_corr_k2 * oneIstate_corr_k2

In [43]:
# Testing other cutoffs
# Threshold on the correlation coefficient: genes count as positively correlated above
# cutoff_value and as negatively correlated below -cutoff_value
cutoff_value = 0.0

### [3] 1 int state - k2 vs Log2FC - positive

In [50]:
# Only keep correlation values beyond cutoff, strongest positive correlation first
isPositivelyCorrelated_k2 = oneIstate_corr_k2 > cutoff_value
oneIstate_corr_k2_positiveCorr = oneIstate_corr_k2[isPositivelyCorrelated_k2].sort_values(ascending=False)
len(oneIstate_corr_k2_positiveCorr)

25

### [4] 1 int state - k2 vs Log2FC - negative

In [51]:
# Only keep correlation values beyond cutoff, strongest negative correlation first
isNegativelyCorrelated_k2 = oneIstate_corr_k2 < -cutoff_value
oneIstate_corr_k2_negativeCorr = oneIstate_corr_k2[isNegativelyCorrelated_k2].sort_values(ascending=True)
len(oneIstate_corr_k2_negativeCorr)

18

## Output Highlighted Lists of Genes

Import markers

In [43]:
# Specific markers
msigdbMarkersCsv = "Markers - MSigDB.csv"
empCookMarkersCsv = "Markers - EMP Cook 2021.csv"
panglaoMarkersCsv = "Markers - PanglaoDB, Epithelial.csv"
# NOTE(review): this is the PanglaoDB *Epithelial* file, while export_gene_list writes
# these genes under the column "Mesenchymal Genes from Panglao" -- confirm which is meant.


def _read_marker_genes(markersCsv, genesToKeep):
    """Return the first-column entries of ``markersCsv`` that are in ``genesToKeep``.

    The file is read without a header row. Entries keep the order (and any repeats)
    they have in the file. Blank rows are skipped instead of raising an IndexError.
    """
    with open(markersCsv) as inputFile:
        return [row[0] for row in csv.reader(inputFile) if row and row[0] in genesToKeep]


# Import MSigDB genes, EMP Cook genes, and PanglaoDB E genes
os.chdir(emtMarkerDir)
oneIstate_DE_genes = filtered_DE_genes_log2FC.index.to_list()
# Set copy of the filtered genes: O(1) membership test for every marker row
oneIstate_DE_genes_set = set(oneIstate_DE_genes)

# Marker genes of each database, restricted to the filtered DE genes
msigdbGenes = _read_marker_genes(msigdbMarkersCsv, oneIstate_DE_genes_set)
empCookGenes = _read_marker_genes(empCookMarkersCsv, oneIstate_DE_genes_set)
panglaoGenes = _read_marker_genes(panglaoMarkersCsv, oneIstate_DE_genes_set)

In [44]:
# Def to export gene lists colored by database

def export_gene_list(currGeneList, fileStringName):
    """Write a gene list to an Excel workbook, colouring genes by marker database.

    The sheet has one row per gene with yes/no columns for membership in the MSigDB,
    EMP Cook and PanglaoDB marker lists. When ``fileStringName`` contains "I to M",
    each gene's k2 correlation (R) and R^2 are added and the rows are sorted by R^2,
    largest first; otherwise the rows are sorted by the last yes/no column.

    Rows are written straight from the dataframe, so the R and R^2 cells are stored as
    numbers. (Going through a temporary CSV turned every value into text and left the
    temporary file behind whenever a later step failed.)

    Parameters
    ----------
    currGeneList : list of str
        Gene names to export. With "I to M" in the file name, every gene must be present
        in ``oneIstate_corr_k2`` and ``oneIstate_rSquared_k2``.
    fileStringName : str
        Name of the workbook file to write.

    Side effects
    ------------
    Changes the working directory to ``outputGenesVsParamsFolder`` (created if it does
    not exist yet) and saves the workbook there.

    Reads the module-level names ``msigdbGenes``, ``empCookGenes``, ``panglaoGenes``,
    ``oneIstate_corr_k2``, ``oneIstate_rSquared_k2`` and ``outputGenesVsParamsFolder``.
    """
    # Sets make every membership test O(1) instead of a scan through the marker lists
    msigdbSet = set(msigdbGenes)
    empCookSet = set(empCookGenes)
    panglaoSet = set(panglaoGenes)

    def _yes_no(isMember):
        return 'yes' if isMember else 'no'

    currGeneDataframe = pd.DataFrame(index=currGeneList)
    currGeneDataframe["EMT Hallmark from MSigDB"] = [_yes_no(currGene in msigdbSet) for currGene in currGeneList]
    currGeneDataframe["EMPlasticity Genes from Cook"] = [_yes_no(currGene in empCookSet) for currGene in currGeneList]
    # NOTE(review): the column says "Mesenchymal", but panglaoGenes is loaded from
    # "Markers - PanglaoDB, Epithelial.csv" -- confirm which one is intended.
    currGeneDataframe["Mesenchymal Genes from Panglao"] = [_yes_no(currGene in panglaoSet) for currGene in currGeneList]

    # if "E to I" in fileStringName:
    #     currGeneDataframe["R from k1"] = [round(oneIstate_corr_k1[currGene],4) for currGene in currGeneList]
    #     currGeneDataframe["R^2 from k1"] = [round(oneIstate_rSquared_k1[currGene],4) for currGene in currGeneList]
    if "I to M" in fileStringName:
        currGeneDataframe["R from k2"] = [round(oneIstate_corr_k2[currGene],4) for currGene in currGeneList]
        currGeneDataframe["R^2 from k2"] = [round(oneIstate_rSquared_k2[currGene],4) for currGene in currGeneList]
    # Sort on the last column (R^2 when present), largest first
    currGeneDataframe = currGeneDataframe.sort_values(currGeneDataframe.columns[-1], ascending=False)

    os.makedirs(outputGenesVsParamsFolder, exist_ok=True)
    os.chdir(outputGenesVsParamsFolder)

    # Fill the sheet: header row first (empty label above the gene column), then one row per gene
    wb = openpyxl.Workbook()
    ws = wb.active
    ws.append([""] + list(currGeneDataframe.columns))
    for currGene, *rowValues in currGeneDataframe.itertuples(index=True, name=None):
        # Hand openpyxl plain Python scalars rather than numpy scalar objects
        ws.append([currGene] + [v.item() if isinstance(v, np.generic) else v for v in rowValues])

    # Highlight E genes (red) and M genes (blue / purple)
    msigdbFill = openpyxl.styles.PatternFill("solid", fgColor="ABC9DE") # MSigDB = light blue
    panglaoFill = openpyxl.styles.PatternFill("solid", fgColor="D98686") # PanglaoDB = light red
    empCookFill = openpyxl.styles.PatternFill("solid", fgColor="DABEED") # EMP Cook = light purple
    # Gene names are in column A only; a gene found in several lists gets the first matching colour
    for (cell,) in ws.iter_rows(min_row=2, min_col=1, max_col=1):
        if cell.value in msigdbSet:
            cell.fill = msigdbFill
        elif cell.value in panglaoSet:
            cell.fill = panglaoFill
        elif cell.value in empCookSet:
            cell.fill = empCookFill

    wb.save(fileStringName)
    wb.close()

In [45]:
# Output highlighted lists of genes
# (the k1 exports -- oneIstate_corr_k1_positiveCorr / oneIstate_corr_k1_negativeCorr with
#  k1Pos_File / k1Neg_File -- are switched off)
kparamCorr_geneLists = [
    oneIstate_corr_k2_positiveCorr.index.to_list(),
    oneIstate_corr_k2_negativeCorr.index.to_list(),
]
kparamCorr_fileNames = [k2Pos_File, k2Neg_File]

# Each gene list is written to the workbook whose name sits at the same position
for currGeneList, currFileName in zip(kparamCorr_geneLists, kparamCorr_fileNames):
    export_gene_list(currGeneList, currFileName)