###### Epithelial State DE Genes, Overlap Between Samples

Output: Epithelial State Log2FC Analysis, Genes output

In [1]:
import os
import csv
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import openpyxl
import shutil
import subprocess
import seaborn as sns

In [2]:
currStateDir = os.getcwd()

In [None]:
currStateDir

In [4]:
# File & folder names

currState = "Epithelial"

dataFolder = currStateDir+"/Data - Log2FC from Conditions & Genes"
outputFolder = currStateDir+"/"+currState+" State Log2FC Analysis, Genes output"
outputHistogramFolder = outputFolder+"/_Log2FC Gene Histograms Across All Conditions, Upregulated"
outputGeneExpressionFolder = outputFolder+"/Output - Upregulated "+currState+" State Genes, Log Gene Expression"

# EMT Marker directory
emtMarkerDir = os.path.dirname(currStateDir)+"/Markers"

# Percent of cells with gene expression directory
nCellsFolder = os.path.dirname(currStateDir)+"/nCells"

allConditions_fileName = "_"+currState+" State Marker Genes, All Conditions.csv"

## Import DE Gene Names

In [5]:
# Import data

os.chdir(dataFolder)
DE_genes = pd.read_csv(open(allConditions_fileName,'r'))

In [None]:
# Each expt/condition with its list of DE genes

DE_genes.head(5)
# NaNs here are because there are not the same number of DE genes in each condition
# (using an adj-p-val cutoff, from DE genes calculation, cutting off at maximum # genes)

In [7]:
# Genes most frequently found in state

flattened_DE_genes = pd.Series(DE_genes.to_numpy().flatten())
DE_genes_union = set(flattened_DE_genes.dropna().unique())

DE_genes_frequency_counts = flattened_DE_genes.value_counts()
DE_genes_frequency_counts.head(10)

PERP       9
KRT17      8
KRT6A      8
KRT14      8
CSTB       8
JUP        7
KRT5       7
FTH1       7
TACSTD2    7
FTL        7
dtype: int64

In [8]:
# # For a certain gene, check which conditions it is in
# currGene = DE_genes_frequency_counts.index[0]
# DE_genes_log2FC.isin([currGene]).any(0)

## Import DE Gene Log2FCs

In [9]:
# Import data
# For every single gene, have the Log2FC of information from each expt/condition
# Using Log2FC(I state / all other states), within same experiment, to compare between experiments

os.chdir(dataFolder)
all_genes_log2FC = pd.DataFrame([]) # Create df

# Fill df with gene Log2FC values from each condition
# (These values are already output from processing datasets)
inputFiles = sorted(glob.glob(currState+" State Marker Genes - *.csv"))
for inputFile in inputFiles:
    currCondition = pd.read_csv(inputFile, header=None, index_col=0)
    currCondition.index = currCondition.index.rename("Gene") # Index must have same name aross all expt conditions
    currCondition = currCondition.rename(columns={1: inputFile[32:-4]}) # Column name is expt condition
    # Merge the new column with imported conditions
    all_genes_log2FC = all_genes_log2FC.join(currCondition, how='outer')

In [10]:
# Change order of dataframe to be same as dataset order
all_genes_log2FC = all_genes_log2FC[['1-Pastushenko_Pastushenko, E',
                                     '2-vanDijk_TGFb-day8_rep1, E',
                                     '2-vanDijk_TGFb-day8_rep2, E',
                                     '2-vanDijk_TGFb-day10_rep1, E',
                                     '2-vanDijk_TGFb-day10_rep2, E',
                                     '2-vanDijk_Zeb1_rep1, E',
                                     '2-vanDijk_Zeb1_rep2, E',
                                     '3-Cook_A549-TGFB1, E',
                                     '3-Cook_DU145-TGFB1, E',
                                     '3-Cook_OVCA420-EGF, E',
                                     '3-Cook_OVCA420-TGFB1, E',
                                     '3-Cook_OVCA420-TNF, E']]

In [11]:
DE_genes_log2FC = all_genes_log2FC.loc[list(DE_genes_union)]

## Import % of Cells with Gene Expression

In [12]:
# Import data
# For every single gene, have the %cells information from each expt/condition
os.chdir(nCellsFolder)
all_genes_nCells = pd.DataFrame([]) # Create df

# Fill df with %cells value from each condition
# (These values are already output from scanpy code)
inputFiles = sorted(glob.glob("*- percent of cells with gene expression.csv"))
for inputFile in inputFiles:
    currCondition = pd.read_csv(inputFile, header=None, index_col=0)
    currCondition.index = currCondition.index.rename("Gene") # Index must have same name aross all expt conditions
    currCondition = currCondition.rename(columns={1: inputFile[1:-44]}) # Column name is expt condition
    # Merge the new column with imported conditions
    all_genes_nCells = all_genes_nCells.join(currCondition, how='outer')
all_genes_nCells = all_genes_nCells[all_genes_nCells.index.notnull()]
all_genes_nCells = all_genes_nCells.apply(pd.to_numeric)

# Change order of dataframe to be same as dataset order
all_genes_nCells = all_genes_nCells[['1-Pastushenko_Pastushenko',
                                     '2-vanDijk_TGFb-day8_rep1',
                                     '2-vanDijk_TGFb-day8_rep2',
                                     '2-vanDijk_TGFb-day10_rep1',
                                     '2-vanDijk_TGFb-day10_rep2',
                                     '2-vanDijk_Zeb1_rep1',
                                     '2-vanDijk_Zeb1_rep2',
                                     '3-Cook_A549-TGFB1',
                                     '3-Cook_DU145-TGFB1',
                                     '3-Cook_OVCA420-EGF',
                                     '3-Cook_OVCA420-TGFB1',
                                     '3-Cook_OVCA420-TNF']]

# Same column names as DE_genes_log2FC
all_genes_nCells = all_genes_nCells.add_suffix(', E')

## Filter DE Gene Log2FCs

In [13]:
# Pre-filtering
# Filter genes to have at least certain # conditions that express a gene
min_num_conditions = 5
filtered_DE_genes_log2FC = DE_genes_log2FC.dropna(axis='rows', thresh=min_num_conditions)

# Filter condition 1
# Only keep genes that are *highly* upregulated in multiple epithelial conditions
n_upregulated_conditions_1 = 5
upregulation_cutoff_1 = 0.58 # log2FC of 1.5
gene_filtering_criteria_1 = filtered_DE_genes_log2FC.apply(lambda s, n: s.nlargest(n)[-1] > upregulation_cutoff_1, axis=1, n=n_upregulated_conditions_1).copy()
# code inspo: https://stackoverflow.com/questions/34518634/finding-highest-values-in-each-row-in-a-data-frame-for-python
filtered_DE_genes_log2FC = filtered_DE_genes_log2FC.loc[gene_filtering_criteria_1].copy()



# # Why have multiple filter conditions?
# # There are 4 similar TGFb conditions in van Dijk - they are often up-regulated / down-regulated together

# Filter condition 2
# At least two highly upregulated conditions must not be in vanDijk-TGFb

# Only Cook & van Dijk Zeb1 & Pastushenko samples
without_vanDijkTGFb_cols = [currColumn for currColumn in filtered_DE_genes_log2FC.columns if "vanDijk" not in currColumn]
without_vanDijkTGFb = filtered_DE_genes_log2FC[without_vanDijkTGFb_cols]

# Filter
n_upregulated_conditions_2 = 2
upregulation_cutoff_2 = 0.58
gene_filtering_criteria_2 = without_vanDijkTGFb.apply(lambda s, n: s.nlargest(n)[-1] > upregulation_cutoff_2, axis=1, n=n_upregulated_conditions_2).copy()
without_vanDijkTGFb = without_vanDijkTGFb.loc[gene_filtering_criteria_2].copy()
filtered_DE_genes_log2FC = filtered_DE_genes_log2FC.loc[without_vanDijkTGFb.index.to_list()]



# Filter condition 3
# Only keep genes that have above-mentioned upregulation + expression in more than 5% of cells in those datasets

# Perc cell expression for current list of filtered DE genes
filtered_DE_genes_nCells = all_genes_nCells.filter(filtered_DE_genes_log2FC.index, axis=0)
# Two conditions: log2FC of 0.58 and percent cell gene expression of 5%
condition_log2FC = filtered_DE_genes_log2FC.applymap(lambda x: 1 if x > 0.58 else 0)
condition_nCells = filtered_DE_genes_nCells.applymap(lambda x: 1 if x > 0.05 else 0)
gene_filtering_criteria_3_df = condition_log2FC + condition_nCells
gene_filtering_criteria_3 = gene_filtering_criteria_3_df.apply(lambda s, n: s.nlargest(n)[-1] == 2.0, axis=1, n=n_upregulated_conditions_1).copy()
# code inspo: https://stackoverflow.com/questions/34518634/finding-highest-values-in-each-row-in-a-data-frame-for-python
filtered_DE_genes_log2FC = filtered_DE_genes_log2FC.loc[gene_filtering_criteria_3].copy()



# Filter condition 4
# Only keep genes that are not highly downregulated in multiple conditions
n_downregulated_conditions = 3
downregulation_cutoff = -0.58 # log2FC of 1.5
gene_filtering_criteria_4 = filtered_DE_genes_log2FC.apply(lambda s, n: s.nsmallest(n)[-1] > downregulation_cutoff, axis=1, n=n_downregulated_conditions).copy()
# code inspo: https://stackoverflow.com/questions/34518634/finding-highest-values-in-each-row-in-a-data-frame-for-python
filtered_DE_genes_log2FC = filtered_DE_genes_log2FC.loc[gene_filtering_criteria_4].copy()

In [14]:
# Overall: 5 total samples upregulated, and at least 2 must be non van Dijk TGFb
len(filtered_DE_genes_log2FC.index)

109

In [None]:
filtered_DE_genes_log2FC.index

Index(['BAG1', 'NQO2', 'SQSTM1', 'DSC2', 'ADIRF', 'VAMP8', 'MAPK13', 'AGPAT2',
       'VPS37B', 'DDIT4',
       ...
       'ADI1', 'KIF13A', 'TMEM238', 'IGFBP3', 'SERPINB1', 'CLDND1', 'TMEM132A',
       'CLTB', 'FTL', 'THEM6'],
      dtype='object', name='Gene', length=109)