In [None]:
import pandas as pd
import numpy as np
import os
import sys

%load_ext autoreload
%autoreload 2

module_path = os.path.abspath(os.path.join('../src'))
sys.path.insert(0, module_path)

from utils import aggregate_df, plotPCA

import mygene
from combat.pycombat import pycombat

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import pickle
import xlsxwriter

from placentafiles import *
ourdataoriginal

In [None]:
# Load the tab-separated expression table published with Liu et al. (see the note below).
# `studydfpath` is not defined in this notebook — presumably it comes from the
# `from placentafiles import *` star-import; TODO confirm.
studydf = pd.read_csv(studydfpath, sep='\t')
'''
studydf is supplementary data from: 
Liu, Y., Fan, X., Wang, R. et al. Single-cell RNA-seq reveals the diversity of trophoblast subtypes and patterns of differentiation in the human placenta. Cell Res 28, 819–832 (2018). https://doi.org/10.1038/s41422-018-0066-y

They write:
In the present study, we isolated human villous stromal cells (STRs), CTBs, the STB, and EVTs 
during the first and second trimesters of pregnancy and monitored the transcriptome dynamics of 1567 cells at 
single-cell resolution. [...] Four populations of cells from the villi at 8 weeks of pregnancy and one 
population from the decidua at 24 weeks of pregnancy were harvested. We named these five populations of 
cells as EVT_8W (HLA-G+, mononucleated, smaller size), STB_8W (HLA-Glow, multinucleated, larger size), 
CTB_8W (CDH1+ and HLA-G-), STR_8W (HLA-G- and CDH1−), and EVT_24W (HLA-G+)
'''
# Map a single-cell sample name onto the cell population it belongs to in Liu et al. (GSE89497).
# ex: HE24W_EVT_sc1 -> HE24W_EVT, so that all cells fall into the 5 populations described in the study.
def parse_placenta_cells(name):
    """Return the population label for one sample name.

    The label is the first two "_"-separated fields of ``name`` joined by "_",
    with any of the characters 1, 2, 3, 4 removed from both ends of the result.

    Raises:
        IndexError: if ``name`` contains no "_" separator.
    """
    fields = name.split("_")
    label = "_".join((fields[0], fields[1]))
    return label.strip("1234")
    

# Build a lookup from group label to the samples carrying that label.
# keys: whatever parse_cellLine returns (the 5 original cell lines for the placenta data)
# values: list of all the sample names mapped to that key, in input order
def group_types(lst, parse_cellLine):
    """Group the items of ``lst`` by the label ``parse_cellLine`` assigns to each.

    Args:
        lst: iterable of sample names.
        parse_cellLine: callable taking one sample name and returning its group label.

    Returns:
        dict mapping each label (in order of first appearance) to the list of
        sample names that produced it.
    """
    grouped = {}
    for sample in lst:
        grouped.setdefault(parse_cellLine(sample), []).append(sample)
    return grouped

# the initial file from Liu et al.'s paper above (displayed as the cell output)
studydf

In [None]:
# PCA of the raw placenta samples, coloured by the population each sample belongs to.

# Keep only expression columns: annotation columns are dropped when present.
# DataFrame.drop already returns a new frame, so studydf itself is left untouched.
expression_data = studydf.drop(columns=["gene_id", "gene_name", "Geneid"], errors='ignore')
expression_data = expression_data[~(expression_data == 0).all(axis=1)] # drop rows with all zeros
expression_data = expression_data[~(pd.isna(expression_data)).any(axis=1)] # drop rows with any NaN
expression_data_T = expression_data.T  # transpose so each row is one sample (one original column)


# standardise every feature before PCA
scaler = StandardScaler()
expression_scaled = scaler.fit_transform(expression_data_T)

pca = PCA(n_components=2)
pca_result = pca.fit_transform(expression_scaled)


pca_df = pd.DataFrame(pca_result, columns=['PC1', 'PC2'])
pca_df["Sample"] = expression_data_T.index
# Label each sample with the notebook's own helper so the groups read
# "HE24W_EVT", "HE8W_STB", ... exactly like the keys produced by group_types
# (the previous inline split dropped the "_" separator and yielded "HE24WEVT").
# specific to this placenta study
pca_df["Group"] = pca_df["Sample"].apply(parse_placenta_cells)

plt.figure(figsize=(10, 6))
sns.scatterplot(x='PC1', y='PC2',
                hue="Group",
                data=pca_df,
                palette="Set2",
                )

plt.title("placenta samples")
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)")
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# show the original 5 cell lines as described in the Liu et al. paper
col_list = list(studydf)
placentatypes = group_types(col_list, parse_placenta_cells)

# Write the grouping to disk only when no cache file exists yet.
# `with` guarantees the handle is closed even if pickling raises.
if not os.path.exists(placentatypespath):
    with open(placentatypespath, 'wb') as filehandler:
        pickle.dump(placentatypes, filehandler)

# NOTE: the grouping is always re-read from the cache file, so an existing pickle
# takes precedence over the value computed above. Delete the file to regenerate it.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open(placentatypespath, 'rb') as file:
    placentatypes = pickle.load(file)

print("There are "+ str(len(placentatypes)) + " cell types in this dataset")
placentatypes.keys()

In [None]:
class HGNCConvert:
    """Look up gene symbols for gene IDs through a mygene client, remembering misses.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # client used for every lookup
        self.mg = mygene.MyGeneInfo()
        # raw lookup results (not the queried IDs) of every failed conversion
        self.unmatched = []

    def geneconv(self, g):
        """Return the symbol for gene ID ``g``, or the integer 0 when none is found.

        Everything after the first "." in ``g`` is discarded before the lookup.
        A lookup that comes back as a list is printed (ID, then the list) and
        treated as a miss. For every miss the raw lookup result (the list, a
        record without a 'symbol' key, or None) is appended to ``self.unmatched``.

        Note that the miss value is 0, not None/NaN, so callers filtering with
        ``notna()`` will not remove misses.
        """
        base_id = g.split('.')[0]
        hit = self.mg.getgene(base_id, fields = 'symbol')

        if isinstance(hit, list):
            # ambiguous answer: show it to the user, then fall through to the miss path
            print(g)
            print(hit)
        elif hit is not None and 'symbol' in hit.keys():
            return hit['symbol']

        self.unmatched.append(hit)
        return 0

    def reset(self):
        """Forget all recorded misses by binding ``unmatched`` to a fresh list."""
        self.unmatched = list()


In [None]:
# use mygene to transform the ensembl IDs from our data to HGNC
# merge with the studydf
# log transform all the data
# placentadf is produced
ensembl_to_HGNC = HGNCConvert()


# sample columns of the placenta study, concatenated population by population
placentacolumns = []
for t in ['HE24W_EVT', 'HE8W_EVT', 'HE8W_STB', 'HE8W_STR', 'HE8W_CTB']: # HE8W_CTB will be excluded later on
    placentacolumns.extend(placentatypes[t])

# The whole build is skipped when a cached placentadf pickle already exists.
# NOTE: inside the branch `ourdataoriginal` and `studydf` are re-bound, so this
# part is not idempotent within one kernel session.
if not os.path.exists(placentadfpath):
    # this includes the whole process of generating the placentadf...
    print('generating placentadf')

    # one geneconv call (i.e. one mygene lookup) per unique Geneid
    # takes 130 m to run
    ensembl_to_symbol = {ens_id: ensembl_to_HGNC.geneconv(ens_id) for ens_id in ourdataoriginal['Geneid'].unique()}
    ourdataoriginal['Gene Symbol'] = ourdataoriginal['Geneid'].map(ensembl_to_symbol)
    # geneconv reports "no symbol found" with the integer 0 rather than NaN, so
    # notna() alone keeps every unmatched gene (and groupby would lump them all
    # into a single bogus "0" gene). Drop both NaN and the 0 sentinel.
    has_symbol = ourdataoriginal['Gene Symbol'].notna() & (ourdataoriginal['Gene Symbol'] != 0)
    ourdataoriginal = ourdataoriginal[has_symbol]
    studydf = studydf.reset_index().rename(columns={'index':'Gene Symbol'})

    # per-symbol maxima of both tables and their inner join
    # (PABCdf is not used again in this cell)
    ourdata_grouped = ourdataoriginal.copy().groupby('Gene Symbol').max()
    studydf_grouped = studydf.copy().groupby('Gene Symbol').max()

    PABCdf = pd.merge(ourdata_grouped, studydf_grouped, how='inner', on='Gene Symbol').reset_index()

    print('merging studydf and ourdataoriginal on Gene Symbol')
    placentadf = pd.merge(ourdataoriginal, studydf, on='Gene Symbol', how='inner')

    print('log transforming all data')
    placentadf[placentacolumns+ourdataTPMcolumns] = placentadf[placentacolumns+ourdataTPMcolumns].apply(lambda x: np.log2(x + 1))

    # `with` closes the cache file even if pickling fails
    with open(placentadfpath, "wb") as filehandler:
        pickle.dump(placentadf, filehandler)

# Always read placentadf back from the cache file.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open(placentadfpath, 'rb') as file:
    placentadf = pickle.load(file)


In [None]:
# adapted from the pyCombat documentation:
# batch-correct the merged expression table (batch 0 = placenta study, batch 1 = our data)
df_expression = placentadf.copy()

# Set index
df_expression = df_expression.set_index(['Gene Symbol', 'Geneid'])
# Follow same correction steps...
# Save original index for later
original_index = df_expression.index

dataset1 = placentadf[placentacolumns]
dataset2 = placentadf[ourdataTPMcolumns]

print(len(df_expression.columns), len(dataset1.columns), len(dataset2.columns))
print(len(df_expression.columns) == len(dataset1.columns) + len(dataset2.columns))

# we generate the list of batches
# pycombat pairs batch[i] with the i-th column of the matrix it receives, so the
# labels must follow df_expression's own column order. That order is inherited
# from placentadf and is not necessarily "all dataset1 columns, then all dataset2
# columns", so look up each column's dataset instead of assuming a layout.
datasets = [dataset1, dataset2]
batch_of_column = {}
for j, dataset in enumerate(datasets):
    for column in dataset.columns:
        batch_of_column[column] = j

unassigned = [column for column in df_expression.columns if column not in batch_of_column]
if unassigned:
    raise ValueError(
        "columns of placentadf that belong to neither placentacolumns nor "
        f"ourdataTPMcolumns cannot be given a batch: {unassigned[:10]}"
    )
batch = [batch_of_column[column] for column in df_expression.columns]

# Drop 0-variance
zero_var_mask = df_expression.var(axis=1) == 0
df_zero_var = df_expression[zero_var_mask]
df_variable = df_expression[~zero_var_mask]

# run pyComBat
df_corrected_var = pycombat(df_variable,batch)


# put the untouched zero-variance rows back and restore the original row order
df_corrected = pd.concat([df_corrected_var, df_zero_var], axis=0)
df_corrected = df_corrected.loc[original_index]

# Add gene names back as a column
df_corrected = df_corrected.reset_index()


In [None]:
# Plot the batch-corrected values for our sample columns together with the placenta
# sample columns. plotPCA is imported from utils; the second argument is presumably
# the figure title — TODO confirm against utils.plotPCA.
plotPCA(df_corrected[ourdataTPMcolumns+placentacolumns], "placenta transcriptome profiles VS our cell condition profiles")

In [None]:
# genes of interest that the table built in this cell is restricted to
placentagenes = [
    "ACVR1", "ADAM8", "ASCL2", "BCAM", "CDH1", "COL9A2", "CSH1", "DAG1", "ENG", "FLNA",
    "FSCN1", "FSTL3", "GATA2", "HGS", "HSPG2", "ITGB4", "JUP", "MMP2", "PAK4", "PIGF", "PKP3", "PLEC",
    "RRM2", "SLC19A1", "SLC1A5", "SLC2A8", "SLC38A10", "SLC43A1", "SLC43A2", "SLC46A1", "SLC7A5",
    "SNAI1", "VASP", "VAV2", "ZYX",
]

# group name -> sample columns: the five placenta populations first,
# followed by our two cell-condition groups
samplesdict = {
    population: placentatypes[population]
    for population in ("HE24W_EVT", "HE8W_EVT", "HE8W_STB", "HE8W_STR", "HE8W_CTB")
}
samplesdict["D-Bewo-M"] = ["D-Bewo-M1", "D-Bewo-M2", "D-Bewo-M3"]
samplesdict["D-Bewo-S"] = ["D-Bewo-S1", "D-Bewo-S2", "D-Bewo-S3"]

# label -> aggregation handed to aggregate_df (presumably applied per group; see utils.aggregate_df)
functionsdict = dict(mean='mean', STD=np.std)

# columns passed through aggregate_df under their own names
columnstokeepdict = {
    kept: kept
    for kept in (
        "Gene Symbol",
        'p-value for HE8W_EVT vs D-Bewo-CT-M',
        'p-value for HE8W_EVT vs D-Bewo-CT-S',
    )
}

def significance(row, populationcol, popmeancol):
    """Return the p-value of a one-sample t-test for one row.

    The values of ``row`` under ``populationcol`` form the sample; they are
    tested against the mean of the values under ``popmeancol``. Both sets of
    values are cast to float first.

    Args:
        row: a pandas Series indexed by column name.
        populationcol: list of labels whose values form the sample.
        popmeancol: list of labels whose mean is the hypothesised population mean.
    """
    sample = row[populationcol].astype(float).to_numpy()
    reference_mean = row[popmeancol].astype(float).mean()
    result = stats.ttest_1samp(a=sample, popmean=reference_mean)
    return result.pvalue

# Each comparison tests the HE8W_EVT samples ("patients") against one cell condition.
# NOTE(review): these use the D-Bewo-CT-* columns while samplesdict aggregates the
# D-Bewo-* columns — confirm that the difference is intended.
Mphasecomparison = {"patients": placentatypes['HE8W_EVT'], "cellcondition":["D-Bewo-CT-M1", "D-Bewo-CT-M2", "D-Bewo-CT-M3"]}
Sphasecomparison = {"patients": placentatypes['HE8W_EVT'], "cellcondition":["D-Bewo-CT-S1", "D-Bewo-CT-S2", "D-Bewo-CT-S3"]}


# one row per gene of interest: column-wise maximum over rows sharing a Gene Symbol
is_gene_of_interest = df_corrected['Gene Symbol'].isin(placentagenes)
placentadflook = df_corrected[is_gene_of_interest].copy().groupby('Gene Symbol').max()

# add one p-value column per comparison (M first, then S)
for pvalue_column, comparison in (
    ('p-value for HE8W_EVT vs D-Bewo-CT-M', Mphasecomparison),
    ('p-value for HE8W_EVT vs D-Bewo-CT-S', Sphasecomparison),
):
    placentadflook[pvalue_column] = placentadflook.apply(
        significance,
        axis=1,
        args=(comparison["patients"], comparison["cellcondition"]),
    )

placentadflook = placentadflook.reset_index()
aggregate_df(placentadflook, samplesdict, functionsdict, columnstokeepdict).set_index('Gene Symbol')

In [None]:
# Export the aggregated table to an Excel workbook, one sheet per gene.
data = aggregate_df(placentadflook, samplesdict, functionsdict, {'Gene Symbol':'Gene Symbol'}).set_index('Gene Symbol')

# TODO: set this to the .xlsx file the results should be written to.
curresultspath = ""
if not curresultspath:
    # an empty path otherwise only surfaces as an opaque I/O error from the writer
    raise ValueError("curresultspath is empty: set it to the output .xlsx path before running this cell")

# the context manager saves and closes the workbook even if writing a sheet fails
with pd.ExcelWriter(curresultspath, engine='xlsxwriter') as writer:
    for gene in data.index:
        df = data[data.index == gene]
        df.to_excel(writer, sheet_name=gene)
