# MAD1 PBMC-scRNAseq data

@mmm: November 26, 2021 - January 9, 2022 - February 6, 2022  

### Data with combined inferCNV analysis on L3 cell classifications

In [None]:
import os
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib.pyplot import rc_context
from matplotlib import rcParams

import seaborn as sns

import scanpy as sc
import gseapy as gspy

In [None]:
# Render inline figures at retina resolution and set scanpy's default figure size.
# NOTE(review): IPython.display.set_matplotlib_formats is deprecated (IPython >= 7.23);
# matplotlib_inline.backend_inline.set_matplotlib_formats is the documented replacement.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')
sc.settings.set_figure_params(figsize=(4, 4))
# Print the versions of scanpy and its main dependencies for provenance.
sc.logging.print_header()

In [None]:
# Directories
# Defaults are the author's local paths; set MAD1_DATA_DIR / MAD1_OUTPUT_DIR to run elsewhere.
# os.path.join(path, "") guarantees the trailing separator that `DATA + "<file>"` relies on.
DATA = os.path.join(
    os.environ.get("MAD1_DATA_DIR", "/Users/mmm/BioPROJECTS/MAD1 & MVA/scRNAseq PBMCs/data/"), ""
)
DESKTOP = os.path.join(os.environ.get("MAD1_OUTPUT_DIR", "/Users/mmm/Desktop/"), "")
# Figures saved by scanpy go to the same output folder.
sc.settings.figdir = DESKTOP

## Load data and metadata

In [None]:
adata = sc.read_h5ad(DATA + "220207_MAD1_scRNAseq.h5ad")

In [None]:
# Per-cell metadata table, indexed by its "Cell" column.
cmd = pd.read_csv(DATA + "220304_cell_metadata.tsv", sep="\t", low_memory=False)
cmd = cmd.set_index("Cell")

# Assigning a DataFrame to `adata.obs` does not match rows by cell name, so the table is
# reordered to adata's cell order first; fail loudly if any cell has no metadata row.
missing_cells = adata.obs_names.difference(cmd.index)
if len(missing_cells) > 0:
    raise KeyError(
        f"{len(missing_cells)} cells in adata are missing from the metadata table, "
        f"e.g. {list(missing_cells[:3])}"
    )
adata.obs = cmd.loc[adata.obs_names]
adata

### Lists and Subsets

In [None]:
# Lists
sample_list = ["Control1", "Control2", "Father", "Mother", "Proband"]
L1 = ["CD4 T", "CD8 T", "NK", "other T", "B", "Mono", "DC", "other"]
L2 = ["CD4 TEM", "CD4 TCM", "CD8 Naive", "CD8 TEM", "CD8 TCM", "NK", "NK Proliferating", 
      "NK_CD56bright", "dnT", "gdT", "Treg", "MAIT", "B intermediate", "B memory", "B naive", 
      "CD14 Mono", "CD16 Mono", "cDC2", "pDC", "HSPC", "ILC", "Platelet"]

# Samples
ad_proband = adata[adata.obs["Sample"]=="Proband"]
ad_controls = adata[adata.obs["Sample2"]=="Control"]

# Cell types
ad_B_cells = adata[adata.obs["predicted.celltype.l1"]=="B"]
ad_otherT = adata[adata.obs["predicted.celltype.l1"]=="other T"]

# Aneuploid
aneu = adata[adata.obs["Aneu.l3"]== "Aneuploid"]
aneu_proband = aneu[aneu.obs["Sample"]=="Proband"]

# Remove rare populations

# Menu  <a class="anchor" id="menu"></a>
### - [1. Statistics](#statistics)
### - [2. Gene and score plots](#plots)
### - [3. Analysis of Aneuploidy](#aneuploidy)
### - [4. Differential expression](#diffexp)
### - [5. Pathway Plots](#pathways)
### - [6. Tumor signatures](#tumor)

## 1. Some statistics <a class="anchor" id="statistics"></a>
[Back to Menu](#menu)

### Cells per Sample

In [None]:
# Number of cells per sample as a two-column table: "index" (sample name) and "Values" (count).
# Both column names are set explicitly so the table is the same on every pandas version
# (pandas >= 2.0 names the counts "count" and the index "Sample", which broke the old rename).
samples = (
    adata.obs["Sample"]
    .value_counts()
    .rename_axis("index")
    .reset_index(name="Values")
)
samples

In [None]:
sns.catplot(x="index", y="Values", data=samples, kind="bar", order=sample_list,
           height=3, aspect=1.5)

In [None]:
sc.pl.umap(adata, color="Sample", s=5)

In [None]:
sc.pl.umap(adata, color="Sample", s=5, groups="Proband")

In [None]:
sc.pl.umap(adata, color=["predicted.celltype.l1", "predicted.celltype.l2"],
          ncols=1)

In [None]:
sc.pl.umap(adata, color=["predicted.celltype.l3"])

## 2. Cell type markers - Genes, scores and plots <a class="anchor" id="plots"></a>
[Back to Menu](#menu)

In [None]:
# T-cells
sc.pl.umap(adata, color=["CD3D", "CD8A", "NKG7", "TRDC", ])

In [None]:
# B-cells
sc.pl.umap(adata, color=["CD79A", "MS4A1", "BANK1", "IGKC", ])

In [None]:
# Myeloid
sc.pl.umap(adata, color=["CST3", "LYZ", "CD14", "FCER1A", ]) 
#FCGR3A is CD16

### Abundance of cell types

In [None]:
# L1: fraction of each L1 cell type within every sample.
# The Series is renamed to "Values" before reset_index so the column name does not depend on
# the pandas version (older versions name it after the column, pandas >= 2.0 "proportion").
l1_per_sample = (
    adata.obs.groupby("Sample")["predicted.celltype.l1"]
    .value_counts(normalize=True)
    .rename("Values")
    .reset_index()
)
l1_per_sample.head(3)

In [None]:
sns.catplot(x="predicted.celltype.l1", y="Values", data=l1_per_sample, hue="Sample", kind="bar",
           height=4, aspect=1.5)

In [None]:
# L2: fraction of each L2 cell type within every sample.
# Renaming the Series to "Values" before reset_index keeps the column name independent of
# the pandas version (older versions name it after the column, pandas >= 2.0 "proportion").
l2_per_sample = (
    adata.obs.groupby("Sample")["predicted.celltype.l2"]
    .value_counts(normalize=True)
    .rename("Values")
    .reset_index()
)

sns.catplot(y="predicted.celltype.l2", x="Values", data=l2_per_sample, hue="Sample", kind="bar",
            height=8, aspect=0.7)

In [None]:
# Only in other T: raw (non-normalised) number of cells of each L2 type per sample.
# Uses `ad_otherT`, the "other T" subset defined with the other subsets (the former name
# `otherT` does not exist in this notebook). The Series is renamed to "Values" before
# reset_index so the column name does not depend on the pandas version.
l2_per_sample_otherT = (
    ad_otherT.obs.groupby("Sample")["predicted.celltype.l2"]
    .value_counts()   # pass normalize=True for fractions instead of counts
    .rename("Values")
    .reset_index()
)

sns.catplot(y="predicted.celltype.l2", x="Values", data=l2_per_sample_otherT, hue="Sample",
            kind="bar", height=4, aspect=1
            )

In [None]:
# L3: fraction of each L3 cell type within every sample.
# Renaming the Series to "Values" before reset_index keeps the column name independent of
# the pandas version (older versions name it after the column, pandas >= 2.0 "proportion").
l3_per_sample = (
    adata.obs.groupby("Sample")["predicted.celltype.l3"]
    .value_counts(normalize=True)
    .rename("Values")
    .reset_index()
)

sns.catplot(y="predicted.celltype.l3", x="Values", data=l3_per_sample, hue="Sample", kind="bar",
            height=14, aspect=0.4)

#### gdT and intermediate kappa B cells
- https://azimuth.hubmapconsortium.org/references/#Human%20-%20PBMC

In [None]:
gdT_list = ["TRDC", "TRGC1", "TRGC2", "KLRC1", "NKG7", "TRDV2", "CD7", "TRGV9", "KLRD1", "KLRG1"]
gdT_list2 = ["TRDC", "TRGC1", "TRGC2", "NKG7", "TRDV2", "KLRD1", "KLRG1"]
gdT_2_list = ["KLRC2", "CD3G", "KIR3DL2", "CD3D", "TRDC", "TRDV1", "ZNF683", "KLRC1", 
              "TRGC1", "GZMH"]
gdT_4_list = ["TRDC", "TIGIT", "KLRC2", "TRGC2", "IKZF2", "GCSAM", "FCRL6", "TRDV1", "CST7", "CMC1"]
intkappa_B_list = ["MS4A1", "IGKC", "IGHM", "LINC01857", "MARCKS", "IGHD", "TNFRSF13B", "CD24",
                   "FCRL2", "BANK1"]
intkappa_B_list2 = ["IGKC", "IGHM", "LINC01857", "MARCKS", ]

In [None]:
sc.pl.dotplot(ad_otherT, gdT_list, 'Sample', standard_scale="var")  
sc.pl.dotplot(ad_otherT, gdT_list2, 'Sample', standard_scale="var") 

In [None]:
sc.pl.dotplot(ad_otherT, gdT_2_list, 'Sample', standard_scale="var")
sc.pl.dotplot(ad_otherT, gdT_4_list, 'Sample', standard_scale="var")  

In [None]:
gdT_2_4_list = {"Delta-gamma chains": ["TRGC1", "TRGC2", "TRDV1", "TRDV2", "TRDC"], 
                "CD3 chains": ["CD3G", "CD3D",],
                "Cytotoxicity": ["KLRC2", "KLRD1", "KLRG1", "GZMH", "FCRL6", "CST7",], 
                "Other gdT markers": ["TIGIT", "IKZF2", "CMC1"],
               }

ss = {"TEST": ["CD27", "CD28"]}

sc.pl.dotplot(ad_otherT, gdT_2_4_list, 'Sample', standard_scale="var") 
sc.pl.dotplot(ad_otherT, ss, 'Sample', ) 
sc.pl.heatmap(ad_otherT, gdT_2_4_list, groupby="Sample", cmap="Reds")

In [None]:
sc.pl.heatmap(ad_otherT, gdT_2_4_list, groupby="CleanAneu.l3", cmap="coolwarm")

In [None]:
sc.pl.dotplot(adata, ["PDCD1", "CD274", "CTLA4"], groupby="Sample", standard_scale="var")  

In [None]:
CD4cells = adata[adata.obs["predicted.celltype.l1"]=="CD4 T"]
sc.pl.dotplot(CD4cells, ["PDCD1", "CD274", "CTLA4"], groupby="Sample", standard_scale="var")  


In [None]:
# Intermediate-kappa B-cell markers per sample, B cells only (`ad_B_cells` is the L1 "B" subset).
sc.pl.dotplot(ad_B_cells, intkappa_B_list, 'Sample', standard_scale="var")

In [None]:
# Reduced intermediate-kappa marker list per sample, B cells only (`ad_B_cells` is the L1 "B" subset).
sc.pl.dotplot(ad_B_cells, intkappa_B_list2, 'Sample', standard_scale="var")

In [None]:
sc.pl.umap(adata, color=["IGKC", "Aneu.l3", "Sample", ], vmax=4.8, s=5)

In [None]:
sc.pl.umap(adata, color=["ESAM", "Aneu.l3", "Sample", ], vmax=3, s=5)

In [None]:
sc.pl.dotplot(adata, "ESAM", groupby="predicted.celltype.l3", standard_scale="var", swap_axes=True)

In [None]:
noPlat = adata[adata.obs["predicted.celltype.l3"]!="Platelet"]
sc.pl.dotplot(noPlat, "ESAM", groupby="predicted.celltype.l3", swap_axes=True)

In [None]:
sc.pl.dotplot(adata, "ESAM", groupby="Sample", standard_scale="var")

In [None]:
### Cytokines
# not found: "CCL1", "CXCL11", "CXCL12", 
cytokines = [ "CCL5", "ICAM1", "IL1RN", "IL13", "MIF", "SERPINE1"]
sc.pl.dotplot(adata, cytokines, groupby="Sample2", standard_scale="var",)

In [None]:
# MHC genes in the proband (HLA-DRB5 and HLA-DQA2 are class II; HLA-B is class I)
sc.pl.umap(ad_proband, color=["HLA-DRB5", "HLA-DQA2", "HLA-B"])

In [None]:
# L1 cell-type annotation on the full dataset
sc.pl.umap(adata, color=["predicted.celltype.l1"])

## 3. Analysis of aneuploidy <a class="anchor" id="aneuploidy"></a>
[Back to Menu](#menu)

In [None]:
sc.pl.umap(adata, color=["Sample2_Aneu.l3"], groups=["Control_Aneuploid"], palette="Reds",
          #wspace=0.4,
          )

In [None]:
sc.pl.umap(adata, color=["Sample2_Aneu.l3"], groups=["Mother_Aneuploid"], palette="Blues",
          #wspace=0.4,
          )

In [None]:
sc.pl.umap(adata, color=["Aneu.l3"], groups=["Aneuploid"],
          #wspace=0.4,
          )

In [None]:
sc.pl.umap(ad_proband, color=["CleanAneu.l3"], groups=["CleanAneu"], size=5)

In [None]:
ad_proband.obs["CleanAneu.l3"].value_counts()

In [None]:
sc.pl.umap(adata, color=["Aneu.l1", "Aneu.l2", "Aneu.l3"],
          wspace=0.4)

In [None]:
sc.pl.umap(adata, color=["Aneu.l1", "Aneu.l2", "Aneu.l3"], groups=["Aneuploid"],
          wspace=0.4)

### All cells

In [None]:
g = sns.catplot(x="Sample", data=adata.obs, 
                hue="Aneu.l1", hue_order=["Euploid", "Aneuploid"], kind="count",
                height=4, aspect=1.2)

In [None]:
g = sns.catplot(x="Sample", data=adata.obs, 
                hue="Aneu.l2", hue_order=["Euploid", "Aneuploid"], kind="count",
                height=4, aspect=1.2)

In [None]:
x, y = 'Sample', 'Aneu.l1'

# Percentage of each Aneu.l1 class within every sample, drawn as grouped bars.
(
    adata.obs.groupby(x)[y]
    .value_counts(normalize=True)
    .mul(100)
    .rename('percent')
    .reset_index()
    .pipe(
        lambda pct: sns.catplot(
            data=pct, x=x, y='percent', hue=y, kind='bar',
            hue_order=["Euploid", "Aneuploid"], height=4, aspect=1.2,
        )
    )
)

In [None]:
adata.obs.groupby(x)[y].value_counts(normalize=True)

In [None]:
x,y = 'Sample', 'Aneu.l2'

(adata.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index()
                          .pipe((sns.catplot,'data'), x=x,y='percent',hue=y, kind='bar',
                                hue_order=["Euploid", "Aneuploid"], height=4, aspect=1.2))

In [None]:
adata.obs.groupby(x)[y].value_counts(normalize=True)

In [None]:
x,y = 'Sample', 'Aneu.l3'

(adata.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index()
                          .pipe((sns.catplot,'data'), x=x,y='percent',hue=y, kind='bar',
                                hue_order=["Aneuploid"], height=4, aspect=0.7))
plt.ylim(0,100)
plt.xticks(rotation=90)
plt.show()

In [None]:
adata.obs.groupby(x)[y].value_counts(normalize=True)

In [None]:
x,y = 'predicted.celltype.l1', 'Aneu.l3'

(adata.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index()
                          .pipe((sns.catplot,'data'), x=x,y='percent',hue=y, kind='bar',
                                hue_order=["Euploid", "Aneuploid"], height=4, aspect=1.5))

In [None]:
# Only in the proband
# Uses the AnnData subset `ad_proband`; the name `proband` is only assigned further down,
# and there it is a plain metadata DataFrame without `.obs`.
x,y = 'predicted.celltype.l1', 'Aneu.l3'

(ad_proband.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index()
                          .pipe((sns.catplot,'data'), x=x,y='percent',hue=y, kind='bar',
                                hue_order=["Euploid", "Aneuploid"], height=4, aspect=1.5))

In [None]:
x,y = 'predicted.celltype.l2', 'Aneu.l3'

(adata.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index()
                          .pipe((sns.catplot,'data'), x=x,y='percent',hue=y, kind='bar',
                               height=3, aspect=8))


In [None]:
percent = (adata.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent, hue=y, kind="bar", height=2.5, aspect=4,
               hue_order=["Euploid", "Aneuploid"],)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Only in the proband
# x, y are set here so the cell does not rely on values left over from an earlier cell, and
# the AnnData subset `ad_proband` is used (`proband` is only assigned further down, as a DataFrame).
x,y = 'predicted.celltype.l2', 'Aneu.l3'

percent = (ad_proband.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent, hue=y, kind="bar", height=2.5, aspect=4,
               hue_order=["Euploid", "Aneuploid"],)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Only in the proband
# Uses the AnnData subset `ad_proband`; `proband` is only assigned further down, as a DataFrame.

x,y = 'predicted.celltype.l3', 'Aneu.l3'

percent = (ad_proband.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent, hue=y, kind="bar", height=2.5, aspect=6,
               hue_order=["Euploid", "Aneuploid"],)
plt.xticks(rotation=90)
plt.show()

### Compare Controls vs Proband

In [None]:
# Remove ASDC (no ASDC cells in the proband)
df = pd.read_csv(DATA + "211126_cell_metadata_noASDC.tsv", sep="\t")
print(len(adata.obs["predicted.celltype.l2"].unique()))
print(len(df["predicted.celltype.l2"].unique()))

In [None]:
controls = df[(df["Sample"]=="Control1") | (df["Sample"]=="Control2")]
proband = df[df["Sample"]=="Proband"]
print(controls.shape)
print(proband.shape)
print(len(controls["predicted.celltype.l2"].unique()))
print(len(proband["predicted.celltype.l2"].unique()))

In [None]:
x,y = 'predicted.celltype.l1', 'Aneu.l3'

(controls.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index()
                          .pipe((sns.catplot,'data'), x=x,y='percent',hue=y, kind='bar',
                                hue_order=["Euploid", "Aneuploid"], height=4, aspect=1.5))

In [None]:
x,y = 'predicted.celltype.l1', 'Aneu.l3'

(proband.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index()
                          .pipe((sns.catplot,'data'), x=x,y='percent',hue=y, kind='bar',
                                hue_order=["Euploid", "Aneuploid"], height=4, aspect=1.5))

In [None]:
x,y = 'predicted.celltype.l1', 'Aneu.l3'

percent_controls = (controls.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent_controls, hue=y, hue_order=["Aneuploid"], 
                kind="bar", height=3, aspect=1.5)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

In [None]:
x,y = 'predicted.celltype.l1', 'Aneu.l3'

# Percentage of aneuploid cells per L1 cell type in the proband.
# Stored as `percent_proband` so the controls table in `percent_controls` is not overwritten
# with proband data.
percent_proband = (proband.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent_proband, hue=y, hue_order=["Aneuploid"],
                kind="bar", height=3, aspect=1.5)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

In [None]:
x,y = 'predicted.celltype.l2', 'Aneu.l3'
percent_controls = (controls.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent_controls, hue=y, kind="bar", height=3, aspect=3)
plt.xticks(rotation=90)
plt.show()

In [None]:
x,y = 'predicted.celltype.l2', 'Aneu.l3'
percent_controls = (controls.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent_controls, hue=y, hue_order=["Aneuploid"],
                kind="bar", height=3, aspect=3)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

In [None]:
x,y = 'predicted.celltype.l2', 'Aneu.l3'
percent_proband = (proband.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent_proband, hue=y, hue_order=["Euploid", "Aneuploid"],
                kind="bar", height=3, aspect=3)
plt.xticks(rotation=90)
plt.show()

In [None]:
x,y = 'predicted.celltype.l2', 'Aneu.l3'
percent_proband = (proband.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent_proband, hue=y, hue_order=["Aneuploid"],
                kind="bar", height=3, aspect=3)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

In [None]:
controls.groupby(x)[y].value_counts()

In [None]:
x,y = 'predicted.celltype.l3', 'Aneu.l3'
df = controls

percent = (df.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent, hue=y, hue_order=["Aneuploid"],
                kind="bar", height=3, aspect=4)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

In [None]:
x,y = 'predicted.celltype.l3', 'Aneu.l3'
df = proband

percent = (df.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent, hue=y, hue_order=["Aneuploid"],
                kind="bar", height=3, aspect=4)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

In [None]:
x,y = 'predicted.celltype.l2', 'Aneu.l3'
proband.groupby(x)[y].value_counts(normalize=True)

In [None]:
x,y = 'predicted.celltype.l2', 'Aneu.l3'
proband.groupby(x)[y].value_counts()

## Clean cell types - Remove populations under-represented in the proband (keeping those with n > 10)

In [None]:
print(ad_proband.shape)
print(ad_controls.shape)

In [None]:
# Celltypes to remove
# One boolean mask per L2 label; True marks the cells that do NOT carry that label.
l2_labels = adata.obs["predicted.celltype.l2"]

f_ASDC = l2_labels != "ASDC"
f_CD4_Naive = l2_labels != "CD4 Naive"
f_CD4_Proliferating = l2_labels != "CD4 Proliferating"
f_CD8_Proliferating = l2_labels != "CD8 Proliferating"
f_CD8_TCM = l2_labels != "CD8 TCM"
f_NK_Proliferating = l2_labels != "NK Proliferating"
f_Plasmablast = l2_labels != "Plasmablast"
f_cDC1 = l2_labels != "cDC1"
f_dnT = l2_labels != "dnT"

f_Platelet = l2_labels != "Platelet"

In [None]:
# Drop the L2 cell types marked for removal.
# The labels are listed here (instead of AND-ing the f_* masks) because later cells redefine
# several f_* names against other AnnData objects, which breaks this cell when it is re-run.
removed_l2_celltypes = ["ASDC", "CD4 Naive", "CD4 Proliferating", "CD8 Proliferating", "CD8 TCM",
                        "NK Proliferating", "Plasmablast", "cDC1", "dnT", "Platelet"]
ad_L2_clean = adata[~adata.obs["predicted.celltype.l2"].isin(removed_l2_celltypes)]

In [None]:
ad_proband_L2_clean = ad_L2_clean[ad_L2_clean.obs["Sample"]=="Proband"]
ad_controls_L2_clean = ad_L2_clean[ad_L2_clean.obs["Sample2"]=="Control"]

In [None]:
print(ad_proband_L2_clean.shape)
print(ad_controls_L2_clean.shape)

### Plots

### L1

In [None]:
# Cells per L1 cell type in the cleaned controls (sorted by label).
# Grouping the column by itself before value_counts only duplicated the index level.
ad_controls_L2_clean.obs["predicted.celltype.l1"].value_counts().sort_index()

In [None]:
x,y = 'predicted.celltype.l1', 'CleanAneu.l3'
percent = (ad_controls_L2_clean.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent, hue=y, hue_order=["CleanAneu"],
                kind="bar", height=3, aspect=1)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

In [None]:
ad_controls_L2_clean.obs.groupby('predicted.celltype.l1')["CleanAneu.l3"].value_counts()

In [None]:
x,y = 'predicted.celltype.l1', 'CleanAneu.l3'
percent = (ad_proband_L2_clean.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent, hue=y, hue_order=["CleanAneu"],
                kind="bar", height=3, aspect=1)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

### L2

In [None]:
x,y = 'predicted.celltype.l2', 'Aneu.l3'
percent = (ad_controls_L2_clean.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent, hue=y, hue_order=["Aneuploid"],
                kind="bar", height=3, aspect=2.4)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

In [None]:
x,y = 'predicted.celltype.l2', 'Aneu.l3'
percent = (ad_proband_L2_clean.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent, hue=y, hue_order=["Aneuploid"],
                kind="bar", height=3, aspect=2.4)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

### L3

In [None]:
# Cells per L3 cell type in the cleaned controls (sorted by label).
# Grouping the column by itself before value_counts only duplicated the index level.
ad_controls_L2_clean.obs["predicted.celltype.l3"].value_counts().sort_index()

In [None]:
# Celltypes to remove
# One boolean mask per L3 label, computed on the L2-cleaned object; True marks the cells
# that do NOT carry that label.
l3_labels = ad_L2_clean.obs["predicted.celltype.l3"]

f_ASDC_mDC = l3_labels != "ASDC_mDC"
f_CD4_TEM_3 = l3_labels != "CD4 TEM_3"
f_CD8_TCM_2 = l3_labels != "CD8 TCM_2"
f_CD8_TCM_3 = l3_labels != "CD8 TCM_3"
f_CD8_TEM_3 = l3_labels != "CD8 TEM_3"
f_CD8_TEM_4 = l3_labels != "CD8 TEM_4"
f_Platelet_L3 = l3_labels != "Platelet"
f_dnT_2 = l3_labels != "dnT_2"

f_B_memory_lambda = l3_labels != "B memory lambda"
f_CD4_Naive = l3_labels != "CD4 Naive"
f_CD8_Naive = l3_labels != "CD8 Naive"
f_NK_1 = l3_labels != "NK_1"
f_cDC2_1 = l3_labels != "cDC2_1"


In [None]:
# Drop the L3 cell types marked for removal from the L2-cleaned object.
# The labels are listed here (instead of AND-ing the f_* masks) because later cells redefine
# several f_* names against other AnnData objects, which breaks this cell when it is re-run.
removed_l3_celltypes = ["ASDC_mDC", "CD4 TEM_3", "CD8 TCM_2", "CD8 TCM_3", "CD8 TEM_3", "CD8 TEM_4",
                        "Platelet", "dnT_2", "B memory lambda", "CD4 Naive", "CD8 Naive",
                        "NK_1", "cDC2_1"]
ad_L3_clean = ad_L2_clean[~ad_L2_clean.obs["predicted.celltype.l3"].isin(removed_l3_celltypes)]

# Per-group subsets of the L3-cleaned data.
ad_proband_L3_clean = ad_L3_clean[ad_L3_clean.obs["Sample"]=="Proband"]
ad_controls_L3_clean = ad_L3_clean[ad_L3_clean.obs["Sample2"]=="Control"]

In [None]:
x,y = 'predicted.celltype.l3', 'Aneu.l3'
percent = (ad_controls_L3_clean.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent, hue=y, hue_order=["Aneuploid"],
                kind="bar", height=3, aspect=3.4)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

In [None]:
x,y = 'predicted.celltype.l3', 'Aneu.l3'
percent = (ad_proband_L3_clean.obs.groupby(x)[y].value_counts(normalize=True)
                          .mul(100)
                          .rename('percent')
                          .reset_index())
g = sns.catplot(x=x, y="percent", data=percent, hue=y, hue_order=["Aneuploid"],
                kind="bar", height=3, aspect=3.4)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

In [None]:
# Cells per L3 cell type in the cleaned proband (sorted by label).
# Grouping the column by itself before value_counts only duplicated the index level.
ad_proband_L3_clean.obs["predicted.celltype.l3"].value_counts().sort_index()

### Chromosomes

In [None]:
adata.obs["GLs"].value_counts(normalize=True)

In [None]:
aneu_proband = aneu[aneu.obs["Sample"]=="Proband"]

In [None]:
aneu_proband.obs["GLs"].value_counts(normalize=True)

In [None]:
plt.subplots(figsize=(3,3))
aneu_proband.obs["GLs"].value_counts(normalize=True)[:12].plot(kind="bar")
plt.ylim(0,0.15)

In [None]:
# Twelve most frequent "GLs" values among the proband's aneuploid cells, as percentages.
# rename_axis("index") fixes the label column name, which the barplot in the next cell reads
# as x="index"; without it pandas >= 2.0 names that column "GLs".
pepe = (aneu_proband.obs["GLs"].value_counts(normalize=True)[:12]
                    .mul(100)
                    .rename('percent')
                    .rename_axis("index")
                    .reset_index())
pepe.plot(kind="bar")

In [None]:
sns.barplot(x="index", y="percent", data=pepe)

In [None]:
# Twelve most frequent "GLs" values among the proband's aneuploid cells, as percentages.
# rename_axis("index") fixes the label column name used as x below; without it
# pandas >= 2.0 names that column "GLs". The table is already limited to 12 rows.
plot = (aneu_proband.obs["GLs"].value_counts(normalize=True)[:12]
                    .mul(100)
                    .rename('percent')
                    .rename_axis("index")
                    .reset_index())
g = sns.catplot(x="index", y="percent", data=plot, #hue=y, hue_order=["Aneuploid"],
                kind="bar", height=3, aspect=3.4)
plt.xticks(rotation=90)
plt.ylim(0,100)
plt.show()

In [None]:
aneu_proband.obs["AneuScore"].value_counts(normalize=True)

In [None]:
plt.subplots(figsize=(3,4))
aneu_proband.obs["AneuScore"].value_counts(normalize=True).plot(kind="bar")

In [None]:
aneu_proband_AS2 = aneu_proband[aneu_proband.obs["AneuScore"]>1]
plt.subplots(figsize=(3,3))
aneu_proband_AS2.obs["GLs"].value_counts(normalize=True)[:12].plot(kind="bar")
plt.ylim(0,0.05)

In [None]:
sc.pl.umap(aneu_proband, color=["G12", "G18", "G21"])

In [None]:
sc.pl.umap(aneu_proband, color=["G07", "G08", "G11"])

In [None]:
sc.pl.umap(aneu_proband, color=["G12", "L12", "G18", "G21"])

### Gains / Losses per celltype

#### L1

In [None]:
ad_aneu_L2_clean = ad_L2_clean[ad_L2_clean.obs["Aneu.l3"]=="Aneuploid"]
ad_aneu_proband_L2_clean = ad_aneu_L2_clean[ad_aneu_L2_clean.obs["Sample"]=="Proband"]

In [None]:
ad_aneu_proband_L2_clean.obs.groupby("predicted.celltype.l1")["predicted.celltype.l1"].count()

In [None]:
ad_aneu_proband_L2_clean = ad_aneu_proband_L2_clean[ad_aneu_proband_L2_clean.obs["predicted.celltype.l1"] != "other"]

In [None]:
df_gains = ad_aneu_proband_L2_clean.obs[["predicted.celltype.l1", "G01", "G02", "G03", "G04", "G05", "G06", "G07", "G08",
                             "G09", "G10", "G11", "G12", "G13", "G14", "G15", "G16", "G17", "G18", "G19", "G20",
                             "G21", "G22"]]
df_losses = ad_aneu_proband_L2_clean.obs[["predicted.celltype.l1", "L01", "L02", "L03", "L04", "L05", "L06", "L07", "L08",
                              "L09", "L10", "L11", "L12", "L13", "L14", "L15", "L16", "L17", "L18", "L19", "L20",
                              "L21", "L22"]]

In [None]:
df2_gains = df_gains.groupby("predicted.celltype.l1").count()
df2_losses = df_losses.groupby("predicted.celltype.l1").count()

In [None]:
# Number of aneuploid proband cells per L1 cell type, as a one-column ("Total") table.
totals = (
    ad_aneu_proband_L2_clean.obs
    .groupby("predicted.celltype.l1")["predicted.celltype.l1"]
    .count()
    .to_frame(name="Total")
)
totals

In [None]:
df2_gains["Total"] = totals
df2_losses["Total"] = totals

In [None]:
df3_gains = df2_gains.iloc[:,:-1].div(df2_gains.Total, axis=0)
df3_losses = df2_losses.iloc[:,:-1].div(df2_losses.Total, axis=0)

In [None]:
# "scattermaps_mm.py" needs to be in the same folder as this notebook
import scattermaps_mm
from scattermaps_mm import scattermap

In [None]:
plt.figure(figsize=(8,3))
ax = scattermap(df3_gains, cmap='Reds', factor=2, square=True)
plt.tight_layout()
#plt.savefig(DESKTOP + "GSEApy_dotPlot.png")

In [None]:
plt.figure(figsize=(8,3))
ax = scattermap(df3_losses, cmap='Blues', factor=2, vmax=0.8, square=True)
plt.tight_layout()
#plt.savefig(DESKTOP + "GSEApy_dotPlot.png")

#### L2

In [None]:
ad_aneu_proband_L2_clean.obs.groupby("predicted.celltype.l2")["predicted.celltype.l2"].count()

In [None]:
f_ILC = ad_aneu_proband_L2_clean.obs["predicted.celltype.l2"] != "ILC"
f_pDC = ad_aneu_proband_L2_clean.obs["predicted.celltype.l2"] != "pDC"
ad_aneu_proband_L2_clean = ad_aneu_proband_L2_clean[f_ILC & f_pDC]

In [None]:
ad_aneu_proband_L2_clean.obs.groupby("predicted.celltype.l2")["predicted.celltype.l2"].count()

In [None]:
df_gains = ad_aneu_proband_L2_clean.obs[["predicted.celltype.l2", "G01", "G02", "G03", "G04", "G05", "G06", "G07", "G08",
                             "G09", "G10", "G11", "G12", "G13", "G14", "G15", "G16", "G17", "G18", "G19", "G20",
                             "G21", "G22"]]
df_losses = ad_aneu_proband_L2_clean.obs[["predicted.celltype.l2", "L01", "L02", "L03", "L04", "L05", "L06", "L07", "L08",
                              "L09", "L10", "L11", "L12", "L13", "L14", "L15", "L16", "L17", "L18", "L19", "L20",
                              "L21", "L22"]]

In [None]:
df2_gains = df_gains.groupby("predicted.celltype.l2").count()
df2_losses = df_losses.groupby("predicted.celltype.l2").count()

In [None]:
totals = ad_aneu_proband_L2_clean.obs.groupby("predicted.celltype.l2")["predicted.celltype.l2"].count()
totals = pd.DataFrame(totals)
totals = totals.rename(columns={"predicted.celltype.l2": "Total"})
totals

In [None]:
df2_gains["Total"] = totals
df2_losses["Total"] = totals

In [None]:
df3_gains = df2_gains.iloc[:,:-1].div(df2_gains.Total, axis=0)
df3_losses = df2_losses.iloc[:,:-1].div(df2_losses.Total, axis=0)

In [None]:
# "scattermaps_mm.py" needs to be in the same folder as this notebook
import scattermaps_mm
from scattermaps_mm import scattermap

In [None]:
plt.figure(figsize=(9,9))
ax = scattermap(df3_gains, cmap='Reds', factor=2.5, square=True)
plt.tight_layout()
#plt.savefig(DESKTOP + "GSEApy_dotPlot.png")

In [None]:
plt.figure(figsize=(23,4))
ax = scattermap(df3_gains, cmap='Reds', factor=2.5, square=True)
plt.tight_layout()
#plt.savefig(DESKTOP + "GSEApy_dotPlot.png")

In [None]:
plt.figure(figsize=(9,9))
ax = scattermap(df3_losses, cmap='Blues', factor=2.5, vmax=1, square=True)
plt.tight_layout()
#plt.savefig(DESKTOP + "GSEApy_dotPlot.png")

In [None]:
plt.figure(figsize=(23,4))
ax = scattermap(df3_losses, cmap='Blues', factor=2.5, vmax=1, square=True)
plt.tight_layout()
#plt.savefig(DESKTOP + "GSEApy_dotPlot.png")

#### L3

In [None]:
ad_aneu_proband_L2_clean.obs.groupby("predicted.celltype.l3")["predicted.celltype.l3"].count()

In [None]:
# Celltypes to remove
f_B_memory_lambda = ad_aneu_proband_L2_clean.obs["predicted.celltype.l3"]!= "B memory lambda"
f_CD4_TEM_3 = ad_aneu_proband_L2_clean.obs["predicted.celltype.l3"]!= "CD4 TEM_3"
f_CD8_TCM_3 = ad_aneu_proband_L2_clean.obs["predicted.celltype.l3"]!= "CD8 TCM_3"
f_CD8_TEM_3 = ad_aneu_proband_L2_clean.obs["predicted.celltype.l3"]!= "CD8 TEM_3"
f_ILC = ad_aneu_proband_L2_clean.obs["predicted.celltype.l3"]!= "ILC"
f_Platelet = ad_aneu_proband_L2_clean.obs["predicted.celltype.l3"]!= "Platelet"
f_cDC2_1 = ad_aneu_proband_L2_clean.obs["predicted.celltype.l3"]!= "cDC2_1"
f_dnT_2 = ad_aneu_proband_L2_clean.obs["predicted.celltype.l3"]!= "dnT_2"

"""
f_ASDC_mDC = ad_L2_clean.obs["predicted.celltype.l3"]!= "ASDC_mDC"
f_CD8_TCM_2 = ad_L2_clean.obs["predicted.celltype.l3"]!= "CD8 TCM_2"

f_CD8_TEM_4 = ad_L2_clean.obs["predicted.celltype.l3"]!= "CD8 TEM_4"
f_CD4_Naive = ad_L2_clean.obs["predicted.celltype.l3"]!= "CD4 Naive"
f_CD8_Naive = ad_L2_clean.obs["predicted.celltype.l3"]!= "CD8 Naive"
f_NK_1 = ad_L2_clean.obs["predicted.celltype.l3"]!= "NK_1"
"""

ad_aneu_proband_L3_clean = ad_aneu_proband_L2_clean[f_B_memory_lambda & f_CD4_TEM_3 & f_CD8_TCM_3 &
                                                    f_CD8_TEM_3 &
                                                    f_ILC & f_Platelet & f_cDC2_1 & f_dnT_2]
                                        

In [None]:
df_gains = ad_aneu_proband_L3_clean.obs[["predicted.celltype.l3", "G01", "G02", "G03", "G04", "G05", "G06", "G07", "G08",
                             "G09", "G10", "G11", "G12", "G13", "G14", "G15", "G16", "G17", "G18", "G19", "G20",
                             "G21", "G22"]]
df_losses = ad_aneu_proband_L3_clean.obs[["predicted.celltype.l3", "L01", "L02", "L03", "L04", "L05", "L06", "L07", "L08",
                              "L09", "L10", "L11", "L12", "L13", "L14", "L15", "L16", "L17", "L18", "L19", "L20",
                              "L21", "L22"]]

In [None]:
df2_gains = df_gains.groupby("predicted.celltype.l3").count()
df2_losses = df_losses.groupby("predicted.celltype.l3").count()

In [None]:
totals = ad_aneu_proband_L3_clean.obs.groupby("predicted.celltype.l3")["predicted.celltype.l3"].count()
totals = pd.DataFrame(totals)
totals = totals.rename(columns={"predicted.celltype.l3": "Total"})
totals

In [None]:
df2_gains["Total"] = totals
df2_losses["Total"] = totals

In [None]:
df3_gains = df2_gains.iloc[:,:-1].div(df2_gains.Total, axis=0)
df3_losses = df2_losses.iloc[:,:-1].div(df2_losses.Total, axis=0)

In [None]:
# "scattermaps_mm.py" needs to be in the same folder as this notebook
import scattermaps_mm
from scattermaps_mm import scattermap

In [None]:
# Scatter-map of df3_gains (per L3 cell type: non-null gain entries / cells in that type).
# NOTE(review): no vmax is set here, while the losses plots pin vmax=1 — confirm the
# colour scales are meant to differ.
plt.figure(figsize=(9,8))
ax = scattermap(df3_gains, cmap='Reds', factor=2.5, square=True)
plt.tight_layout()
# Saving is disabled; the file name below looks left over from a GSEApy plot.
#plt.savefig(DESKTOP + "GSEApy_dotPlot.png")

In [None]:
# Same gains scatter-map as the previous cell, drawn on a wide (23 x 4) figure.
plt.figure(figsize=(23,4))
ax = scattermap(df3_gains, cmap='Reds', factor=2.5, square=True)
plt.tight_layout()
# Saving is disabled; the file name below looks left over from a GSEApy plot.
#plt.savefig(DESKTOP + "GSEApy_dotPlot.png")

In [None]:
# Scatter-map of df3_losses (per L3 cell type: non-null loss entries / cells in that type),
# with the colour scale capped at vmax=1.
plt.figure(figsize=(9,8))
ax = scattermap(df3_losses, cmap='Blues', factor=2.5, vmax=1, square=True)
plt.tight_layout()
# Saving is disabled; the file name below looks left over from a GSEApy plot.
#plt.savefig(DESKTOP + "GSEApy_dotPlot.png")

In [None]:
# Same losses scatter-map as the previous cell, drawn on a wide (23 x 4) figure.
plt.figure(figsize=(23,4))
ax = scattermap(df3_losses, cmap='Blues', factor=2.5, vmax=1, square=True)
plt.tight_layout()
# Saving is disabled; the file name below looks left over from a GSEApy plot.
#plt.savefig(DESKTOP + "GSEApy_dotPlot.png")

### Analysis of B-cells and gdT cells

### B-cells

In [None]:
# List the distinct labels in the G12 grouping column (used to pick the groups below)
adata.obs["Sample2_predicted.celltype.l1_G12"].unique()

In [None]:
# UMAP of all cells coloured by the G12 grouping column; only the two proband
# aneuploid B-cell groups listed in `groups` are highlighted.
sc.pl.umap(adata, color=["Sample2_predicted.celltype.l1_G12",],
           groups = ["Proband_B_CleanAneuG12", "Proband_B_CleanAneu"], 
           palette="prism", s=50,
           # Alternative explicit colours (disabled):
           #palette={"Proband_B_CleanAneuG12": "red", "Proband_B_CleanAneu": "blue"}
          )

In [None]:
# Split proband B cells by their label in the G12 grouping column
g12_labels = adata.obs["Sample2_predicted.celltype.l1_G12"]
eu = adata[g12_labels == "Proband_B_CleanEU"]
G12 = adata[g12_labels == "Proband_B_CleanAneuG12"]
noG12 = adata[g12_labels == "Proband_B_CleanAneu"]

# Per-cell values for the obs columns from "chr1" through "chr22"
eu_data, G12_data, noG12_data = (
    subset.obs.loc[:, "chr1":"chr22"] for subset in (eu, G12, noG12)
)

# Report (n_cells, n_columns) for each subset
for chrom_table in (eu_data, G12_data, noG12_data):
    print(chrom_table.shape)

In [None]:
# Heatmap of the chr1..chr22 columns for the "Proband_B_CleanAneu" cells,
# on a diverging blue-white-red scale clipped to [-1.5, 1.5].
plt.subplots(figsize=(14,1))
sns.heatmap(noG12_data, vmin=-1.5, vmax=1.5, cmap="bwr")

In [None]:
# "Proband_B_CleanEU" cells: chr1..chr22 values with clustering disabled on both axes,
# so rows and columns keep their original order.
sns.clustermap(eu_data, vmin=-1.7, vmax=1.7, cmap="bwr", figsize=(10,2),
              col_cluster=False, row_cluster=False)

In [None]:
# "Proband_B_CleanAneu" cells: columns keep their order (col_cluster=False);
# row clustering is left at the seaborn default.
sns.clustermap(noG12_data, vmin=-1.7, vmax=1.7, cmap="bwr", figsize=(10,1),
              col_cluster=False)

In [None]:
# "Proband_B_CleanAneuG12" cells: columns keep their order (col_cluster=False);
# row clustering is left at the seaborn default.
sns.clustermap(G12_data, vmin=-1.7, vmax=1.7, cmap="bwr", figsize=(10,5),
              col_cluster=False)

In [None]:
# Cells whose L1 predicted cell type is "other"
other = adata[adata.obs["predicted.celltype.l1"]=="other"]

In [None]:
# Break the L1 "other" cells down by their L2 predicted cell type
other.obs.groupby("predicted.celltype.l1")["predicted.celltype.l2"].value_counts()

### gdT cells

In [None]:
# List the distinct labels in the G18 grouping column (used to pick the groups below)
adata.obs["Sample2_predicted.celltype.l2_G18"].unique()

In [None]:
# UMAP of all cells coloured by the G18 grouping column; only the two proband
# aneuploid gdT groups listed in `groups` are highlighted.
sc.pl.umap(adata, color=["Sample2_predicted.celltype.l2_G18",],
           groups = ["Proband_gdT_CleanAneuG18", "Proband_gdT_CleanAneu"], 
           palette="prism", s=50,
           # Disabled explicit palette; NOTE(review): its keys are the B-cell labels
           # copied from the B-cell UMAP and would need the gdT labels to be used here.
           #palette={"Proband_B_CleanAneuG12": "red", "Proband_B_CleanAneu": "blue"}
          )

In [None]:
# Split proband gdT cells by their label in the G18 grouping column
g18_labels = adata.obs["Sample2_predicted.celltype.l2_G18"]
eu = adata[g18_labels == "Proband_gdT_CleanEU"]
G18 = adata[g18_labels == "Proband_gdT_CleanAneuG18"]
noG18 = adata[g18_labels == "Proband_gdT_CleanAneu"]

# Per-cell values for the obs columns from "chr1" through "chr22"
eu_data, G18_data, noG18_data = (
    subset.obs.loc[:, "chr1":"chr22"] for subset in (eu, G18, noG18)
)

# Report (n_cells, n_columns) in the order: euploid, aneuploid without G18, aneuploid with G18
for chrom_table in (eu_data, noG18_data, G18_data):
    print(chrom_table.shape)

In [None]:
# "Proband_gdT_CleanEU" cells: chr1..chr22 values with clustering disabled on both axes,
# so rows and columns keep their original order.
sns.clustermap(eu_data, vmin=-1.7, vmax=1.7, cmap="bwr", figsize=(10,4),
              col_cluster=False, row_cluster=False)

In [None]:
# "Proband_gdT_CleanAneu" cells: columns keep their order; rows are clustered
# (the row_cluster=False option is deliberately left commented out).
sns.clustermap(noG18_data, vmin=-1.7, vmax=1.7, cmap="bwr", figsize=(10,2),
              col_cluster=False, ) #row_cluster=False

In [None]:
# "Proband_gdT_CleanAneuG18" cells: columns keep their order; rows are clustered
# (the row_cluster=False option is deliberately left commented out).
sns.clustermap(G18_data, vmin=-1.7, vmax=1.7, cmap="bwr", figsize=(10,3),
              col_cluster=False, ) #row_cluster=False

In [None]:
# Marker genes grouped by category; despite the "_list" suffix this is a dict mapping
# a group label to its gene symbols, which sc.pl.dotplot uses to bracket the genes.
gdT_2_4_list = {"Delta-gamma chains": ["TRGC1", "TRGC2", "TRDV1", "TRDV2", "TRDC"], 
                "CD3 chains": ["CD3G", "CD3D",],
                "Cytotoxicity": ["KLRC2", "KLRD1", "KLRG1", "GZMH", "FCRL6", "CST7",], 
                "Other gdT markers": ["TIGIT", "IKZF2", "CMC1"]
               }

# Dot plot of those markers in the L1 "other T" cells, one row per sample;
# standard_scale="var" rescales each gene to the 0-1 range across groups.
sc.pl.dotplot(ad_otherT, gdT_2_4_list, 'Sample', standard_scale="var") 

In [None]:
# All cells predicted as gdT at L2
gdT = adata[adata.obs["predicted.celltype.l2"] == "gdT"]

# gdT cells carrying one of the four "Clean" labels of the G18 grouping column
gdT_G18_labels = [
    "Control_gdT_CleanEU",
    "Proband_gdT_CleanEU",
    "Proband_gdT_CleanAneu",
    "Proband_gdT_CleanAneuG18",
]
gdT_G18 = adata[adata.obs["Sample2_predicted.celltype.l2_G18"].isin(gdT_G18_labels)]

In [None]:
# Marker dot plot for all gdT cells, grouped by sample
sc.pl.dotplot(gdT, gdT_2_4_list, 'Sample', standard_scale="var") 
# Same markers for the "Clean" gdT groups, rows forced into the order
# control euploid -> proband euploid -> proband aneuploid -> proband aneuploid G18
sc.pl.dotplot(gdT_G18, gdT_2_4_list, 'Sample2_predicted.celltype.l2_G18', standard_scale="var",
             categories_order=["Control_gdT_CleanEU", "Proband_gdT_CleanEU", "Proband_gdT_CleanAneu",
                               "Proband_gdT_CleanAneuG18"]) 

## 4. Differential Expression <a class="anchor" id="diffexp"></a>
[Back to Menu](#menu)

In [None]:
# Functions
def dedf_to_rnk(de_df, outdir, dataname, samplename):
    """Write a GSEA ``.rnk`` file from a differential-expression table.

    Parameters
    ----------
    de_df : DataFrame with at least the columns ``names`` and ``scores``.
    outdir : directory (string) the ``<samplename>.rnk`` file is written to.
    dataname : unused; kept so existing callers keep working.
    samplename : base name of the output file.

    Returns
    -------
    DataFrame with the columns ``#names`` (upper-cased gene names) and ``scores``.
    The input table is not modified.
    """
    ranked = de_df.loc[:, ["names", "scores"]].rename(columns={"names": "#names"})
    # Gene names are upper-cased before writing
    ranked["#names"] = ranked["#names"].astype(str).str.upper()

    rnk_path = outdir + "/" + samplename + ".rnk"
    ranked.to_csv(rnk_path, sep="\t", index=False)
    return ranked

def rnk_to_geseapy(pre_res, rnk, gset, outdir, samplename):
    """Run GSEApy prerank and append its results to an accumulated table.

    Parameters
    ----------
    pre_res : DataFrame of previously accumulated results; its LAST column is
        used to store the sample label of each row.
    rnk : ranked gene list passed to ``gspy.prerank``.
    gset : path of the gene-set file WITHOUT the ``.gmt`` extension.
    outdir : directory for the GSEApy output and for ``GSEApy_results.tsv``.
    samplename : label written into the last column of the newly added rows.

    Returns
    -------
    New DataFrame: ``pre_res`` followed by the rows of this run. ``pre_res``
    itself is not modified.
    """
    gene_set = gset + ".gmt"
    pre_res_new = gspy.prerank(rnk=rnk, gene_sets=gene_set, processes=4, permutation_num=1000,
                               outdir=outdir, graph_num=60, format='png', seed=6)

    # Sort once and reuse for both the per-comparison file and the combined table
    new_rows = pd.DataFrame(pre_res_new.res2d.sort_index())
    new_rows.to_csv(outdir + "/" + "GSEApy_results.tsv", sep="\t")

    n = new_rows.shape[0]                     # number of result rows added by this run
    if n == 0:
        # Nothing to append. Without this guard `iloc[-0:, -1]` would select EVERY
        # row and overwrite the sample label of all previously accumulated results.
        return pre_res.copy()

    pre_res = pd.concat([pre_res, new_rows])
    pre_res.iloc[-n:, -1] = samplename        # tag only the new rows (last column)
    return pre_res

def run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res, gset=None):
    """Run differential expression for one comparison, then GSEA on its ranking.

    Parameters
    ----------
    DESKTOP : base output directory.
    database : AnnData object the comparison is computed on.
    dataname : short prefix used in output folder and file names.
    score : column of ``database.obs`` holding the group labels.
    groupA : label tested; groupB : label used as the reference.
    method : test passed to ``sc.tl.rank_genes_groups`` ('t-test' or 'wilcoxon').
    pre_res : accumulated GSEA results table that this run is appended to.
    gset : gene-set path without ``.gmt``. Optional: when omitted, the
        notebook-level variable ``gset`` is used, as before.

    Returns
    -------
    The accumulated GSEA results table returned by ``rnk_to_geseapy``.

    Side effects: creates ``<DESKTOP>/<dataname>_<groupA>_vs_<groupB>/`` and writes
    the differential-expression table, the ``.rnk`` file and the GSEApy output there.
    """
    if gset is None:
        # Backward compatible fallback to the notebook-level `gset`
        try:
            gset = globals()["gset"]
        except KeyError:
            raise NameError(
                "name 'gset' is not defined: pass gset=... or define it before calling run_all"
            ) from None

    # Organize variables & folders
    sample = groupA + "_vs_" + groupB
    samplename = dataname + "_" + sample
    # os.path.join avoids a doubled separator when DESKTOP already ends with "/"
    outdir = os.path.join(DESKTOP, samplename)
    # Creates missing parent folders and tolerates an existing folder
    os.makedirs(outdir, exist_ok=True)

    # Differential expression
    sc.tl.rank_genes_groups(database, score, groups=[groupA], reference=groupB, method=method,
                            use_raw=True, log_transformed=True, n_genes=-1)
    de_df = sc.get.rank_genes_groups_df(database, group=groupA)
    de_df.to_csv(outdir + "/" + samplename + ".tsv", sep='\t', index=False)

    with rc_context({'figure.figsize': (9, 4)}):
        sc.pl.rank_genes_groups(database, n_genes=50, save="_" + groupA + "_vs_" + groupB + ".png")

    # Save .rnk list and run GSEA
    rnk = dedf_to_rnk(de_df, outdir, dataname, samplename)
    pre_res = rnk_to_geseapy(pre_res, rnk, gset, outdir, samplename)

    return pre_res

### 4.1. For each group of similar comparisons, select the following parameters:
Gene sets:
- HALLMARKS (51 genesets): `h.all.v7.2.symbols`
- Combined HALLMARKS and others (2000 genesets): `h.c2.cp.v7.2.symbols_mix`
- MAD1_selected: `MAD1_selected`, `MAD1_selected_30`

In [None]:
# Folder with the gene-set files; `gset` is the chosen set WITHOUT the ".gmt" extension
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = f"{GENESETS}MAD1_selected_30"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(f"{GENESETS}pre_res_empy.tsv", sep="\t", index_col="Term")

In [None]:
# EDIT: settings shared by every run_all(...) call in section 4.2
database = adata            # adata (all samples) or subset of data
dataname = "ad"             # prefix of the output folder / file names

method = 'wilcoxon'         # 't-test' or 'wilcoxon'

### 4.2. Run the following cells for each comparison in the same group

Columns 
- `Sample2_CleanAneu.l3`
- `Sample2_predicted.celltype.l2_CleanAneu.l3`
- `Sample2_predicted.celltype.l3_CleanAneu.l3`


## Proband vs controls

In [None]:
# Gene sets used for GSEA
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = f"{GENESETS}MAD1_selected_30"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(f"{GENESETS}pre_res_empy.tsv", sep="\t", index_col="Term")

# obs column and the two labels compared (groupA against reference groupB)
score = "Sample2"
groupA, groupB = "Proband", "Control"
pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

### Proband vs controls - L1

In [None]:
# Select Pathways
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = GENESETS + "MAD1_selected_30"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(GENESETS + "pre_res_empy.tsv", sep="\t", index_col="Term")

# Column in adata.obs; its labels have the form "<Sample2>_<L1 cell type>"
score = "Sample2_predicted.celltype.l1"

# One Proband-vs-Control comparison per L1 cell type, run in this order
l1_celltypes = ["CD8 T", "CD4 T", "NK", "Mono", "B", "other T", "DC", "other"]
for celltype in l1_celltypes:
    groupA = "Proband_" + celltype
    groupB = "Control_" + celltype
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

# Combined results of all comparisons above
pre_res.to_csv(DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")


## Proband vs Controls - L2

#### Sample2_predicted.celltype.l2

In [None]:
# Cells per "<Sample2>_<L2 cell type>" label, saved for reference
sample_numbers = adata.obs["Sample2_predicted.celltype.l2"].value_counts().to_frame()
sample_numbers.to_csv(DESKTOP + "sample_numbers.tsv", sep="\t")

In [None]:
# Select Pathways
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = GENESETS + "MAD1_selected_30"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(GENESETS + "pre_res_empy.tsv", sep="\t", index_col="Term")

# Column in adata.obs; its labels have the form "<Sample2>_<L2 cell type>"
score = "Sample2_predicted.celltype.l2"

# One Proband-vs-Control comparison per L2 cell type, run in this order
l2_celltypes = [
    "B intermediate", "B memory", "B naive",
    "CD14 Mono", "CD16 Mono",
    "CD4 TCM", "CD4 TEM",
    "CD8 Naive", "CD8 TCM", "CD8 TEM",
    "cDC2", "dnT", "gdT", "HSPC", "ILC", "MAIT",
    "NK", "NK Proliferating", "NK_CD56bright",
    "pDC", "Platelet", "Treg",
]
for celltype in l2_celltypes:
    groupA = "Proband_" + celltype
    groupB = "Control_" + celltype
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

# Combined results of all comparisons above
pre_res.to_csv(DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")

## Aneuploid vs Euploid

## All celltypes combined

In [None]:
# List the distinct labels available for the aneuploid-vs-euploid comparisons below
adata.obs["Sample2_CleanAneu.l3"].unique()

In [None]:
# Select Pathways
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = GENESETS + "MAD1_selected_30"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(GENESETS + "pre_res_empy.tsv", sep="\t", index_col="Term")

# Column in adata.obs holding the labels compared below
score = "Sample2_CleanAneu.l3"

# (groupA, groupB) label pairs; groupB is the reference
comparisons = [
    ("Proband_CleanAneu", "Control_CleanEU"),
    ("Proband_CleanAneu", "Proband_CleanEU"),
    ("Proband_CleanEU", "Control_CleanEU"),
]
for groupA, groupB in comparisons:
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

pre_res.to_csv(DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")

### Sample2_CleanAneu.l3.Stable (removing B_G12 and gdT_G18)

In [None]:
# Select Pathways (this section uses the large combined collection)
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = GENESETS + "h.c2.cp.v7.2.symbols_mix"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(GENESETS + "pre_res_empy.tsv", sep="\t", index_col="Term")

# Column in adata.obs holding the labels compared below
score = "Sample2_CleanAneu.l3.Stable"

# (groupA, groupB) label pairs; groupB is the reference
comparisons = [
    ("Proband_CleanAneu", "Control_CleanEU"),
    ("Proband_CleanAneu", "Proband_CleanEU"),
    ("Proband_CleanEU", "Control_CleanEU"),
]
for groupA, groupB in comparisons:
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

pre_res.to_csv(DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")

## L1

In [None]:
# Cells per "<Sample2>_<L1 cell type>_<aneuploidy status>" label
adata.obs["Sample2_predicted.celltype.l1_CleanAneu.l3"].value_counts()

In [None]:
# Labels usable for comparisons: drop those ending in "Dirty" and the parents' labels
excluded_prefixes = ("Mother", "Father")
list_samples = [
    label
    for label in adata.obs["Sample2_predicted.celltype.l1_CleanAneu.l3"].unique().tolist()
    if not label.endswith("Dirty") and not label.startswith(excluded_prefixes)
]

In [None]:
# Display the remaining labels
list_samples

In [None]:
# Cells per "<Sample2>_<L1 cell type>_<aneuploidy status>" label, saved for reference
sample_numbers = adata.obs["Sample2_predicted.celltype.l1_CleanAneu.l3"].value_counts().to_frame()
sample_numbers.to_csv(DESKTOP + "sample_numbers.tsv", sep="\t")

In [None]:
# Select Pathways
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = GENESETS + "MAD1_selected_30"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(GENESETS + "pre_res_empy.tsv", sep="\t", index_col="Term")

# Column in adata.obs; labels have the form "<Sample2>_<L1 cell type>_<status>"
score = "Sample2_predicted.celltype.l1_CleanAneu.l3"

# Euploid proband cells vs euploid control cells, one comparison per L1 cell type
l1_celltypes = ["CD8 T", "CD4 T", "NK", "Mono", "B", "other T", "DC", "other"]
for celltype in l1_celltypes:
    groupA = "Proband_" + celltype + "_CleanEU"
    groupB = "Control_" + celltype + "_CleanEU"
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

pre_res.to_csv(DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")

### B-cells with G12 vs non-G12

In [None]:
# Cells per label of the B-cell G12 grouping column, saved for reference
sample_numbers = adata.obs["Sample2_predicted.celltype.l1_G12"].value_counts().to_frame()
sample_numbers.to_csv(DESKTOP + "sample_numbers.tsv", sep="\t")

In [None]:
# Gene sets used for GSEA
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = GENESETS + "MAD1_selected_30"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(GENESETS + "pre_res_empy.tsv", sep="\t", index_col="Term")

score = "Sample2_predicted.celltype.l1_G12"

# (groupA, groupB) label pairs; groupB is the reference
comparisons = [
    ("Proband_B_CleanAneuG12", "Control_B_CleanEU"),
    ("Proband_B_CleanAneu", "Control_B_CleanEU"),
    ("Proband_B_CleanAneuG12", "Proband_B_CleanEU"),
    ("Proband_B_CleanAneu", "Proband_B_CleanEU"),
    ("Proband_B_CleanEU", "Control_B_CleanEU"),
    ("Proband_B_CleanAneuG12", "Proband_B_CleanAneu"),
]
for groupA, groupB in comparisons:
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

pre_res.to_csv(DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")

### gdT-cells with G18 vs non-G18

In [None]:
# Cells per label of the gdT G18 grouping column, saved for reference
sample_numbers = adata.obs["Sample2_predicted.celltype.l2_G18"].value_counts().to_frame()
sample_numbers.to_csv(DESKTOP + "sample_numbers.tsv", sep="\t")

In [None]:
# Gene sets used for GSEA
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = GENESETS + "MAD1_selected_30"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(GENESETS + "pre_res_empy.tsv", sep="\t", index_col="Term")

score = "Sample2_predicted.celltype.l2_G18"

# (groupA, groupB) label pairs; groupB is the reference
comparisons = [
    ("Proband_gdT_CleanAneuG18", "Control_gdT_CleanEU"),
    ("Proband_gdT_CleanAneu", "Control_gdT_CleanEU"),
    ("Proband_gdT_CleanAneuG18", "Proband_gdT_CleanEU"),
    ("Proband_gdT_CleanAneu", "Proband_gdT_CleanEU"),
    ("Proband_gdT_CleanEU", "Control_gdT_CleanEU"),
    ("Proband_gdT_CleanAneuG18", "Proband_gdT_CleanAneu"),
]
for groupA, groupB in comparisons:
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

pre_res.to_csv(DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")

# OTHERS

### gdT G18 aneuploid vs non-G18 aneuploid

In [None]:
# This comparison uses the dedicated "gdTcells" gene-set file
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = f"{GENESETS}gdTcells"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(f"{GENESETS}pre_res_empy.tsv", sep="\t", index_col="Term")

# Aneuploid gdT cells with G18 against aneuploid gdT cells without it (reference)
score = "Sample2_predicted.celltype.l2_G18"
groupA, groupB = "Proband_gdT_CleanAneuG18", "Proband_gdT_CleanAneu"
pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

### B G12 aneuploid vs non-G12 aneuploid

In [None]:
# NOTE(review): this cell sets neither GENESETS nor gset; it reuses whatever an earlier
# cell left in the kernel. run_all reads gset as a global, so when the notebook is run
# top to bottom this B-cell comparison uses the "gdTcells" gene sets chosen in the
# previous cell — confirm that this is intended.

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(GENESETS + "pre_res_empy.tsv", sep="\t", index_col="Term")

score = "Sample2_predicted.celltype.l1_G12"

groupA = "Proband_B_CleanAneuG12"             # label for score in adata.obs
groupB = "Proband_B_CleanAneu"                # label for score in adata.obs (reference)
pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

### Only platelets

In [None]:
# Column in adata.obs holding the labels compared below
score = "Sample2_predicted.celltype.l2"

# This comparison uses the dedicated "platelets" gene-set file
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = f"{GENESETS}platelets"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(f"{GENESETS}pre_res_empy.tsv", sep="\t", index_col="Term")

# Proband platelets against control platelets (reference)
groupA, groupB = "Proband_Platelet", "Control_Platelet"
pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

#### Sample2_predicted.celltype.l2_CleanAneu.l3

Without data (n<3): "Plasmablast", "ASDC", "NK Proliferating", "CD4 Naive", "cDC2", "NK_CD56bright", "CD8 Proliferating", "pDC", "cDC1", "B memory", "CD4 Proliferating", "MAIT", "Treg", "CD8 Naive", "HSPC", "ILC"

In [None]:
# Cells per "<Sample2>_<L2 cell type>_<aneuploidy status>" label, saved for reference
sample_numbers = adata.obs["Sample2_predicted.celltype.l2_CleanAneu.l3"].value_counts().to_frame()
sample_numbers.to_csv(DESKTOP + "sample_numbers.tsv", sep="\t")

### A: Proband.CleanAneu vs Control.CleanEU

In [None]:
# Gene sets for section A (alternative collection: h.c2.cp.v7.2.symbols_mix)
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
gset = f"{GENESETS}MAD1_selected"

# Empty dataframe to accumulate all GSEA results
pre_res = pd.read_csv(f"{GENESETS}pre_res_empy.tsv", sep="\t", index_col="Term")

In [None]:
# Column in adata.obs; labels have the form "<Sample2>_<L2 cell type>_<status>"
score = "Sample2_predicted.celltype.l2_CleanAneu.l3"

# Aneuploid proband cells vs euploid control cells, one comparison per L2 cell type.
# Disabled in the original cell and therefore not listed: CD8 TCM, dnT, HSPC, ILC,
# NK Proliferating, pDC (presumably too few cells — confirm against sample_numbers.tsv).
l2_celltypes = [
    "B intermediate", "B memory", "B naive",
    "CD14 Mono", "CD16 Mono",
    "CD4 TCM", "CD4 TEM",
    "CD8 Naive", "CD8 TEM",
    "cDC2", "gdT", "MAIT",
    "NK", "NK_CD56bright",
    "Platelet", "Treg",
]
for celltype in l2_celltypes:
    groupA = "Proband_" + celltype + "_CleanAneu"
    groupB = "Control_" + celltype + "_CleanEU"
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

pre_res.to_csv(DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")

### B: Proband.CleanAneu vs Proband.CleanEU

In [None]:
# Gene-set inputs for the GSEApy runs of comparison B.
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
# Alternative collection used elsewhere in this notebook: h.c2.cp.v7.2.symbols_mix
# NOTE(review): gset is not passed to run_all explicitly; presumably it is read as a global.
gset = f"{GENESETS}MAD1_selected"

# Empty results template (indexed by "Term") that accumulates all GSEA results.
pre_res = pd.read_csv(f"{GENESETS}pre_res_empy.tsv", index_col="Term", sep="\t")

In [None]:
score = "Sample2_predicted.celltype.l2_CleanAneu.l3"   # column in adata.obs

# L2 cell types compared inside the proband (CleanAneu vs CleanEU), in run order.
# Deliberately not run (they were commented out in the copy-pasted version):
#   "CD8 TCM", "dnT", "HSPC", "ILC", "NK_Proliferating", "pDC"
# NOTE(review): the skipped label was spelled "NK_Proliferating", while the L2 list
# at the top of the notebook uses "NK Proliferating" - check before re-enabling it.
l2_cell_types = [
    "B intermediate", "B memory", "B naive",
    "CD14 Mono", "CD16 Mono",
    "CD4 TCM", "CD4 TEM",
    "CD8 Naive", "CD8 TEM",
    "cDC2", "gdT", "MAIT",
    "NK", "NK_CD56bright",
    "Platelet", "Treg",
]

# One run_all call per cell type; every call receives the results accumulated so
# far and its return value becomes the new accumulator.
for cell_type in l2_cell_types:
    groupA = f"Proband_{cell_type}_CleanAneu"   # label for score in adata.obs
    groupB = f"Proband_{cell_type}_CleanEU"     # label for score in adata.obs
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

# NOTE(review): the other comparison sections write to this same file name, so
# each section overwrites the previous one's output - confirm this is intended.
pre_res.to_csv(DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")

### C: Proband.CleanEU vs Control.CleanEU

In [None]:
# Gene-set inputs for the GSEApy runs of comparison C.
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
# Alternative collection used elsewhere in this notebook: h.c2.cp.v7.2.symbols_mix
# NOTE(review): gset is not passed to run_all explicitly; presumably it is read as a global.
gset = f"{GENESETS}MAD1_selected"

# Empty results template (indexed by "Term") that accumulates all GSEA results.
pre_res = pd.read_csv(f"{GENESETS}pre_res_empy.tsv", index_col="Term", sep="\t")

In [None]:
score = "Sample2_predicted.celltype.l2_CleanAneu.l3"   # column in adata.obs

# L2 cell types for which proband CleanEU is compared to control CleanEU, in run order.
# Deliberately not run (they were commented out in the copy-pasted version):
#   "CD8 TCM", "dnT", "HSPC", "ILC", "NK_Proliferating", "pDC"
# NOTE(review): the skipped label was spelled "NK_Proliferating", while the L2 list
# at the top of the notebook uses "NK Proliferating" - check before re-enabling it.
l2_cell_types = [
    "B intermediate", "B memory", "B naive",
    "CD14 Mono", "CD16 Mono",
    "CD4 TCM", "CD4 TEM",
    "CD8 Naive", "CD8 TEM",
    "cDC2", "gdT", "MAIT",
    "NK", "NK_CD56bright",
    "Platelet", "Treg",
]

# One run_all call per cell type; every call receives the results accumulated so
# far and its return value becomes the new accumulator.
for cell_type in l2_cell_types:
    groupA = f"Proband_{cell_type}_CleanEU"   # label for score in adata.obs
    groupB = f"Control_{cell_type}_CleanEU"   # label for score in adata.obs
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

# NOTE(review): the other comparison sections write to this same file name, so
# each section overwrites the previous one's output - confirm this is intended.
pre_res.to_csv(DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")

## L3

In [None]:
# Gene-set inputs for the L3-level GSEApy runs (uses the h.c2.cp collection).
GENESETS = "/Users/mmm/BioPROJECTS/Public_Data/Signatures & Gene Sets/GSEA Gene Sets/"
# NOTE(review): gset is not passed to run_all explicitly; presumably it is read as a global.
gset = f"{GENESETS}h.c2.cp.v7.2.symbols_mix"

# Empty results template (indexed by "Term") that accumulates all GSEA results.
pre_res = pd.read_csv(f"{GENESETS}pre_res_empy.tsv", index_col="Term", sep="\t")

Cell types without enough data (n < 3): "Plasmablast", "ASDC", "NK Proliferating", "CD4 Naive", "cDC2", "NK_CD56bright", "CD8 Proliferating", "pDC", "cDC1", "B memory", "CD4 Proliferating", "MAIT", "Treg", "CD8 Naive", "HSPC", "ILC"

In [None]:
# Number of cells carrying each combined label in the L3 score column,
# written out so group sizes can be checked before choosing comparisons.
sample_numbers = (
    adata.obs["Sample2_predicted.celltype.l3_CleanAneu.l3"]
    .value_counts()
    .to_frame()
)
sample_numbers.to_csv(DESKTOP + "sample_numbers.tsv", sep="\t")

In [None]:
score = "Sample2_predicted.celltype.l3_CleanAneu.l3"            # column in adata.obs

# L3 cell types for which proband CleanAneu is compared to control CleanEU.
# The order is the original run order and therefore the order in which results
# are handed to run_all for accumulation.
l3_cell_types = [
    "Platelet",
    "B intermediate kappa", "B intermediate lambda",
    "gdT_2", "gdT_4",
    "B memory kappa", "B memory lambda",
    "B naive kappa", "B naive lambda",
    "CD14 Mono", "CD16 Mono",
    "CD4 TCM_1", "CD4 TCM_2", "CD4 TCM_3",
    "CD4 TEM_1",
    "CD8 Naive_2",
    "CD8 TEM_1", "CD8 TEM_2", "CD8 TEM_5", "CD8 TEM_6",
    "cDC2_2",
    "dnT_2",
    "gdT_1", "gdT_3",
    "MAIT",
    "NK Proliferating",
    "NK_2", "NK_4",
    "NK_CD56bright",
    "Treg Memory",
]

# One run_all call per cell type; every call receives the results accumulated so
# far and its return value becomes the new accumulator.
for cell_type in l3_cell_types:
    groupA = f"Proband_{cell_type}_CleanAneu"   # label for score in adata.obs
    groupB = f"Control_{cell_type}_CleanEU"     # label for score in adata.obs
    pre_res = run_all(DESKTOP, database, dataname, score, groupA, groupB, method, pre_res)

# NOTE(review): the other comparison sections write to this same file name, so
# each section overwrites the previous one's output - confirm this is intended.
pre_res.to_csv(DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")

#### --> **REPEAT FOR EACH COMPARISON: BACK to 4.2**  

### 4.3. Run the cell below **after finishing all comparisons in 4.2**

In [None]:
# Persist the accumulated GSEA results as a tab-separated file for the plotting section.
pre_res.to_csv(path_or_buf=DESKTOP + "GSEApy_Combined_results.tsv", sep="\t")

## 5. Pathway Plots <a class="anchor" id="pathways"></a>
[Back to Menu](#menu)  

Select pathways and samples to plot and save to `GSEApy_Combined_results.tsv`

In [None]:
# Reload the combined GSEA results from disk (tab-separated) and preview them.
pathways = pd.read_csv(DESKTOP + "GSEApy_Combined_results.tsv", delimiter="\t")
pathways.head(n=3)

In [None]:
# Wide table of "nes" values: one row per comparison ("Sample"), one column per
# gene set ("Term"). pivot_table averages duplicates of the same Sample/Term pair.
pre_res_nes = pathways.pivot_table(values="nes", index="Sample", columns="Term")
pre_res_nes

In [None]:
def _aneu_vs_control_eu(cell_type):
    """Return the row label for '<cell_type>: proband CleanAneu vs control CleanEU'.

    The format mirrors the labels already used in these lists
    ("ad_<groupA>_vs_<groupB>"). Building it from one cell-type name keeps the
    proband and control halves in sync.
    NOTE(review): confirm the format against the "Sample" column of the results.
    """
    return f"ad_Proband_{cell_type}_CleanAneu_vs_Control_{cell_type}_CleanEU"


# Whole-sample comparisons (no cell-type split).
sample_order_Samples = ["ad_Proband_CleanAneu_vs_Control_CleanEU",
                        "ad_Proband_CleanAneu_vs_Proband_CleanEU",
                        "ad_Proband_CleanEU_vs_Control_CleanEU"]

# Row order for the L1 (coarse) cell types.
sample_order_L1 = [
    _aneu_vs_control_eu(cell_type)
    for cell_type in ["CD4 T", "CD8 T", "NK", "other T", "B", "Mono", "DC", "other"]
]

# Row order for the L2 cell types.
# Fixed: the CD8 TEM entry previously read "CD8 TCEM" on the proband side, which
# matches no comparison and would have produced an all-NaN row after reindexing.
sample_order_L2 = [
    _aneu_vs_control_eu(cell_type)
    for cell_type in ["CD4 TEM", "CD4 TCM", "CD8 TEM", "NK", "dnT", "gdT",
                      "B naive", "B intermediate", "CD14 Mono", "CD16 Mono",
                      "Platelet"]
]

# Row order for the L3 cell types.
# Fixed: CD8 TEM_2/5/6 and gdT_2/3/4 were paired with "Control_..._1", although
# the comparisons run in the L3 section pair each subtype with the same subtype
# in the controls.
sample_order_L3 = [
    _aneu_vs_control_eu(cell_type)
    for cell_type in ["CD4 TCM_1", "CD4 TCM_2", "CD4 TEM_1",
                      "CD8 Naive_2",
                      "CD8 TEM_1", "CD8 TEM_2", "CD8 TEM_5", "CD8 TEM_6",
                      "NK Proliferating", "NK_2", "NK_4", "NK_CD56bright",
                      "Treg Memory", "MAIT", "dnT_2",
                      "gdT_1", "gdT_2", "gdT_3", "gdT_4",
                      "B naive kappa", "B naive lambda",
                      "B intermediate kappa", "B intermediate lambda",
                      "B memory kappa", "B memory lambda",
                      "CD14 Mono", "CD16 Mono",
                      "cDC2_2", "Platelet"]
]

# Choose which of the lists above drives the plot.
sample_order = sample_order_Samples

In [None]:
# Keep only the comparisons named in sample_order, in that order; a comparison
# that is missing from the table becomes an all-NaN row.
pre_res_nes = pre_res_nes.reindex(sample_order, axis="index")

In [None]:
# Prepare a template for the right order of pathways
pre_res_nes.to_csv(DESKTOP + "template.tsv", sep="\t")

In [None]:
# Load the ordered pathway template and show its first rows.
template = pd.read_csv(DESKTOP + "template_pathways_ordered.tsv", delimiter="\t")
print(template.head(3))

# The "Pathways" column gives both the selection and the order of the columns to plot.
pathways_list = template["Pathways"].to_list()

pre_res_nes = pre_res_nes.loc[:, pathways_list]
pre_res_nes

In [None]:
# Use pval or fdr
pre_res_fdr = pathways.pivot_table(index='Sample', columns="Term", values='fdr')
pre_res_fdr = pre_res_fdr.reindex(index=sample_order)
pre_res_fdr = pre_res_fdr[pathways_list]
pre_res_fdr

In [None]:
# Transform to -log
import numpy as np
size = -np.log(pre_res_fdr)
size2 = size.replace(np.inf, 3)

In [None]:
# "scattermaps_mm.py" needs to be in the same folder as this notebook
import scattermaps_mm
from scattermaps_mm import scattermap

In [None]:
# Dot plot of the GSEA results: colour encodes the values of `pre_res_nes`
# (colour scale limited to [-2, 2]) and `size2` is passed as the marker sizes.
# `scattermap` comes from the local scattermaps_mm.py helper module.
plt.figure(figsize=(20, 10))
ax = scattermap(
    pre_res_nes,
    cmap="coolwarm",
    marker_size=size2,
    factor=40,
    vmin=-2,
    vmax=2,
    square=True,
)
plt.tight_layout()
plt.savefig(DESKTOP + "GSEApy_dotPlot.png")

## Most specific genes in the proband

In [None]:
# Gene panels used by the dot plots in this section.
# NOTE(review): judging by the names, these are the top proband-specific genes versus the
# controls and versus all other samples, respectively -- confirm the source of the lists.
proband_genes_controls = [
    "MTRNR2L8", "MT-ATP6", "MTRNR2L12", "MT-ND4", "MT-CO2", "MT-CYB",
    "MT-CO3", "MT-CO1", "MT-ND1", "HLA-DRB5", "MT-ND3", "MTRNR2L1",
    "MT-ND5", "HLA-B", "RPL17", "CCL5", "IRF1", "ZEB2",
    "LINC-PINT", "MTRNR2L10", "MT-ND2", "NEAT1", "AC020916.1", "REL",
]

proband_genes_others = [
    "MTRNR2L8", "MTRNR2L12", "MT-ATP6", "MT-CO2", "MT-ND4", "MT-CYB",
    "MT-ND1", "MT-ND3", "MT-CO1", "MT-CO3", "HLA-DRB5", "CCL5",
    "IRF1", "HLA-B", "MTRNR2L10", "HSPA5", "DDX3X", "LINC-PINT",
    "MT2A", "GZMH", "TRDC",
]

# Hand-picked subset grouped by theme; passing a dict to sc.pl.dotplot labels each gene group.
proband_vs_controls_selected = {
    "Humanins": ["MTRNR2L8", "MTRNR2L12", "MTRNR2L10"],
    "Mitochondrial genes": [
        "MT-ATP6", "MT-CO2", "MT-ND4", "MT-CYB", "MT-ND1", "MT-ND3", "MT-CO1",
    ],
    "Antigen presentation": ["HLA-DRB5", "HLA-DQA2"],
    "Interferon pathway": ["IRF1", "MT2A"],
    "NFkB pathway": ["CCL5", "REL"],
    "lncRNAs": ["LINC-PINT", "NEAT1"],
}


In [None]:
# Both full gene lists, one dot plot each, grouped by sample.
# standard_scale="var" rescales every gene to the 0-1 range across the groups.
for _gene_panel in (proband_genes_controls, proband_genes_others):
    sc.pl.dotplot(adata, _gene_panel, groupby="Sample", standard_scale="var")

# Curated, themed panel; standard_scale is left unset here, so values are not rescaled.
sc.pl.dotplot(adata, proband_vs_controls_selected, groupby="Sample")

In [None]:
# Proband cells only, grouped by level-1 cell type.
sc.pl.dotplot(
    ad_proband,
    proband_genes_others,
    groupby="predicted.celltype.l1",
    standard_scale="var",
)

# Curated panel, groups shown in the order of the `L1` list; standard_scale left unset.
sc.pl.dotplot(
    ad_proband,
    proband_vs_controls_selected,
    groupby="predicted.celltype.l1",
    categories_order=L1,
)


In [None]:
# Level-2 cell types to exclude from the proband subset before plotting by L2.
# None of these labels appear in the `L2` ordering list defined at the top of the notebook.
cells_to_remove = [
    'CD8 Proliferating',
    'cDC1',
    'Plasmablast',
    'CD4 Proliferating',
    'CD4 Naive',
]
ad_proband_clean = ad_proband[
    ~ad_proband.obs["predicted.celltype.l2"].isin(cells_to_remove)
]

In [None]:
# Sanity check: list the level-2 labels that remain after removing `cells_to_remove`.
ad_proband_clean.obs["predicted.celltype.l2"].unique()

In [None]:
# Curated panel in the filtered proband cells, grouped by level-2 cell type in `L2` order.
# `save` writes the figure into sc.settings.figdir (scanpy prefixes the name with "dotplot_").
sc.pl.dotplot(
    ad_proband_clean,
    proband_vs_controls_selected,
    groupby="predicted.celltype.l2",
    categories_order=L2,
    standard_scale="var",
    save="proband_selected_genes_L2.png",
)

## Most specific genes in Aneuploid cells

In [None]:
# Keep only the cells whose "Sample2_CleanAneu.l3" label is one of these three groups.
samples_to_keep = [
    "Proband_CleanAneu",
    "Proband_CleanEU",
    "Control_CleanEU",
]
ad_Samples2_CleanAneu_L3 = adata[
    adata.obs["Sample2_CleanAneu.l3"].isin(samples_to_keep)
]

In [None]:
# Curated panel across the three retained groups. `samples_to_keep` already lists them
# in the desired display order, so it doubles as the category order.
# standard_scale is left unset, so values are not rescaled per gene.
sc.pl.dotplot(
    ad_Samples2_CleanAneu_L3,
    proband_vs_controls_selected,
    groupby="Sample2_CleanAneu.l3",
    categories_order=samples_to_keep,
)

In [None]:
# Cells labelled "Proband_CleanAneu" only: curated panel by level-1 cell type, in `L1` order.
# standard_scale is left unset.
ad_Samples2_CleanAneu_L3_Proband_CleanAneu = adata[
    adata.obs["Sample2_CleanAneu.l3"] == "Proband_CleanAneu"
]
sc.pl.dotplot(
    ad_Samples2_CleanAneu_L3_Proband_CleanAneu,
    proband_vs_controls_selected,
    groupby="predicted.celltype.l1",
    categories_order=L1,
)

In [None]:
# Cells labelled "Proband_CleanEU" only: curated panel by level-1 cell type, in `L1` order.
ad_Samples2_CleanAneu_L3_Proband_CleanEU = adata[
    adata.obs["Sample2_CleanAneu.l3"] == "Proband_CleanEU"
]
sc.pl.dotplot(
    ad_Samples2_CleanAneu_L3_Proband_CleanEU,
    proband_vs_controls_selected,
    groupby="predicted.celltype.l1",
    categories_order=L1,
)

In [None]:
# Cells labelled "Control_CleanEU" only: curated panel by level-1 cell type, in `L1` order.
ad_Samples2_CleanAneu_L3_Control_CleanEU = adata[
    adata.obs["Sample2_CleanAneu.l3"] == "Control_CleanEU"
]
sc.pl.dotplot(
    ad_Samples2_CleanAneu_L3_Control_CleanEU,
    proband_vs_controls_selected,
    groupby="predicted.celltype.l1",
    categories_order=L1,
)

## 6. Tumor signatures <a class="anchor" id="tumor"></a>
[Back to Menu](#menu) 

**CLL 44-gene signature** (chronic lymphocytic leukemia).  
Cornet et al., PLoS One: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0128990

In [None]:
# Gene symbols of the CLL signature referenced above.
# NOTE(review): the list holds 47 symbols although it is called the 44-gene signature --
# confirm against the publication whether some entries are aliases or expansions.
CLL_44 = [
    "ABCA6", "ADAM29", "AICDA", "BIK", "BMI1", "BUB1B",
    "CD200", "CD24", "TNFRSF17", "CD27", "CD5", "CHIT1",
    "CLLU1", "CNR1", "COL9A2", "CTLA4", "CXCR3", "DMD",
    "DNMBP", "FCMR", "FCER2", "FGF2", "FGFR1", "FILIP1L",
    "FLT3", "FMOD", "GNRH1", "IGFBP4", "IGHG1", "IGHG2",
    "IGHG3", "IGHG4", "IGSF3", "IL2RA", "IGKC", "IGLC7",
    "LEF1", "LILRA4", "LPL", "RAPGEF3", "RASGRF1", "ROR1",
    "SELP", "SEPTIN10", "SFMBT1", "TTN", "WNT3",
]

# 13-gene subset; every symbol below also appears in CLL_44.
CLL_13 = [
    "BMI1", "CD200", "CD27", "CD5", "COL9A2", "DNMBP", "FCMR",
    "GNRH1", "LEF1", "RASGRF1", "ROR1", "SFMBT1", "TTN",
]

In [None]:
# Score every cell for the CLL_44 gene set; the result is stored in adata.obs (copy=False).
# NOTE(review): the obs key is spelled "CCL_44_signature" ("CCL", not "CLL"). The spelling
# is kept because the plotting cells below look the score up under exactly this name.
sc.tl.score_genes(
    adata,
    CLL_44,
    ctrl_size=50,
    gene_pool=None,
    n_bins=25,
    score_name='CCL_44_signature',
    random_state=0,
    copy=False,
    use_raw=None,
)

In [None]:
# Score every cell for the CLL_13 gene set; the result is stored in adata.obs (copy=False).
# NOTE(review): the obs key is spelled "CCL_13_signature" ("CCL", not "CLL"). The spelling
# is kept because the plotting cells below look the score up under exactly this name.
sc.tl.score_genes(
    adata,
    CLL_13,
    ctrl_size=50,
    gene_pool=None,
    n_bins=25,
    score_name='CCL_13_signature',
    random_state=0,
    copy=False,
    use_raw=None,
)

In [None]:
# Distribution of the CLL_44 score per "Sample2" group.
sc.pl.violin(
    adata,
    keys=["CCL_44_signature"],
    groupby="Sample2",
)

In [None]:
# Same score as a dot plot per "Sample2" group (the key is an obs column, not a gene).
sc.pl.dotplot(
    adata,
    var_names=["CCL_44_signature"],
    groupby="Sample2",
)

In [None]:
# Re-create the B-cell subset (same filter as in the "Lists and Subsets" cell).
# NOTE(review): presumably repeated here so that the subset includes the signature
# score columns that were just added to adata.obs -- confirm.
ad_B_cells = adata[adata.obs["predicted.celltype.l1"]=="B"]

In [None]:
# CLL_44 score in B cells, per sample.
sc.pl.dotplot(
    ad_B_cells,
    var_names=["CCL_44_signature"],
    groupby="Sample",
)

In [None]:
# CLL_13 score in B cells, per sample.
sc.pl.dotplot(
    ad_B_cells,
    var_names=["CCL_13_signature"],
    groupby="Sample",
)

In [None]:
# Both signature scores in B cells, split by the "Sample2_predicted.celltype.l1_G12" label;
# `order` fixes the left-to-right order of the groups on the x axis.
sc.pl.violin(
    ad_B_cells,
    ["CCL_13_signature", "CCL_44_signature"],
    groupby="Sample2_predicted.celltype.l1_G12",
    order=[
        "Control_B_CleanEU",
        "Proband_B_CleanEU",
        "Proband_B_CleanAneu",
        "Proband_B_CleanAneuG12",
    ],
    rotation=90,
)

In [None]:
# Same two scores as a dot plot; no categories_order is given, so the groups appear
# in the default order of the grouping column.
sc.pl.dotplot(
    ad_B_cells,
    ["CCL_13_signature", "CCL_44_signature"],
    groupby="Sample2_predicted.celltype.l1_G12",
)

In [None]:
# Both signature scores in B cells, per sample, with jittered points and rotated tick labels.
sc.pl.violin(
    ad_B_cells,
    ["CCL_44_signature", "CCL_13_signature"],
    groupby="Sample",
    jitter=0.25,
    rotation=30,
)

In [None]:
# Individual CLL_44 genes in B cells, per sample; each gene rescaled to 0-1 across samples.
sc.pl.dotplot(
    ad_B_cells,
    var_names=CLL_44,
    groupby="Sample",
    standard_scale="var",
)

In [None]:
# CLL_44 without IGKC, in the same order. Derived from CLL_44 instead of being retyped,
# so the two lists cannot drift apart if the signature is edited.
# NOTE(review): IGKC is presumably excluded because it dominates the B-cell dot plot -- confirm.
CLL_44_noIGKC = [gene for gene in CLL_44 if gene != "IGKC"]

In [None]:
# CLL_44 genes except IGKC in B cells, per sample; each gene rescaled to 0-1 across samples.
sc.pl.dotplot(
    ad_B_cells,
    var_names=CLL_44_noIGKC,
    groupby="Sample",
    standard_scale="var",
)

In [None]:
# Individual CLL_13 genes in B cells, per sample; each gene rescaled to 0-1 across samples.
sc.pl.dotplot(
    ad_B_cells,
    var_names=CLL_13,
    groupby="Sample",
    standard_scale="var",
)

In [None]:
# Persist adata, including the signature scores added to adata.obs above.
# WARNING: this is the same path the notebook loads adata from at the top, so the
# input file is overwritten in place; write to a new file name if the original must be kept.
adata.write(DATA + "220207_MAD1_scRNAseq.h5ad")

### ALL (acute lymphoblastic leukemia)
- Ferrando et al.: https://www.sciencedirect.com/science/article/pii/S1535610802000181
- https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1323166/

In [None]:
# Genes coloured on the UMAP in the next cell.
# NOTE(review): the name suggests a 7-gene set, but only three genes are listed -- confirm.
ALL_7 = [
    "CD44",
    "TBXAS1",
    "GRAP2",
]

In [None]:
# One UMAP panel per gene in ALL_7, coloured by that gene's expression.
sc.pl.umap(adata, color=ALL_7)

In [None]:
# Oncogene panel plotted per sample, each gene rescaled to 0-1 across samples.
# "HOXA" names a gene cluster rather than a single gene symbol, and any of these symbols
# may be absent from this dataset. scanpy raises a KeyError for keys it cannot find,
# so only the symbols that are present are plotted and the rest are reported.
tall_genes = ["TAL1", "LMO2", "TLX1", "TLX3", "HOXA"]

# dotplot reads from adata.raw when it exists (use_raw=None), so look the symbols up there.
known_genes = adata.raw.var_names if adata.raw is not None else adata.var_names
tall_genes_found = [gene for gene in tall_genes if gene in known_genes]
tall_genes_absent = [gene for gene in tall_genes if gene not in known_genes]

if tall_genes_absent:
    print(f"Skipping symbols not present in the dataset: {tall_genes_absent}")
if tall_genes_found:
    sc.pl.dotplot(adata, tall_genes_found, groupby="Sample", standard_scale="var")

In [None]:
# Marker genes plotted per sample in the next cell.
# NOTE(review): presumably markers of the gdT_4 population -- confirm.
# A previous version of this list also contained 'CST7' and 'CMC1'; that assignment was
# overwritten immediately by this one and never used, so it has been removed.
gdt4 = ['TRDC', 'TIGIT', 'KLRC2', 'TRGC2', 'IKZF2', 'GCSAM', 'FCRL6', 'TRDV1']

In [None]:
# gdt4 genes per sample; each gene rescaled to 0-1 across samples.
sc.pl.dotplot(
    adata,
    var_names=gdt4,
    groupby="Sample",
    standard_scale="var",
)

In [None]:
# Proband cells: number of cells per "Aneu.l3" class within each level-3 cell type.
proband = adata[adata.obs["Sample"] == "Proband"]
(
    proband.obs
    .groupby("predicted.celltype.l3")["Aneu.l3"]
    .value_counts()
)

In [None]:
# Control cells: number of cells per "Aneu.l3" class within each level-3 cell type.
controls = adata[adata.obs["Sample2"] == "Control"]
(
    controls.obs
    .groupby("predicted.celltype.l3")["Aneu.l3"]
    .value_counts()
)