# InferCNVpy on MAD1 PBMC-scRNAseq data
### Azimuth L3

@mmm, 
August 21, 2021  
New analysis using chromosome location and arm location from Ensembl GRCh38.104  
November 25, 2021  
November 26, 2021 - Final data for diff. exp. analysis

January 9, 2022 ----> Use all other samples as controls

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import infercnvpy as cnv

In [None]:
# Notebook display setup: request retina-resolution inline figures, set scanpy's
# default figure size and print version information.
# NOTE(review): `IPython.display.set_matplotlib_formats` is reported as deprecated in
# recent IPython releases — confirm against the installed version.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('retina')
sc.settings.set_figure_params(figsize=(4, 4))
sc.logging.print_header()
print("InferCNV version: ", cnv.__version__)

In [None]:
# Directories
# DATA holds the input files read below; DESKTOP receives the exported tables.
# Both are absolute, machine-specific paths and end with "/" because file names
# are appended to them by plain string concatenation.
DATA = "/Users/mmm/BioPROJECTS/MAD1 & MVA/scRNAseq PBMCs/data/"
DESKTOP = "/Users/mmm/Desktop/"

# Read data with chr positions

In [None]:
# Load the annotated expression object; `adata.var.chromosome` is inspected further
# down to decide which chromosomes to exclude.
adata = sc.read_h5ad(DATA + "211028_MAD1_scRNAseq.h5ad")

In [None]:
# Per-cell metadata table, indexed by its "Cell" column.
cmd = pd.read_csv(
    DATA + "211126_cell_metadata_initial.tsv",
    sep="\t",
    index_col="Cell",
)
cmd.head(3)

In [None]:
# Attach the metadata table as the per-cell annotation of `adata`.
# The assignment itself does not check that the rows are in the same order as the
# cells, so when the table holds exactly the same cell labels in a different order
# it is reordered first; otherwise cells would silently receive the annotations of
# other cells. In every other situation the original direct assignment is kept.
if (
    cmd.index.is_unique
    and not cmd.index.equals(adata.obs_names)
    and set(cmd.index) == set(adata.obs_names)
):
    cmd = cmd.loc[adata.obs_names]
adata.obs = cmd
adata

# Running inferCNV

Use `window=251` and data scaled on controls, based on previous analyses (211125). Run inferCNV using 
"predicted.celltype.l1", "predicted.celltype.l2" and "predicted.celltype.l3" (from Azimuth analysis: https://azimuth.hubmapconsortium.org/) 

In [None]:
# Select Window size and column for classification of cells
# `window` is passed to infercnv as `window_size`; `selected_column` names the
# adata.obs column whose labels define the groups of cells analysed separately.
window = 251
selected_column = "predicted.celltype.l3"

In [None]:
# Distinct labels of the selected annotation column.
# They are sorted so that the order in which cell types are processed (and which
# one ends up first) is reproducible between runs; the iteration order of a set of
# strings is not. Missing labels are dropped because they cannot be matched with
# `==` when the cells of a group are selected.
groups = sorted(adata.obs[selected_column].dropna().unique())
groups

In [None]:
# Remove cell types with few cells
excluded_celltypes = {"ASDC_mDC", "CD8 TEM_4", "NK_1", "CD4 Proliferating"}
groups_clean = [label for label in groups if label not in excluded_celltypes]
groups_clean

In [None]:
# Check names of chromosomes and chromosomes to exclude from the analysis
# (displays the distinct values of the `chromosome` column of adata.var).
adata.var.chromosome.unique()

In [None]:
# Chromosome labels handed to infercnv as `exclude_chromosomes`: missing values,
# the sex chromosomes, "MT" and the scaffold "GL000194.1".
chr_to_exclude = [np.nan, 'chrX', 'chrY', 'MT', 'GL000194.1']

In [None]:
# dictionary of dataframes for keeping the inferCNV data
# inferCNV is run separately on the cells of every retained cell type and the
# resulting per-cell CNV matrix is stored under the cell-type name.
df_dic = {}
for cell in groups_clean:
    # Work on an independent copy: the boolean subset is only a view of `adata`,
    # and the infercnv results are written onto the object it receives.
    data = adata[adata.obs[selected_column] == cell].copy()
    print(cell, data.shape)
    # NOTE(review): Father and Mother are part of the reference here, while the
    # "controls" used for plots and scaling later are Control1/Control2 only —
    # confirm this is intended.
    cnv.tl.infercnv(data, reference_key="Sample", reference_cat=["Control1", "Control2", "Father", "Mother"],
                    exclude_chromosomes=chr_to_exclude, window_size=window, step=1)
    cnv.pl.chromosome_heatmap(data, groupby="Sample")
    # The CNV matrix is kept sparse; its rows are labelled with the cell identifiers.
    df = pd.DataFrame.sparse.from_spmatrix(data.obsm['X_cnv'], index=data.obs.index)
    df_dic[cell] = df

In [None]:
# Concatenate all celltypes in a single dataframe
# Every per-celltype table gets a "celltype" label column (added to a copy, so the
# tables kept in df_dic stay untouched) and all tables are stacked in one call
# instead of re-concatenating the growing result once per cell type.
df = pd.concat([df_dic[name].assign(celltype=name) for name in groups_clean])
print(df.shape[0], "cells in total")
df.head(3)

## Calculate average inferCNV values per chromosome

In [None]:
# Total number of missing values in the combined table (sum over the columns of the
# per-column counts); expected result: no null values.
df.isna().sum(axis=0).sum()

In [None]:
# Number of genes read per chromosome
# (frequency of every value in the `chromosome` column of adata.var).
adata.var.chromosome.value_counts()

In [None]:
# number of genes per chromosome
dic_chr = {"chr1": 1902, "chr2": 1244, "chr3": 1086, "chr4": 669, "chr5": 830, "chr6": 976,
           "chr7": 910, "chr8": 604, "chr9": 708, "chr10": 711, "chr11": 1035, "chr12": 965,
           "chr13": 360, "chr14": 693, "chr15": 572, "chr16": 815, "chr17": 1081, "chr18": 275, "chr19": 1322,
           "chr20": 482, "chr21": 210, "chr22": 480}

# gene index per chromosome
# Each value is [start, end] with `start` inclusive and `end` exclusive, i.e. the
# form expected by positional slicing (`iloc[:, start:end]`). The bounds are derived
# from the gene counts above so that consecutive chromosomes are contiguous: a
# chromosome starts exactly where the previous one ends. (A hand-written table with
# starts at "previous end + 1" drops the first gene of every chromosome after chr1
# when used with half-open slices.)
dic_genes_chr = {}
_next_start = 0
for _chrom, _n_genes in dic_chr.items():
    dic_genes_chr[_chrom] = [_next_start, _next_start + _n_genes]
    _next_start += _n_genes


In [None]:
# Calculate the mean CNV score values per chromosome per cell
# The means are computed from the positional column range of each chromosome and
# then appended to `df` as one new column per chromosome.
chromosome_means = {
    chrom: df.iloc[:, bounds[0]: bounds[1]].mean(axis=1)
    for chrom, bounds in dic_genes_chr.items()
}
for chrom, per_cell_mean in chromosome_means.items():
    df[chrom] = per_cell_mean

# Keep only the mean data per chromosome
# ("celltype" and the chromosome columns are the last columns of `df`).
df_means = df.loc[:, "celltype":"chr22"].copy()
print(df_means.shape)
df_means.head(3)

In [None]:
# Plot averages
# One row per cell, one column per chromosome; the colour scale is clipped at ±0.05.
df_cells = df_means[["celltype"]].copy()
df_temp = df_means.loc[:, "chr1":"chr22"].copy()
fig, ax = plt.subplots(figsize=(8, 4))
g = sns.heatmap(df_temp, cmap="bwr", vmin=-0.05, vmax=0.05, ax=ax)
g.set_yticklabels("")
g.set_title("All Cells")
plt.show()

In [None]:
# Plot averages in a clustermap
df_temp = df_means.loc[:, "chr1":"chr22"].copy()
# clustermap builds its own figure, so the size is passed to it directly; a
# preceding plt.figure() would only leave an extra blank figure behind.
g = sns.clustermap(df_temp, cmap="bwr", vmin=-0.05, vmax=0.05, figsize=(8, 4))
# The object returned by clustermap is a grid, not an Axes, and has no set_title();
# the title is therefore set on the figure that holds the grid.
g.fig.suptitle("All Cells")
plt.show()

In [None]:
# Plot averages in a clustermap grouping per celltypes
# Build one colour per cell type for the row annotation. The palette is sized to
# the number of cell types: zipping the labels with a fixed three-letter colour
# string would stop after three labels and leave every other cell type unmapped.
df_temp = df_means.copy()
celltypes = df_temp.pop("celltype")
unique_celltypes = celltypes.unique()
lut = dict(zip(unique_celltypes, sns.color_palette("husl", len(unique_celltypes))))
row_colors = celltypes.map(lut)

In [None]:
# Clustermap of the per-chromosome means with the cell-type colour of every row.
# The figure size goes to clustermap itself (it creates its own figure), and the
# title is set on that figure because the returned grid has no set_title().
g = sns.clustermap(df_temp, cmap="bwr", vmin=-0.05, vmax=0.05, row_colors=row_colors,
                   figsize=(8, 4))
g.fig.suptitle("All Cells")
plt.show()

In [None]:
# One clustermap per cell type: rows (cells) are clustered, chromosomes keep their
# column order, and neither row labels nor a colour bar are drawn.
for celltype in groups_clean:
    print(celltype)
    in_celltype = df_means["celltype"] == celltype
    df_temp = df_means.loc[in_celltype].drop(columns="celltype")
    g = sns.clustermap(
        df_temp,
        cmap="bwr",
        vmin=-0.05,
        vmax=0.05,
        figsize=(8, 4),
        col_cluster=False,
        yticklabels=False,
        cbar_pos=None,
    )
    plt.show()

In [None]:
# Add Sample data
# The "Sample" column of adata.obs is joined to the means through the shared index;
# it becomes the first column of the result.
sample_column = adata.obs[["Sample"]]
df_means = pd.merge(sample_column, df_means, left_index=True, right_index=True)
df_means.head(3)

In [None]:
# Remove cell types with few cells
# (same exclusions as before, plus "Plasmablast").
excluded_celltypes2 = {"ASDC_mDC", "CD8 TEM_4", "NK_1", "CD4 Proliferating", "Plasmablast"}
groups_clean2 = [label for label in groups if label not in excluded_celltypes2]
groups_clean2

In [None]:
# For every cell type draw two clustermaps of the per-chromosome means: first the
# cells of the two control samples, then the cells of the proband.
for celltype in groups_clean2:
    print(celltype)
    df_temp = df_means[df_means["celltype"] == celltype]
    df_temp_controls = df_temp[df_temp["Sample"].isin(["Control1", "Control2"])]
    df_temp_proband = df_temp[df_temp["Sample"] == "Proband"]

    for label, subset in (("Controls", df_temp_controls), ("Proband", df_temp_proband)):
        print(label)
        df_plot = subset.loc[:, "chr1":"chr22"].copy()
        g = sns.clustermap(
            df_plot,
            cmap="bwr",
            vmin=-0.05,
            vmax=0.05,
            figsize=(8, 4),
            col_cluster=False,
            yticklabels=False,
            cbar_pos=None,
        )
        plt.show()

## Scale means per chromosome using controls as references

In [None]:
# Preview of the table that is scaled below (first three rows).
df_means.head(3)

In [None]:
# Split the table into its label columns and its per-chromosome values, once for
# all cells and once for the cells of the two control samples.
chromosome_columns = slice("chr1", "chr22")
df_means_sample_celltypes = df_means[["Sample", "celltype"]].copy()
df_temp = df_means.loc[:, chromosome_columns].copy()

is_control = df_means["Sample"].isin(["Control1", "Control2"])
df_means_controls = df_means[is_control]
df_temp_controls = df_means_controls.loc[:, chromosome_columns].copy()

In [None]:
# Scale relative to the controls, keeping 0 fixed
def zero_centered_min_max_scaling_controls(df_temp, df_temp_controls):
    """
    Divide every column of `df_temp` by the largest absolute value that the same
    column reaches in `df_temp_controls`.

    Signs are preserved and 0 stays 0. Rows of the controls therefore end up within
    [-1, 1]; rows that are not part of the controls may fall outside that range.

    Parameters: `df_temp` is the table to scale; `df_temp_controls` is the reference
    table and must contain every column of `df_temp` (KeyError otherwise).
    Returns a new DataFrame; `df_temp` itself is not modified.
    """
    reference_peak = df_temp_controls[list(df_temp.columns)].abs().max()
    return df_temp.div(reference_peak, axis="columns")

In [None]:
# Scale the per-chromosome means on the controls, then put the "Sample" and
# "celltype" labels back in front of the scaled values (joined on the index).
scaled_values = zero_centered_min_max_scaling_controls(df_temp, df_temp_controls)
df_means_scaled_controls = pd.merge(
    df_means_sample_celltypes,
    scaled_values,
    left_index=True,
    right_index=True,
)
df_means_scaled_controls.head(3)

In [None]:
# Plot scaled averages
# All cells, one column per chromosome; the colour range is left to seaborn.
df_temp = df_means_scaled_controls.loc[:, "chr1":"chr22"].copy()
fig, ax = plt.subplots(figsize=(8, 4))
g = sns.heatmap(df_temp, cmap="bwr", ax=ax)
g.set_yticklabels("")
g.set_title("All Celltypes")
plt.show()

In [None]:
# Heatmap of the scaled means for the "B intermediate kappa" cells (colour scale ±1).
is_selected = df_means_scaled_controls["celltype"] == "B intermediate kappa"
df_temp = df_means_scaled_controls[is_selected]
print(df_temp.shape)
df_temp = df_temp.loc[:, "chr1":"chr22"].copy()
fig, ax = plt.subplots(figsize=(8, 4))
g = sns.heatmap(df_temp, cmap="bwr", vmin=-1, vmax=1, ax=ax)
g.set_yticklabels("")
g.set_title("B intermediate kappa")
plt.show()

In [None]:
# Heatmap of the scaled means for the "gdT_2" cells (colour scale ±1).
is_selected = df_means_scaled_controls["celltype"] == "gdT_2"
df_temp = df_means_scaled_controls[is_selected]
print(df_temp.shape)
df_temp = df_temp.loc[:, "chr1":"chr22"].copy()
fig, ax = plt.subplots(figsize=(8, 4))
g = sns.heatmap(df_temp, cmap="bwr", vmin=-1, vmax=1, ax=ax)
g.set_yticklabels("")
g.set_title("gdT_2")
plt.show()

In [None]:
# Same per-celltype comparison as for the raw means, now on the scaled values:
# first the two control samples, then the proband (colour scale ±1.2).
for celltype in groups_clean2:
    print(celltype)
    df_temp = df_means_scaled_controls[df_means_scaled_controls["celltype"] == celltype]
    df_temp_controls = df_temp[df_temp["Sample"].isin(["Control1", "Control2"])]
    df_temp_proband = df_temp[df_temp["Sample"] == "Proband"]

    for label, subset in (("Controls", df_temp_controls), ("Proband", df_temp_proband)):
        print(label)
        df_plot = subset.loc[:, "chr1":"chr22"].copy()
        g = sns.clustermap(
            df_plot,
            cmap="bwr",
            vmin=-1.2,
            vmax=1.2,
            figsize=(8, 3),
            col_cluster=False,
            yticklabels=False,
            cbar_pos=None,
        )
        plt.show()

In [None]:
### Save df_means to avoid re-running inferCNV
# Both tables are written as tab-separated files, with their index (to_csv default)
# as the first column.
df_means.to_csv(DESKTOP + "220109_L3_df_means_251_allControls.tsv", sep="\t")
df_means_scaled_controls.to_csv(DESKTOP + "220109_L3_df_means_251_allControls_scaled_controls.tsv", sep="\t")

## Plots in specific cell types

In [None]:
# Scaled means of the "B intermediate lambda" cells, split into the two control
# samples and the proband.
is_b_lambda = df_means_scaled_controls["celltype"] == "B intermediate lambda"
df_B = df_means_scaled_controls.loc[is_b_lambda]
df_B_controls = df_B.loc[df_B["Sample"].isin(["Control1", "Control2"])]
df_B_proband = df_B.loc[df_B["Sample"] == "Proband"]

In [None]:
# Histograms of the scaled chr12 mean: controls first, proband drawn on the same axes.
fig, ax = plt.subplots(figsize=(6, 3))
for subset in (df_B_controls, df_B_proband):
    subset["chr12"].plot.hist(bins=300, ylim=(0, 5), xlim=(-1.5, 2), ax=ax)
ax.set_title("Chr12")
plt.show()

In [None]:
# Density estimates of the scaled chr12 mean: controls first, proband on the same axes.
fig, ax = plt.subplots(figsize=(6, 3))
for subset in (df_B_controls, df_B_proband):
    subset["chr12"].plot.kde(ylim=(0, 3), xlim=(-1.5, 2), ax=ax)
ax.set_title("Chr12")
plt.show()

In [None]:
# One figure per autosome: density of the scaled mean in controls and in the proband.
for chrom in ("chr" + str(n) for n in range(1, 23)):
    fig, ax = plt.subplots(figsize=(6, 3))
    for subset in (df_B_controls, df_B_proband):
        subset[chrom].plot.kde(ylim=(0, 5), xlim=(-1.5, 1.5), ax=ax)
    ax.set_title("B lymphocytes " + chrom)
    plt.show()

In [None]:
# Densities of all 22 autosomes overlaid in one figure (control cells).
fig, ax = plt.subplots(figsize=(6, 3))
for chrom in ("chr" + str(n) for n in range(1, 23)):
    df_B_controls[chrom].plot.kde(ylim=(0, 6), xlim=(-1.5, 1.5), ax=ax)
ax.set_title("B intermediate lambda - Controls")
plt.show()

In [None]:
# Densities of all 22 autosomes overlaid in one figure (proband cells).
fig, ax = plt.subplots(figsize=(6, 3))
for chrom in ("chr" + str(n) for n in range(1, 23)):
    df_B_proband[chrom].plot.kde(ylim=(0, 6), xlim=(-1.5, 1.5), ax=ax)
ax.set_title("B intermediate lambda - Proband")
plt.show()

In [None]:
# Densities of the raw (non-scaled) means of all cells, one curve per autosome.
fig, ax = plt.subplots(figsize=(6, 3))
for chrom in ("chr" + str(n) for n in range(1, 23)):
    df_means[chrom].plot.kde(ylim=(0, 30), xlim=(-0.1, 0.1), ax=ax)
ax.set_title("All celltypes")
plt.show()

In [None]:
# Densities of the scaled means of all cells, one curve per autosome.
fig, ax = plt.subplots(figsize=(6, 3))
for chrom in ("chr" + str(n) for n in range(1, 23)):
    df_means_scaled_controls[chrom].plot.kde(ylim=(0, 8), xlim=(-1, 1), ax=ax)
ax.set_title("All celltypes")
plt.show()

## Plots in other T

In [None]:
# Scaled means of the "gdT_4" cells, split into the two control samples and the proband.
is_gdt4 = df_means_scaled_controls["celltype"] == "gdT_4"
df_otherT = df_means_scaled_controls.loc[is_gdt4]
df_otherT_controls = df_otherT.loc[df_otherT["Sample"].isin(["Control1", "Control2"])]
df_otherT_proband = df_otherT.loc[df_otherT["Sample"] == "Proband"]
df_otherT.head(3)

In [None]:
# Density of the scaled chr12 mean in gdT_4 cells: controls, then proband.
fig, ax = plt.subplots(figsize=(6, 3))
for subset in (df_otherT_controls, df_otherT_proband):
    subset["chr12"].plot.kde(ylim=(0, 3), xlim=(-1.5, 1.5), ax=ax)
ax.set_title("Chr12")
plt.show()

In [None]:
# Density of the scaled chr18 mean in gdT_4 cells: controls, then proband.
fig, ax = plt.subplots(figsize=(6, 3))
for subset in (df_otherT_controls, df_otherT_proband):
    subset["chr18"].plot.kde(ylim=(0, 3), xlim=(-1.5, 1.5), ax=ax)
ax.set_title("Chr18")
plt.show()

## Classify Aneuploid cells based on data Scaled_on_Controls

In [None]:
# Flag, per cell and chromosome, a gain (scaled mean above +threshold) or a loss
# (scaled mean below -threshold) with "Y"/"N".
threshold = 0.5

chr_list = ["chr" + str(n) for n in range(1, 23)]

# The new columns are appended in alternating order:
# chr1_gain, chr1_loss, chr2_gain, chr2_loss, ...
for chrom in chr_list:
    scaled_mean = df_means_scaled_controls[chrom]
    df_means_scaled_controls[chrom + "_gain"] = np.where(scaled_mean > threshold, "Y", "N")
    df_means_scaled_controls[chrom + "_loss"] = np.where(scaled_mean < -threshold, "Y", "N")

df_means_scaled_controls.head(3)

In [None]:
# Per-cell summary flags:
#   "Gain" = "Y" if any chromosome is flagged as gained,
#   "Loss" = "Y" if any chromosome is flagged as lost,
#   "Aneu" = "Aneuploid" if the cell has a gain or a loss, otherwise "Euploid".
# The per-chromosome columns are selected by name pattern, not by a label slice:
# the *_gain and *_loss columns were created in alternating order, so a slice from
# "chr1_gain" to "chr22_gain" would also contain the *_loss columns lying between
# them (and the other way round), making "Gain" and "Loss" identical in practice.
gain_flags = df_means_scaled_controls.filter(regex=r"^chr\d+_gain$")
loss_flags = df_means_scaled_controls.filter(regex=r"^chr\d+_loss$")
df_means_scaled_controls["Gain"] = np.where(gain_flags.eq("Y").any(axis=1), "Y", "N")
df_means_scaled_controls["Loss"] = np.where(loss_flags.eq("Y").any(axis=1), "Y", "N")
df_means_scaled_controls["Aneu"] = np.where(
    df_means_scaled_controls[["Gain", "Loss"]].eq("Y").any(axis=1), "Aneuploid", "Euploid")
df_means_scaled_controls

### Classify CleanAneuploid vs CleanEU cells based on scaled data

In [None]:
# Stricter per-chromosome flags on the scaled means:
#   *_CleanGain: value above +thresholdAneu
#   *_CleanLoss: value below -thresholdAneu
#   *_CleanEU:   value strictly between -thresholdEU and +thresholdEU
thresholdEU = 0.25
thresholdAneu = 0.65

chr_list = ["chr" + str(n) for n in range(1, 23)]

# Three separate passes keep the columns of each kind next to each other.
for chrom in chr_list:
    df_means_scaled_controls[chrom + "_CleanGain"] = np.where(
        df_means_scaled_controls[chrom] > thresholdAneu, "Y", "N")

for chrom in chr_list:
    df_means_scaled_controls[chrom + "_CleanLoss"] = np.where(
        df_means_scaled_controls[chrom] < -thresholdAneu, "Y", "N")

for chrom in chr_list:
    scaled_mean = df_means_scaled_controls[chrom]
    within_eu_band = (scaled_mean < thresholdEU) & (scaled_mean > -thresholdEU)
    df_means_scaled_controls[chrom + "_CleanEU"] = np.where(within_eu_band, "Y", "N")

df_means_scaled_controls

In [None]:
# Per-cell summary of the strict flags:
#   "CleanGain"/"CleanLoss" = "Y" if any chromosome carries the matching flag;
#   "CleanAneu" = "CleanEU" if every chromosome is flagged CleanEU, else "CleanAneu"
#   if the cell has a CleanGain or CleanLoss, else "Dirty".
flags = df_means_scaled_controls  # same object, shorter name

has_clean_gain = flags.loc[:, "chr1_CleanGain": "chr22_CleanGain"].ne("N").any(axis=1)
flags["CleanGain"] = np.where(has_clean_gain, "Y", "N")

has_clean_loss = flags.loc[:, "chr1_CleanLoss": "chr22_CleanLoss"].ne("N").any(axis=1)
flags["CleanLoss"] = np.where(has_clean_loss, "Y", "N")

all_clean_eu = flags.loc[:, "chr1_CleanEU": "chr22_CleanEU"].ne("N").all(axis=1)
any_clean_change = flags.loc[:, "CleanGain": "CleanLoss"].ne("N").any(axis=1)
flags["CleanAneu"] = np.where(
    all_clean_eu,
    "CleanEU",
    np.where(any_clean_change, "CleanAneu", "Dirty"),
)
df_means_scaled_controls

In [None]:
# Fraction of each "Aneu" call within every sample (scaled data).
df_means_scaled_controls.groupby("Sample")["Aneu"].value_counts(normalize=True)

In [None]:
# Fraction of each "Aneu" call within every cell type (scaled data).
df_means_scaled_controls.groupby("celltype")["Aneu"].value_counts(normalize=True)

In [None]:
# Fraction of each "CleanAneu" class (CleanEU / CleanAneu / Dirty) within every sample.
df_means_scaled_controls.groupby("Sample")["CleanAneu"].value_counts(normalize=True)

In [None]:
# Fraction of each "CleanAneu" class (CleanEU / CleanAneu / Dirty) within every cell type.
df_means_scaled_controls.groupby("celltype")["CleanAneu"].value_counts(normalize=True)

In [None]:
# Display the full classified table (scaled data).
df_means_scaled_controls

In [None]:
# Columns to add to adata.obs: everything except "Sample" and "celltype".
new_obs = df_means_scaled_controls.drop(columns=["Sample", "celltype"])
new_obs.head(3)

In [None]:
# Merge into adata.obs
# Left merge on the index: every row of adata.obs is kept and the columns of
# new_obs are attached where the index labels match.
adata.obs = adata.obs.merge(new_obs, how="left", left_index=True, right_index=True)

In [None]:
# Export the extended per-cell metadata (adata.obs) as a tab-separated file.
# 211126_MAD1_scRNAseq_L2.h5ad  = 211126_MAD1_scRNAseq_L2_w251_t0.5_v2.5-6.5_scaled_controls.h5ad
adata.obs.to_csv(DESKTOP + "220109_cell_metadata_L3_w251_t0.5_v2.5-6.5_allControls_scaled_controls.tsv", sep="\t")
# Writing the full AnnData object is currently disabled:
#adata.write(DESKTOP + "211126_MAD1_scRNAseq_L3_w251_t0.5_v2.5-6.5_scaled_controls.h5ad")

# Repeat all with non_scaled data

## Plots in specific cell types

In [None]:
# Non-scaled means of the "B intermediate lambda" cells, split into the two control
# samples and the proband.
is_b_lambda = df_means["celltype"] == "B intermediate lambda"
df_B = df_means.loc[is_b_lambda]
df_B_controls = df_B.loc[df_B["Sample"].isin(["Control1", "Control2"])]
df_B_proband = df_B.loc[df_B["Sample"] == "Proband"]

In [None]:
# Histograms of the non-scaled chr12 mean: controls first, proband on the same axes.
fig, ax = plt.subplots(figsize=(6, 3))
for subset in (df_B_controls, df_B_proband):
    subset["chr12"].plot.hist(bins=300, ylim=(0, 5), xlim=(-0.2, 0.2), ax=ax)
ax.set_title("Chr12")
plt.show()

In [None]:
# Density estimates of the non-scaled chr12 mean: controls first, proband on the same axes.
fig, ax = plt.subplots(figsize=(6, 3))
for subset in (df_B_controls, df_B_proband):
    subset["chr12"].plot.kde(ylim=(0, 100), xlim=(-0.1, 0.1), ax=ax)
ax.set_title("Chr12")
plt.show()

In [None]:
# One figure per autosome: density of the non-scaled mean in controls and in the proband.
for chrom in ("chr" + str(n) for n in range(1, 23)):
    fig, ax = plt.subplots(figsize=(6, 3))
    for subset in (df_B_controls, df_B_proband):
        subset[chrom].plot.kde(ylim=(0, 100), xlim=(-0.1, 0.1), ax=ax)
    ax.set_title("B lymphocytes " + chrom)
    plt.show()

In [None]:
# Densities of all 22 autosomes overlaid in one figure (control cells, non-scaled).
fig, ax = plt.subplots(figsize=(6, 3))
for chrom in ("chr" + str(n) for n in range(1, 23)):
    df_B_controls[chrom].plot.kde(ylim=(0, 100), xlim=(-0.1, 0.1), ax=ax)
ax.set_title("B intermediate lambda - Controls")
plt.show()

In [None]:
# Densities of all 22 autosomes overlaid in one figure (proband cells, non-scaled).
fig, ax = plt.subplots(figsize=(6, 3))
for chrom in ("chr" + str(n) for n in range(1, 23)):
    df_B_proband[chrom].plot.kde(ylim=(0, 100), xlim=(-0.1, 0.1), ax=ax)
ax.set_title("B intermediate lambda - Proband")
plt.show()

In [None]:
# Densities of the non-scaled means of all cells, one curve per autosome.
fig, ax = plt.subplots(figsize=(6, 3))
for chrom in ("chr" + str(n) for n in range(1, 23)):
    df_means[chrom].plot.kde(ylim=(0, 100), xlim=(-0.1, 0.1), ax=ax)
ax.set_title("All celltypes")
plt.show()

In [None]:
# Densities of the scaled means of all cells, one curve per autosome.
fig, ax = plt.subplots(figsize=(6, 3))
for chrom in ("chr" + str(n) for n in range(1, 23)):
    df_means_scaled_controls[chrom].plot.kde(ylim=(0, 15), xlim=(-1, 1), ax=ax)
ax.set_title("All celltypes")
plt.show()

## Plots in other T

In [None]:
# Non-scaled means of the "gdT_4" cells, split into the two control samples and the proband.
is_gdt4 = df_means["celltype"] == "gdT_4"
df_otherT = df_means.loc[is_gdt4]
df_otherT_controls = df_otherT.loc[df_otherT["Sample"].isin(["Control1", "Control2"])]
df_otherT_proband = df_otherT.loc[df_otherT["Sample"] == "Proband"]
df_otherT.head(3)

In [None]:
# Density of the non-scaled chr12 mean in gdT_4 cells: controls, then proband.
fig, ax = plt.subplots(figsize=(6, 3))
for subset in (df_otherT_controls, df_otherT_proband):
    subset["chr12"].plot.kde(ylim=(0, 100), xlim=(-0.1, 0.1), ax=ax)
ax.set_title("Chr12")
plt.show()

In [None]:
# Density of the non-scaled chr18 mean in gdT_4 cells: controls, then proband.
# The second call sets a different y-limit, which is the one left on the axes.
fig, ax = plt.subplots(figsize=(6, 3))
df_otherT_controls["chr18"].plot.kde(ylim=(0, 100), xlim=(-0.1, 0.1), ax=ax)
df_otherT_proband["chr18"].plot.kde(ylim=(0, 80), xlim=(-0.1, 0.1), ax=ax)
ax.set_title("Chr18")
plt.show()

## Classify Aneuploid cells based on non-scaled data

In [None]:
# Reload the per-chromosome means saved earlier. The first column of the file is
# the index written by to_csv (the cell identifiers); restoring it as the index
# keeps the table aligned with adata.obs for the index-based merge further down.
df_means = pd.read_csv(DESKTOP + "220109_L3_df_means_251_allControls.tsv", sep="\t", index_col=0)

In [None]:
# Flag, per cell and chromosome, a gain (mean above +threshold) or a loss
# (mean below -threshold) with "Y"/"N", using the non-scaled means.
threshold = 0.025

chr_list = ["chr" + str(n) for n in range(1, 23)]

# The new columns are appended in alternating order:
# chr1_gain, chr1_loss, chr2_gain, chr2_loss, ...
for chrom in chr_list:
    raw_mean = df_means[chrom]
    df_means[chrom + "_gain"] = np.where(raw_mean > threshold, "Y", "N")
    df_means[chrom + "_loss"] = np.where(raw_mean < -threshold, "Y", "N")

df_means.head(3)

In [None]:
# Per-cell summary flags (non-scaled data):
#   "Gain" = "Y" if any chromosome is flagged as gained,
#   "Loss" = "Y" if any chromosome is flagged as lost,
#   "Aneu" = "Aneuploid" if the cell has a gain or a loss, otherwise "Euploid".
# The per-chromosome columns are selected by name pattern, not by a label slice:
# the *_gain and *_loss columns were created in alternating order, so a slice from
# "chr1_gain" to "chr22_gain" would also contain the *_loss columns lying between
# them (and the other way round), making "Gain" and "Loss" identical in practice.
gain_flags = df_means.filter(regex=r"^chr\d+_gain$")
loss_flags = df_means.filter(regex=r"^chr\d+_loss$")
df_means["Gain"] = np.where(gain_flags.eq("Y").any(axis=1), "Y", "N")
df_means["Loss"] = np.where(loss_flags.eq("Y").any(axis=1), "Y", "N")
df_means["Aneu"] = np.where(df_means[["Gain", "Loss"]].eq("Y").any(axis=1), "Aneuploid", "Euploid")
df_means

### Classify CleanAneuploid vs CleanEU cells based on non-scaled data

In [None]:
# Stricter per-chromosome flags on the non-scaled means:
#   *_CleanGain: value above +thresholdAneu
#   *_CleanLoss: value below -thresholdAneu
#   *_CleanEU:   value strictly between -thresholdEU and +thresholdEU
thresholdEU = 0.01
thresholdAneu = 0.035

chr_list = ["chr" + str(n) for n in range(1, 23)]

# Three separate passes keep the columns of each kind next to each other.
for chrom in chr_list:
    df_means[chrom + "_CleanGain"] = np.where(df_means[chrom] > thresholdAneu, "Y", "N")

for chrom in chr_list:
    df_means[chrom + "_CleanLoss"] = np.where(df_means[chrom] < -thresholdAneu, "Y", "N")

for chrom in chr_list:
    raw_mean = df_means[chrom]
    within_eu_band = (raw_mean < thresholdEU) & (raw_mean > -thresholdEU)
    df_means[chrom + "_CleanEU"] = np.where(within_eu_band, "Y", "N")

df_means

In [None]:
# Per-cell summary of the strict flags (non-scaled data):
#   "CleanGain"/"CleanLoss" = "Y" if any chromosome carries the matching flag;
#   "CleanAneu" = "CleanEU" if every chromosome is flagged CleanEU, else "CleanAneu"
#   if the cell has a CleanGain or CleanLoss, else "Dirty".
has_clean_gain = df_means.loc[:, "chr1_CleanGain": "chr22_CleanGain"].ne("N").any(axis=1)
df_means["CleanGain"] = np.where(has_clean_gain, "Y", "N")

has_clean_loss = df_means.loc[:, "chr1_CleanLoss": "chr22_CleanLoss"].ne("N").any(axis=1)
df_means["CleanLoss"] = np.where(has_clean_loss, "Y", "N")

all_clean_eu = df_means.loc[:, "chr1_CleanEU": "chr22_CleanEU"].ne("N").all(axis=1)
any_clean_change = df_means.loc[:, "CleanGain": "CleanLoss"].ne("N").any(axis=1)
df_means["CleanAneu"] = np.where(
    all_clean_eu,
    "CleanEU",
    np.where(any_clean_change, "CleanAneu", "Dirty"),
)
df_means

In [None]:
# Fraction of each "Aneu" call within every sample (non-scaled data).
df_means.groupby("Sample")["Aneu"].value_counts(normalize=True)

In [None]:
# Fraction of each "Aneu" call within every cell type (non-scaled data).
df_means.groupby("celltype")["Aneu"].value_counts(normalize=True)

In [None]:
# Fraction of each "CleanAneu" class (CleanEU / CleanAneu / Dirty) within every sample.
df_means.groupby("Sample")["CleanAneu"].value_counts(normalize=True)

In [None]:
# Fraction of each "CleanAneu" class (CleanEU / CleanAneu / Dirty) within every cell type.
df_means.groupby("celltype")["CleanAneu"].value_counts(normalize=True)

In [None]:
# Display the full classified table (non-scaled data).
df_means

In [None]:
# Columns to add to adata.obs: everything except "Sample" and "celltype".
new_obs = df_means.drop(columns=["Sample", "celltype"])
# When the table was reloaded from the TSV without restoring its index, the cell
# identifiers sit in an ordinary "Cell" column; they are moved back to the index so
# that the index-based merge into adata.obs can match the rows.
if "Cell" in new_obs.columns:
    new_obs = new_obs.set_index("Cell")
new_obs.head(3)

In [None]:
# Merge into adata.obs
# Left merge on the index: every row of adata.obs is kept and the columns of
# new_obs are attached where the index labels match.
# NOTE(review): if adata.obs already holds columns with the same names (e.g. from
# the earlier merge of the scaled results), pandas disambiguates them with its
# default "_x"/"_y" suffixes — confirm this is the intended layout of the export.
adata.obs = adata.obs.merge(new_obs, how="left", left_index=True, right_index=True)

In [None]:
# Export the extended per-cell metadata (adata.obs) as a tab-separated file.
# 211126_MAD1_scRNAseq_L2.h5ad  = 211126_MAD1_scRNAseq_L2_w251_t0.5_v2.5-6.5_scaled_controls.h5ad
adata.obs.to_csv(DESKTOP + "220109_cell_metadata_L3_w251_t0.5_v2.5-6.5_allControls_non_scaled.tsv", sep="\t")
# Writing the full AnnData object is currently disabled:
#adata.write(DESKTOP + "211126_MAD1_scRNAseq_L3_w251_t0.5_v2.5-6.5_scaled_controls.h5ad")