In [None]:
"""Contains function for importing and handling knockout RNAseq data"""
from os.path import join, dirname
import pandas as pd

def importRNAseqKO(path="data/rpmCounts_allRuns_matrix.tsv.xz"):
    """Load the knockout RNAseq count matrix and label every sample by its knockout gene.

    Parameters
    ----------
    path : str or path-like, optional
        Whitespace-delimited table containing a "GeneSymbol" column, a "gene_id"
        column and one column per sample. Defaults to the location the notebook
        has always read from.

    Returns
    -------
    pandas.DataFrame
        Indexed by GeneSymbol. Each column label is the part of the sample name
        before the first "-", so replicates of one knockout share a label and
        column labels are generally not unique.
    """
    # sep=r"\s+" is the documented replacement for the deprecated delim_whitespace=True
    data = pd.read_csv(path, index_col="GeneSymbol", sep=r"\s+")
    data = data.drop(columns=["gene_id"])  # GeneSymbol is already the index
    # dropna's default how="any" removes every column holding at least one missing value
    data = data.dropna(axis=1)
    # Keep only the text before the first "-" of each sample name
    data.columns = list(data.columns.str.split("-").str[0])
    return data

In [None]:
# Load the per-sample matrix; replicate samples of one knockout share a column label
data = importRNAseqKO()

# Average the replicate columns of each knockout gene. Transposing and grouping on the
# index replaces groupby(..., axis=1), which pandas has deprecated.
data_combined = data.T.groupby(level=0).mean().T
# Collapse rows that share a gene symbol by keeping the per-column maximum
data_combined = data_combined.groupby(level="GeneSymbol").max()

# Pool all ten negative controls into one "neg" column. Selecting ['neg01', 'neg10']
# averaged only those two columns, even though neg01..neg10 are all removed below.
neg_columns = ["neg" + str(i).zfill(2) for i in range(1, 11)]
data_combined["neg"] = data_combined[neg_columns].mean(axis=1)
# Remove the individual negative controls and the wild-type column
data_combined = data_combined.drop(columns=neg_columns + ["WT"])

In [None]:
# Show the column labels left in data_combined after replicate averaging and control removal
data_combined.columns

In [None]:
# Inspect one cell of data_combined: row "FOSL1" (gene symbol), column "FOSL1"
# NOTE(review): presumably FOSL1 expression in the FOSL1 knockout samples - confirm
data_combined.loc["FOSL1","FOSL1"]

In [None]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
import numpy as np
%matplotlib inline

#------------------------- Perform PCA using sklearn
def pca(data,num_components):
    """Fit a PCA on the row-normalized data and project that same normalized data.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        One row per sample; each row is rescaled by sklearn's ``normalize``
        before the PCA is fitted.
    num_components : int
        Number of principal components to keep.

    Returns
    -------
    list
        ``[fitted PCA object, scores]`` where ``scores`` holds one row of
        component scores per input row.

    Notes
    -----
    The model used to be fitted on ``normalize(data)`` while the raw ``data``
    was projected, which mixed two scales. Scores are now on the scale of the
    normalized data, so hard-coded axis limits or thresholds tuned to the old
    raw-scale scores need to be revisited.
    """
    model = PCA(n_components=num_components)
    normalized = normalize(data)
    # Project exactly what the model was fitted on
    scores = model.fit_transform(normalized)
    return [model, scores]

#-------------------------- Calculate cumulative variance based on number of PCs included and create r2x plot
def r2x(num_components, pca):
    """Draw the r2x plot: cumulative explained-variance ratio against the number of PCs.

    Parameters
    ----------
    num_components : int
        How many leading components to accumulate and plot.
    pca : fitted PCA object
        Must expose ``explained_variance_ratio_`` with at least
        ``num_components`` entries (as returned by ``pca()``).
    """
    # Running total of the explained-variance ratios of the leading components
    ratios = [pca.explained_variance_ratio_[k] for k in range(num_components)]
    cumulative = np.cumsum(ratios)
    component_counts = list(range(1, num_components + 1))

    plt.figure(figsize=(8, 8))
    ax = plt.subplot(111)
    ax.set_xlabel("Number of PCs", fontsize=15)
    ax.set_ylabel("% Variance", fontsize=15)
    plt.xticks(np.arange(num_components + 1))
    plt.plot(component_counts, cumulative)
    ax.grid()
    plt.title("r2x Plot", fontsize=18)
    

In [None]:
# PCA over samples: data has one column per sample, so transpose to one row per sample
pca_list = pca(data.T,10)
r2x(10, pca_list[0])
X_r = pca_list[1]
KO_genes = data.columns
# sorted() fixes the order of the unique labels; list(set(...)) changes from run to run
# because of string hash randomization, which shuffled colors and panel order.
KO_genes_unique = sorted(set(KO_genes))
# One row of PC scores per sample, tagged with that sample's knockout label
df = pd.DataFrame(X_r)
df["KO Gene"] = KO_genes

In [None]:
import matplotlib.colors as colors
import matplotlib.cm as cmx

#-------------------------- Create PC plots

# Set the color map to match the number of unique knockout genes
z = range(1,len(KO_genes_unique))  # not used by any visible cell; kept in case other code reads it
rainbow = plt.get_cmap('rainbow')
cNorm  = colors.Normalize(vmin=0, vmax=len(KO_genes_unique))
scalarMap = cmx.ScalarMappable(norm=cNorm, cmap=rainbow)

# Zero-based score columns of df plotted in each panel, in subplot order:
# PC1 vs PC2, PC1 vs PC3, PC1 vs PC4, PC2 vs PC3, PC2 vs PC4, PC3 vs PC4
pc_pairs = [(0, 1), (0, 2), (0, 3), (1, 2), (1, 3), (2, 3)]

fig = plt.figure(figsize=(50,40))
for panel, (pc_x, pc_y) in enumerate(pc_pairs, start=1):
    ax = plt.subplot(2, 3, panel)
    # Labels are derived from the plotted columns so they cannot drift out of sync
    # (the PC2 vs PC4 panel used to be labelled PC1 / PC3)
    ax.set_xlabel("PC" + str(pc_x + 1), fontsize = 15)
    ax.set_ylabel("PC" + str(pc_y + 1), fontsize = 15)
    # One scatter call per knockout gene so that each gene gets its own color
    for i, gene in enumerate(KO_genes_unique):
        indx = df["KO Gene"] == gene
        ax.scatter(df.iloc[:, pc_x][indx], df.iloc[:, pc_y][indx], s=10,
                   color=scalarMap.to_rgba(i), label=gene)
    ax.grid()
    # Write the knockout label next to every sample point
    for i, txt in enumerate(KO_genes):
        ax.annotate(txt, (df.iloc[i, pc_x], df.iloc[i, pc_y]), fontsize=6)

In [None]:
#-------------------------- Plot individual knockouts for PC1 vs PC2
# One panel per knockout gene, two panels per row. The grid is sized from the number of
# genes; a fixed 47x2 grid let zip() silently drop every gene beyond the 94th.
n_cols = 2
n_rows = max(1, (len(KO_genes_unique) + n_cols - 1) // n_cols)
# Keep the original panel height (a 180-inch-tall figure for 47 rows)
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                         figsize=(30, 180 * n_rows / 47), squeeze=False)
fig.suptitle('KO Genes PC1 v PC2')

# Shared axis limits taken from the scores themselves (5% padding) so every panel is
# comparable and the limits follow the data instead of being tied to one score scale.
pc1_scores = df.iloc[:, 0]
pc2_scores = df.iloc[:, 1]
pc1_pad = 0.05 * (pc1_scores.max() - pc1_scores.min())
pc2_pad = 0.05 * (pc2_scores.max() - pc2_scores.min())
pc1_limits = (pc1_scores.min() - pc1_pad, pc1_scores.max() + pc1_pad)
pc2_limits = (pc2_scores.min() - pc2_pad, pc2_scores.max() + pc2_pad)

panels = axes.flatten()
for ax, name in zip(panels, KO_genes_unique):
    indx = df["KO Gene"] == name
    ax.scatter(pc1_scores[indx], pc2_scores[indx], s=30)
    ax.grid()
    ax.set_title(name)
    ax.set_xlim(pc1_limits)
    ax.set_ylim(pc2_limits)
# Hide panels that received no gene (e.g. the last one when the gene count is odd)
for spare in panels[len(KO_genes_unique):]:
    spare.set_visible(False)

# Figure-level axis captions shared by all panels
fig.text(0.5, 0.04, 'PC1', ha='center', va='center')
fig.text(0.06, 0.5, 'PC2', ha='center', va='center', rotation='vertical')
fig.tight_layout()
fig.subplots_adjust(top=0.975)
#plt.savefig("PC2vPC1_all.png")

In [None]:
#-------------------------- Analyze spread of replicates' PC scores
# One row per unique knockout gene and one "PCn Spread" column for each of the ten PCs.
# The spread columns start as float 0.0 (not int 0) because they later receive float
# values; writing floats into int columns triggers pandas' incompatible-dtype warning.
spread_columns = ["PC" + str(i) + " Spread" for i in range(1, 11)]
PC_KO_spreads = pd.DataFrame(0.0,
                             index=pd.Index(KO_genes_unique, name="KO GeneSymbol"),
                             columns=spread_columns)
# The replicate count stays an integer column and comes first, as before
PC_KO_spreads.insert(0, "Number of Replicates", 0)
df

In [None]:
# Group the per-sample PC scores by knockout gene. Grouping by name works whether
# "KO Gene" is a column or the index, and it leaves df untouched, so this cell and the
# plotting cells that read df["KO Gene"] can be re-run in any order.
scores_by_gene = df.groupby("KO Gene")

# Spread of a PC for one gene = largest minus smallest score among its replicates
score_range = scores_by_gene.max() - scores_by_gene.min()
for i in range(1, 11):
    # Column i-1 of df holds the scores of PC i; assignment aligns on the gene label
    PC_KO_spreads["PC"+str(i)+" Spread"] = score_range[i-1]
PC_KO_spreads["Number of Replicates"] = scores_by_gene.size()

# Remove all single replicate knockout genes and WT
mask = (PC_KO_spreads["Number of Replicates"] > 1)  & (PC_KO_spreads["Number of Replicates"] < 10)
PC_KO_spreads = PC_KO_spreads[mask]
# Display boxplots of PC spread
boxplot = PC_KO_spreads.boxplot(column=["PC1 Spread", "PC2 Spread", "PC3 Spread", "PC4 Spread",
                                       "PC5 Spread", "PC6 Spread", "PC7 Spread", "PC8 Spread",
                                       "PC9 Spread", "PC10 Spread"], figsize=(12, 8))

In [None]:
# Spread columns of the first four PCs (used by the quartile filters) and of all ten PCs
first_four = ["PC" + str(i) + " Spread" for i in range(1, 5)]
all_spreads = ["PC" + str(i) + " Spread" for i in range(1, 11)]

# Calculate 75th percentile for each PC
Q3 = PC_KO_spreads[first_four].quantile(0.75)
Q3_1, Q3_2, Q3_3, Q3_4 = Q3  # individual names kept for any code that reads them
above_Q3 = PC_KO_spreads[first_four].gt(Q3, axis="columns")

# Only the last expression of a cell is displayed, so the intermediate tables are
# stored and shown explicitly; display() is provided by the IPython/Jupyter kernel.

# Find genes with spread above 75th quartile for all 4 PCs:
high_spread_all4 = PC_KO_spreads.loc[above_Q3.all(axis=1)]
display(high_spread_all4)
        # Genes recorded in the original notebook: THAP11, INO80, BUB1B, KDM1A

# Find genes with spread above 75th quartile for at least 3 PCs:
# counting the PCs above Q3 is equivalent to OR-ing the four possible triples
high_spread_3of4 = PC_KO_spreads.loc[above_Q3.sum(axis=1) >= 3]
display(high_spread_3of4)
        # Genes recorded in the original notebook: MAPK1, NFYA, MITF, PLK1, SRF, THAP11,
        # ZFX, SOX10, ZBTB17, INO80, PRPF4B, BUB1B, KDM1A

# Calculate 25th percentile for each PC
Q1 = PC_KO_spreads[first_four].quantile(0.25)
Q1_1, Q1_2, Q1_3, Q1_4 = Q1
below_Q1 = PC_KO_spreads[first_four].lt(Q1, axis="columns")

# Find genes with spread below 25th quartile for all 4 PCs:
low_spread_all4 = PC_KO_spreads.loc[below_Q1.all(axis=1)]
display(low_spread_all4)
        # Genes recorded in the original notebook: VDR, SETD1B, LHX1, MAPKAPK2, DEAF1,
        # JUNB, KDM2A, GATA4

# Sum spread over all PCs and sort. Only the ten spread columns are summed, so
# re-running the cell does not add a previous "Spread Sum" into the new total.
PC_KO_spreads = (
    PC_KO_spreads
    .assign(**{"Spread Sum": PC_KO_spreads[all_spreads].sum(axis=1)})
    .sort_values(by=["Spread Sum"], ascending=False)
)
PC_KO_spreads

In [None]:
#-------------------------- PCA with replicates averaged together
# data_combined has one column per knockout; transpose so each knockout is one PCA sample
pca_list_combined = pca(data_combined.T, 10)
fitted_combined, X_r_combined = pca_list_combined
r2x(10, fitted_combined)

In [None]:
# Print the explained-variance ratios of both fits, one line per fit
for label, fitted in (("Replicates: ", pca_list[0]), ("Averaged: ", pca_list_combined[0])):
    print(label, fitted.explained_variance_ratio_)

In [None]:
# Show the dimensions of the score matrix from the averaged-replicate PCA
X_r_combined.shape

In [None]:
# Display the replicate-averaged expression table (rows: gene symbols, columns: knockout labels plus "neg")
data_combined

In [None]:
# Print the column labels of data_combined as plain text
print(data_combined.columns)

In [None]:
# Sort the rows of data_combined by their index label (the gene symbol)
data_combined = data_combined.sort_index()

In [None]:
# Build the knockout-by-knockout matrix: for every knockout label except the pooled
# "neg" control, take the expression row of the gene with that same symbol.
# A single .loc selection replaces growing a DataFrame one column at a time and
# transposing it; a label missing from the index still raises KeyError.
ko_genes = [gene for gene in data_combined.columns if gene != "neg"]
# rename_axis drops the index name so the result matches the previous transposed frame
matrix = data_combined.loc[ko_genes, :].rename_axis(index=None)
matrix

In [None]:
# Restart the kernel and clear all outputs.