This notebook aims to find the genes whose expression has the strongest Spearman correlation (positive or negative) with gene SSMa053560.

In [3]:
import os

import numpy as np
import pandas as pd

from utils.misc import extract_number

In [4]:
# Load the expression table: the index holds gene names, the columns are samples.
gene_expressions = pd.read_csv("data/train_data.csv", index_col=0)
samples = gene_expressions.columns.tolist()
genenames = np.array(gene_expressions.index.tolist())
gene_expressions_mat = gene_expressions.to_numpy()

# Parse one age per sample out of its column label.
ages = np.array(list(map(extract_number, samples)))
unique_ages = np.unique(ages)

# Keep only the genes with a strictly positive value in every single sample
# (prevalence is the fraction of samples in which the gene is > 0).
prevalence = (gene_expressions_mat > 0).mean(axis=1)
subset_gene_id = np.flatnonzero(prevalence == 1)
subset_genenames = genenames[subset_gene_id]

# Apply the filter, then flip both containers to samples x genes so that each
# gene can be pulled out as a column.
gene_expressions = gene_expressions.loc[subset_genenames].T
gene_expressions_mat = gene_expressions_mat[subset_gene_id].T

In [5]:
# Expression profile of the reference gene across all samples, as a plain array.
geneexp_053560 = gene_expressions["SSMa053560"].to_numpy()

In [6]:
# Zero-filled result table: one row per retained gene, one column per
# correlation measure. The values are filled in by the correlation loop.
correlations_df = pd.DataFrame(
    np.zeros((gene_expressions.shape[1], 2)),
    index=subset_genenames,
    columns=["Pearson", "Spearman"],
)

In [7]:
from scipy.stats import pearsonr, spearmanr, rankdata

In [8]:
# Correlate every retained gene with the SSMa053560 profile. Both scipy
# functions return (statistic, p-value); only the statistic is stored.
for gene_id in subset_genenames:
    profile = gene_expressions[gene_id].to_numpy()
    correlations_df.at[gene_id, "Pearson"] = pearsonr(profile, geneexp_053560)[0]
    correlations_df.at[gene_id, "Spearman"] = spearmanr(profile, geneexp_053560)[0]

In [9]:
# Order all genes by Spearman correlation with SSMa053560, most negative
# first. The complete sorted table stays in `correlations_df`.
correlations_df = correlations_df.sort_values('Spearman')

# SSMa053560 is itself one of the retained genes and correlates perfectly with
# itself (rho = 1); undefined (NaN) correlations are sorted to the very end.
# Either would take up a slot among the "strongest positive" genes, so both are
# left out before picking the extremes.
ranked_candidates = (
    correlations_df
    .drop(index="SSMa053560", errors="ignore")
    .dropna(subset=["Spearman"])
)
negative_corr = ranked_candidates.head(10)  # 10 most negative correlations
positive_corr = ranked_candidates.tail(10)  # 10 most positive correlations

In [10]:
# Write the correlation table to disk. The output folder is created first so
# that a fresh checkout does not fail on a missing directory.
os.makedirs("gene_plots/corr_SSMa053560", exist_ok=True)
correlations_df.to_csv("gene_plots/corr_SSMa053560/correlations.csv")

In [11]:
from utils.viz import single_line_plot, single_scatter_plot
# Rank every sample's age (tied ages share the lowest rank of their group);
# the plotting cells pass these ranks as `xticks`.
age_ranks = rankdata(ages, method="min")

In [12]:
import matplotlib.pyplot as plt

In [13]:
# Save one scatter plot (expression vs. age) per gene in `negative_corr`.
# Make sure the output folder exists before any figure is written.
os.makedirs("gene_plots/corr_SSMa053560", exist_ok=True)

# Iterate over the genes actually selected (at most 10) instead of a fixed
# range(10), which raises IndexError when fewer than 10 genes are available.
for genename in negative_corr.index[:10]:
    expression = gene_expressions.loc[:, genename].to_numpy()
    # The helper receives the expression as a single-row matrix.
    fig = single_scatter_plot(ymat=expression.reshape(1, -1),
                              xticks=age_ranks, xticknames=ages.astype('str'),
                              xname="Age (Months)", yname="Gene Expression (TPM)",
                              title=f"negative_{genename}")
    fig.savefig(f"gene_plots/corr_SSMa053560/negative_{genename}.pdf", bbox_inches="tight")
    plt.close(fig)  # release the figure so repeated plotting does not pile up memory

In [14]:
# Save one scatter plot (expression vs. age) per gene in `positive_corr`.
# Make sure the output folder exists before any figure is written.
os.makedirs("gene_plots/corr_SSMa053560", exist_ok=True)

# Iterate over the genes actually selected (at most 10) instead of a fixed
# range(10), which raises IndexError when fewer than 10 genes are available.
for genename in positive_corr.index[:10]:
    expression = gene_expressions.loc[:, genename].to_numpy()
    # The helper receives the expression as a single-row matrix.
    fig = single_scatter_plot(ymat=expression.reshape(1, -1),
                              xticks=age_ranks, xticknames=ages.astype('str'),
                              xname="Age (Months)", yname="Gene Expression (TPM)",
                              title=f"positive_{genename}")
    fig.savefig(f"gene_plots/corr_SSMa053560/positive_{genename}.pdf", bbox_inches="tight")
    plt.close(fig)  # release the figure so repeated plotting does not pile up memory