The outputs of this notebook are contingency tables in which rows are Sfari (core gene) clusters and columns are full clusters. Each cell of a table contains the number of genes in the intersection of the corresponding core cluster and full cluster.

Cramér's V values are calculated at the end of the notebook.

In [1]:
import pandas as pd
import numpy as np

In [2]:
def expected_m(a):
    """Return the expected counts of a contingency table under independence.

    Every expected cell is ``row_total * column_total / grand_total``.

    Parameters
    ----------
    a : numpy.ndarray
        2-D array of observed counts.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``a``.
    """
    # Compute the marginal totals once instead of once per cell.
    row_totals = np.sum(a, axis=1)
    col_totals = np.sum(a, axis=0)
    return np.outer(row_totals, col_totals) / np.sum(a)

def chisq(a, e):
    """Return the chi-squared statistic for observed counts ``a`` and expected counts ``e``.

    The statistic is the sum over all cells of ``(a - e)**2 / e``.
    """
    deviation = a - e
    per_cell = np.square(deviation) / e
    return np.sum(per_cell)

def cramersv(a, a_chi):
    """Return Cramér's V for the contingency table ``a``.

    Parameters
    ----------
    a : numpy.ndarray
        2-D array of observed counts.
    a_chi : float
        Chi-squared statistic already computed for ``a``.

    Returns
    -------
    float
        ``sqrt((a_chi / n) / min(rows - 1, columns - 1))`` where ``n`` is the
        sum of all counts in ``a``.
    """
    n_rows, n_cols = a.shape[0], a.shape[1]
    total = np.sum(a)
    smaller_dim = np.minimum(n_rows - 1, n_cols - 1)
    return np.sqrt((a_chi / total) / smaller_dim)

Brain

In [None]:
# Load the Louvain clusters of the full brain network: one file per cluster,
# read into a single "entrez" column and tagged with its cluster number ("cid").

brain_full_frames = []
for cluster_id in range(7):
    cluster = pd.read_csv("full_giant_brain_louvain/giant-{}.txt".format(cluster_id), names=["entrez"])
    cluster["cid"] = cluster_id
    brain_full_frames.append(cluster)

# keep the per-cluster names defined, in case other cells use them
(b_brain_0, b_brain_1, b_brain_2, b_brain_3,
 b_brain_4, b_brain_5, b_brain_6) = brain_full_frames

for cluster in brain_full_frames:
    print(cluster.shape)

# NOTE(review): clusters 4-6 are read and printed above but are not part of
# the combined table; presumably intentional -- confirm.
b_brain_list = [b_brain_0, b_brain_1, b_brain_2, b_brain_3]
b_brain_all = pd.concat(b_brain_list).reset_index(drop=True)

print(b_brain_all.shape)

In [None]:
# Load the Louvain clusters of the Sfari brain network: one file per cluster,
# read into a single "entrez" column and tagged with its cluster number ("cid").

s_brain_list = []
for cluster_id in range(4):
    cluster = pd.read_csv("sfari_giant_brain_louvain/giant-{}.txt".format(cluster_id), names=["entrez"])
    cluster["cid"] = cluster_id
    s_brain_list.append(cluster)

# keep the per-cluster names defined, in case other cells use them
s_brain_0, s_brain_1, s_brain_2, s_brain_3 = s_brain_list

for cluster in s_brain_list:
    print(cluster.shape)

# stack all clusters into one table with a fresh 0..n-1 index
s_brain_all = pd.concat(s_brain_list).reset_index(drop=True)

print(s_brain_all.shape)

In [None]:
# Count how many entrez ids of the full brain list also occur in the Sfari
# brain list (printed: Sfari list size, full list size, intersection size).

print(len(s_brain_all["entrez"].tolist()))
print(len(b_brain_all["entrez"].tolist()))

# Build the lookup set once; the original rebuilt and scanned the Sfari list
# for every gene of the full list.
sfari_brain_genes = set(s_brain_all["entrez"].tolist())
inters = [value for value in b_brain_all["entrez"].tolist() if value in sfari_brain_genes]
print(len(inters))

In [None]:
# Contingency table of cluster overlaps for brain: rows are Sfari cluster ids,
# columns are full cluster ids, each cell counts the genes shared by the pair.

df_result_brain = pd.DataFrame(0, index = [0,1,2,3], columns = [0,1,2,3])

# entrez id -> full cluster id; the first occurrence wins, matching the
# original row-by-row search that took the first matching row
full_cid_by_gene = b_brain_all.drop_duplicates("entrez").set_index("entrez")["cid"]
full_cids = s_brain_all["entrez"].map(full_cid_by_gene)

# fail with a clear message instead of an opaque IndexError
unmatched = s_brain_all.loc[full_cids.isna(), "entrez"].tolist()
if unmatched:
    raise ValueError("Sfari brain genes missing from the full brain clusters: {}".format(unmatched))

# count genes per (Sfari cluster, full cluster) pair in one pass
pair_counts = pd.DataFrame({"core": s_brain_all["cid"], "full": full_cids.astype(int)}).groupby(["core", "full"]).size()

# a cluster id outside the table still raises KeyError here, as before
for (core_cid, full_cid), count in pair_counts.items():
    df_result_brain.at[core_cid, full_cid] += count

print(df_result_brain)

Kidney

In [None]:
# Load the Louvain clusters of the full kidney network: one file per cluster,
# read into a single "entrez" column and tagged with its cluster number ("cid").

b_kidney_list = []
for cluster_id in range(3):
    cluster = pd.read_csv("full_giant_kidney_louvain/giant-{}.txt".format(cluster_id), names=["entrez"])
    cluster["cid"] = cluster_id
    b_kidney_list.append(cluster)

# keep the per-cluster names defined, in case other cells use them
b_kidney_0, b_kidney_1, b_kidney_2 = b_kidney_list

for cluster in b_kidney_list:
    print(cluster.shape)

# stack all clusters into one table with a fresh 0..n-1 index
b_kidney_all = pd.concat(b_kidney_list).reset_index(drop=True)

print(b_kidney_all.shape)

In [None]:
# Load the Louvain clusters of the Sfari kidney network: one file per cluster,
# read into a single "entrez" column and tagged with its cluster number ("cid").

s_kidney_list = []
for cluster_id in range(5):
    cluster = pd.read_csv("sfari_giant_kidney_louvain/giant-{}.txt".format(cluster_id), names=["entrez"])
    cluster["cid"] = cluster_id
    s_kidney_list.append(cluster)

# keep the per-cluster names defined, in case other cells use them
s_kidney_0, s_kidney_1, s_kidney_2, s_kidney_3, s_kidney_4 = s_kidney_list

for cluster in s_kidney_list:
    print(cluster.shape)

# stack all clusters into one table with a fresh 0..n-1 index
s_kidney_all = pd.concat(s_kidney_list).reset_index(drop=True)

print(s_kidney_all.shape)

In [None]:
# Sizes of the Sfari and full kidney gene lists, then the size of their overlap.
print(len(s_kidney_all["entrez"].tolist()))
print(len(b_kidney_all["entrez"].tolist()))

# Build the lookup set once; the original rebuilt and scanned the Sfari list
# for every gene of the full list.
sfari_kidney_genes = set(s_kidney_all["entrez"].tolist())
inters_kidney = [value for value in b_kidney_all["entrez"].tolist() if value in sfari_kidney_genes]
print(len(inters_kidney))

# Contingency table of cluster overlaps for kidney: rows are Sfari cluster ids,
# columns are full cluster ids, each cell counts the genes shared by the pair.

df_result_kidney = pd.DataFrame(0, index = [0,1,2,3,4], columns = [0,1,2])

# entrez id -> full cluster id; the first occurrence wins, matching the
# original row-by-row search that took the first matching row
full_cid_by_gene = b_kidney_all.drop_duplicates("entrez").set_index("entrez")["cid"]
full_cids = s_kidney_all["entrez"].map(full_cid_by_gene)

# fail with a clear message instead of an opaque IndexError
unmatched = s_kidney_all.loc[full_cids.isna(), "entrez"].tolist()
if unmatched:
    raise ValueError("Sfari kidney genes missing from the full kidney clusters: {}".format(unmatched))

# count genes per (Sfari cluster, full cluster) pair in one pass
pair_counts = pd.DataFrame({"core": s_kidney_all["cid"], "full": full_cids.astype(int)}).groupby(["core", "full"]).size()

# a cluster id outside the table still raises KeyError here, as before
for (core_cid, full_cid), count in pair_counts.items():
    df_result_kidney.at[core_cid, full_cid] += count

print(df_result_kidney)

Lung

In [None]:
# Load the Louvain clusters of the full lung network: one file per cluster,
# read into a single "entrez" column and tagged with its cluster number ("cid").

b_lung_list = []
for cluster_id in range(3):
    cluster = pd.read_csv("full_giant_lung_louvain/giant-{}.txt".format(cluster_id), names=["entrez"])
    cluster["cid"] = cluster_id
    b_lung_list.append(cluster)

# keep the per-cluster names defined, in case other cells use them
b_lung_0, b_lung_1, b_lung_2 = b_lung_list

for cluster in b_lung_list:
    print(cluster.shape)

# stack all clusters into one table with a fresh 0..n-1 index
b_lung_all = pd.concat(b_lung_list).reset_index(drop=True)

print(b_lung_all.shape)

In [None]:
# Load the Louvain clusters of the Sfari lung network: one file per cluster,
# read into a single "entrez" column and tagged with its cluster number ("cid").

s_lung_list = []
for cluster_id in range(5):
    cluster = pd.read_csv("sfari_giant_lung_louvain/giant-{}.txt".format(cluster_id), names=["entrez"])
    cluster["cid"] = cluster_id
    s_lung_list.append(cluster)

# keep the per-cluster names defined, in case other cells use them
s_lung_0, s_lung_1, s_lung_2, s_lung_3, s_lung_4 = s_lung_list

for cluster in s_lung_list:
    print(cluster.shape)

# stack all clusters into one table with a fresh 0..n-1 index
s_lung_all = pd.concat(s_lung_list).reset_index(drop=True)

print(s_lung_all.shape)

In [None]:
# Sizes of the Sfari and full lung gene lists, then the size of their overlap.
print(len(s_lung_all["entrez"].tolist()))
print(len(b_lung_all["entrez"].tolist()))

# Build the lookup set once; the original rebuilt and scanned the Sfari list
# for every gene of the full list.
sfari_lung_genes = set(s_lung_all["entrez"].tolist())
inters_lung = [value for value in b_lung_all["entrez"].tolist() if value in sfari_lung_genes]
print(len(inters_lung))

# Contingency table of cluster overlaps for lung: rows are Sfari cluster ids,
# columns are full cluster ids, each cell counts the genes shared by the pair.
df_result_lung = pd.DataFrame(0, index = [0,1,2,3,4], columns = [0,1,2])

# entrez id -> full cluster id; the first occurrence wins, matching the
# original row-by-row search that took the first matching row
full_cid_by_gene = b_lung_all.drop_duplicates("entrez").set_index("entrez")["cid"]
full_cids = s_lung_all["entrez"].map(full_cid_by_gene)

# fail with a clear message instead of an opaque IndexError
unmatched = s_lung_all.loc[full_cids.isna(), "entrez"].tolist()
if unmatched:
    raise ValueError("Sfari lung genes missing from the full lung clusters: {}".format(unmatched))

# count genes per (Sfari cluster, full cluster) pair in one pass
pair_counts = pd.DataFrame({"core": s_lung_all["cid"], "full": full_cids.astype(int)}).groupby(["core", "full"]).size()

# a cluster id outside the table still raises KeyError here, as before
for (core_cid, full_cid), count in pair_counts.items():
    df_result_lung.at[core_cid, full_cid] += count

print(df_result_lung)

In [None]:
# Convert each tissue's contingency table to a NumPy array and display it.
brain = df_result_brain.to_numpy()
kidney = df_result_kidney.to_numpy()
lung = df_result_lung.to_numpy()

for counts in (brain, kidney, lung):
    print(counts)

In [None]:
# Expected counts for each tissue's contingency table
brain_e = expected_m(brain)
kidney_e = expected_m(kidney)
lung_e = expected_m(lung)

# Chi-squared statistic of observed versus expected counts
brain_chi = chisq(brain, brain_e)
kidney_chi = chisq(kidney, kidney_e)
lung_chi = chisq(lung, lung_e)

# chi-squared values
print(brain_chi)
print(kidney_chi)
# was print(liver_chi): no such variable exists, the third tissue is lung
print(lung_chi)

# Cramér's V values
print(cramersv(brain, brain_chi))
print(cramersv(kidney, kidney_chi))
print(cramersv(lung, lung_chi))