In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import clustering functions
from actual_labels import get_actual_labels
from hclust import hclust

In [2]:
# Read the tab-separated SRP120552 table and use its "Gene" column as the row index
DATA_PATH = Path("data", "SRP120552.tsv")
df = pd.read_csv(DATA_PATH, sep="\t").set_index("Gene")

df

Unnamed: 0_level_0,SRR6191750,SRR6191751,SRR6191752,SRR6191753,SRR6191754,SRR6191755,SRR6191756,SRR6191757,SRR6191758,SRR6191759,...,SRR6192398,SRR6192399,SRR6192400,SRR6192401,SRR6192402,SRR6192403,SRR6192404,SRR6192405,SRR6192406,SRR6192407
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003,56.629600,210.765150,0.000000,15.582642,0.000000,3.951188,1.013635,10.157310,12.500870,1.535458,...,143.910830,122.749306,0.000000,122.250435,94.166710,147.495970,0.000000,369.399140,0.000000,2.995046
ENSG00000000005,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSG00000000419,52.532257,181.474520,250.384250,61.192110,0.854224,29.988983,32.392227,43.866116,34.768540,10.885117,...,0.000000,117.458120,0.000000,9.408690,56.129097,112.473976,7.008246,0.000000,248.942280,0.000000
ENSG00000000457,6.420248,9.038753,6.449249,0.896064,9.391796,1.694619,1.771045,0.890477,15.431188,6.612335,...,0.756136,0.789532,2.880358,3.794541,3.027338,0.790558,3.477984,2.319273,2.393112,236.096450
ENSG00000000460,0.000000,27.647682,0.000000,140.431730,0.000000,15.594472,10.486232,1.409931,43.129833,3.073701,...,0.000000,0.000000,0.000000,0.000000,3.229686,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000286268,0.000000,0.000000,0.000000,0.000000,0.000000,2.140921,0.000000,0.000000,0.000000,1.921149,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSG00000286269,3.012283,6.357253,0.000000,2.920779,12.417953,2.216796,3.025142,2.377553,2.266806,0.000000,...,1.006215,0.000000,0.835804,1.811789,4.883857,1.861232,6.817812,0.000000,0.000000,0.000000
ENSG00000286270,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSG00000286271,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.806760,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [3]:
# Keep the most variable genes and flip the result so that rows are the
# SRR runs (samples) and columns are the selected genes.

# Number of highest-variance genes to retain; named so it can be tuned in one place.
N_TOP_GENES = 5000

# Rows of `df` are genes (indexed by "Gene"), so the per-gene variance is a
# reduction along axis=1. This avoids materialising a transposed copy of the
# full matrix just to reduce along the other axis.
gene_variability = df.var(axis=1)

# Gene IDs ordered from highest to lowest variance, truncated to the top N.
top_5000_indices = gene_variability.sort_values(ascending=False).head(N_TOP_GENES).index

# Only the selected subset is transposed: one row per sample, one column per gene.
top_5000 = df.loc[top_5000_indices].T

top_5000

Gene,ENSG00000210082,ENSG00000198804,ENSG00000198712,ENSG00000198727,ENSG00000198886,ENSG00000198899,ENSG00000198938,ENSG00000251562,ENSG00000198763,ENSG00000211459,...,ENSG00000188785,ENSG00000223960,ENSG00000249915,ENSG00000078687,ENSG00000181817,ENSG00000084733,ENSG00000051620,ENSG00000167693,ENSG00000249437,ENSG00000187742
SRR6191750,5599.750,4601.987300,4125.2964,2734.20140,1673.38540,2339.328400,2439.4944,5277.0170,1081.060700,2619.785600,...,5.119839,24.550419,67.210266,1.751851,12.532421,15.421330,11.990015,0.686648,0.999760,30.020580
SRR6191751,7230.140,7942.763700,10058.2920,6546.20070,4487.42100,4911.369000,4940.0130,25557.4600,3281.558300,1165.463900,...,1.073328,308.631400,284.015800,25.872652,122.215230,262.426970,99.676315,228.725770,134.215480,122.478540
SRR6191752,735357.440,391568.720000,352864.0000,387255.40000,225261.05000,270033.800000,238023.1000,6391.4330,84635.860000,66180.195000,...,11.065000,2.761192,0.000000,0.000000,0.000000,0.000000,14.618929,7.209654,6.446755,6.914004
SRR6191753,7469.360,9359.391000,5511.1875,3957.72220,3340.21120,3572.576000,3550.6338,3499.8147,1997.790500,2604.347400,...,25.790913,17.855942,17.995600,49.667503,0.000000,27.020786,1.286581,4.311105,7.159112,36.046585
SRR6191754,8766.538,85.343575,7439.1680,56.53336,20.48142,54.685753,2989.6790,1256.1317,13.072073,19.884577,...,0.709885,10.790157,914.100500,0.000000,0.000000,0.000000,0.000000,0.000000,0.807569,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR6192403,19831.766,4976.292500,13185.2320,8407.10000,6141.42870,4734.336000,7034.2446,52304.2540,3995.972400,1731.459200,...,0.000000,10.141361,49.206787,57.695312,146.490880,39.218490,2.148382,2.361594,8.276861,200.623840
SRR6192404,18816.709,9354.555000,13853.0820,6059.80760,5923.35640,4176.737300,5483.9990,23306.3200,3084.486800,1803.639200,...,2.413695,34.403870,7.847596,0.000000,49.691290,0.000000,27.013287,0.000000,11.660089,6.002738
SRR6192405,8673.412,2280.387200,5620.2540,1360.71750,2601.98630,1368.796400,1581.3860,39778.1130,1863.543500,1039.775600,...,0.000000,46.588814,119.457540,179.151720,149.870530,0.000000,0.000000,0.000000,16.234000,43.513664
SRR6192406,15588.810,2159.225600,9419.2050,6166.48440,5105.62650,3596.666300,5628.1340,31749.2800,3701.223000,1531.713700,...,0.000000,0.000000,225.497710,51.955288,1.880684,249.126790,0.815208,0.000000,121.443530,1.010505


In [4]:
# Get actual labels via the project helper imported from actual_labels.py.
# NOTE(review): the saved output of this cell is
#   ValueError: maximum supported dimension for an ndarray is 32, found 656
# so in the saved run get_actual_labels() raised and the assignment below did
# not complete. Presumably a length-656 label sequence is being passed where
# numpy expects a shape (e.g. np.ndarray(labels) instead of np.array(labels))
# - TODO confirm and fix inside actual_labels.py, then re-run this cell.
original_labels = get_actual_labels()

original_labels

ValueError: maximum supported dimension for an ndarray is 32, found 656

In [26]:
# Get labels for k=8
# NOTE(review): this cell's saved execution count (26) does not follow the
# previous cell's (4); re-run with Restart Kernel -> Run All so the saved
# outputs reflect a clean top-to-bottom execution.
k = 8

# Hierarchical clustering
# hclust is a project helper (imported from hclust.py); it is called with the
# samples-by-genes frame built above and k. Presumably k is the number of
# clusters requested - TODO confirm against hclust.py.
hclust_labels = hclust(top_5000, k)

# Quick check of the return type; the saved output shows numpy.ndarray.
type(hclust_labels)



numpy.ndarray