In [16]:
import pandas as pd
import networkx as nx
import pickle
import numpy as np
from scipy.stats import spearmanr
from statsmodels.stats.multitest import multipletests

In [116]:
tissue_samples = pd.read_csv('GTEx_v7_Annotations_SampleAttributesDS.txt', sep = '\t')
tissue_list = tissue_samples.loc[tissue_samples['SMTSD'] == 'Muscle - Skeletal']['SAMPID'].to_numpy()

In [117]:
columns_data = pd.read_csv('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct', index_col = 'Name', sep = '\t', header = 2 , nrows = 2)
available_cols = columns_data.columns
common_columns = list(set(available_cols).intersection(tissue_list))
common_columns.append('Name')
print(len(common_columns))

565


In [86]:
variance_dataframe = pd.DataFrame(columns = ['Name', 'Variance'])

chunksize = 2000
for chunk in pd.read_csv('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct', index_col = 'Name', usecols = common_columns, sep = '\t', header = 2 , chunksize=chunksize):
    for i in chunk.index.values:
        variance = np.var(chunk.loc[i].values)
        #print(variance)
        row = pd.Series([i, variance], index=['Name', 'Variance'])
        variance_dataframe = variance_dataframe.append(row, ignore_index=True)
    
variance_dataframe.index.values

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28


array([    0,     1,     2, ..., 56199, 56200, 56201])

In [131]:
variance_dataframe.to_csv('variance_dataset.csv', index=False)

In [87]:
all_rows = pd.read_csv('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct', usecols = ['Name'], sep = '\t', header = 2)

Unnamed: 0,Name
0,ENSG00000223972.4
1,ENSG00000227232.4
2,ENSG00000243485.2
3,ENSG00000237613.2
4,ENSG00000268020.2
...,...
56197,ENSG00000198695.2
56198,ENSG00000210194.1
56199,ENSG00000198727.2
56200,ENSG00000210195.2


In [110]:
rows = variance_dataframe.nlargest(200, 'Variance').index.values
rows = np.append(rows, [0, 1, 2])
print(len(rows))
skip_rows = np.setdiff1d(all_rows.index.values,rows)
skip_rows

203


array([    3,     4,     5, ..., 56198, 56200, 56201])

In [114]:
def logic(index):
    if index in skip_rows:
        return True
    return False

small_dataset = pd.read_csv('GTEx_Analysis_2016-01-15_v7_RNASeQCv1.1.8_gene_tpm.gct', index_col = 'Name', usecols = common_columns, sep = '\t', header = 2 , skiprows= lambda x: logic(x))

In [115]:
small_dataset

Unnamed: 0_level_0,GTEX-1117F-0426-SM-5EGHI,GTEX-111CU-2026-SM-5GZZC,GTEX-111FC-0326-SM-5GZZ1,GTEX-111VG-2626-SM-5GZY2,GTEX-111YS-2326-SM-5987L,GTEX-1122O-2426-SM-5GIDN,GTEX-1128S-2426-SM-5H11B,GTEX-113JC-2726-SM-5EGIS,GTEX-117XS-2526-SM-5H11G,GTEX-117YW-2426-SM-5Q5AE,...,GTEX-ZYFC-0526-SM-5GIDF,GTEX-ZYFD-0326-SM-5NQ8I,GTEX-ZYFG-2426-SM-5GIE8,GTEX-ZYT6-1626-SM-5E45R,GTEX-ZYVF-0626-SM-5E43Q,GTEX-ZYW4-0526-SM-5GZZ5,GTEX-ZYY3-0526-SM-5E45G,GTEX-ZZ64-1526-SM-5E43K,GTEX-ZZPT-0626-SM-5GZXT,GTEX-ZZPU-2626-SM-5E45Y
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000239664.2,0.1388,0.10160,0.18120,0.00000,0.00000,0.00000,0.07281,0.11910,0.00000,0.0000,...,0.00000,0.11890,0.1049,0.08330,0.00000,0.0000,0.00000,0.00000,0.14250,0.1002
ENSG00000230021.3,0.2196,0.10050,0.07167,0.03801,0.09146,0.02708,0.01440,0.07068,0.05287,0.0521,...,0.10920,0.02351,0.1452,0.04942,0.06495,0.0000,0.11100,0.09546,0.08454,0.0991
ENSG00000223659.1,0.0000,0.04244,0.03784,0.12040,0.04829,0.00000,0.03041,0.14930,0.03722,0.1100,...,0.09226,0.00000,0.1752,0.03479,0.00000,0.1393,0.03906,0.08065,0.11900,0.0000
ENSG00000237973.1,72.7800,89.10000,121.10000,82.15000,92.45000,106.90000,277.20000,105.50000,118.00000,113.1000,...,10440.00000,312.20000,69.8200,145.00000,62.23000,79.0300,152.40000,415.30000,93.62000,125.0000
ENSG00000229344.1,30.4800,18.86000,32.80000,13.74000,31.69000,17.55000,18.85000,28.44000,17.60000,17.7300,...,18.99000,24.93000,34.9700,39.61000,22.86000,15.3400,96.23000,34.81000,16.94000,31.2100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000210184.1,0.0000,3.46100,0.51440,0.00000,1.31300,1.55500,0.82690,1.35300,1.01200,0.9972,...,0.00000,0.00000,0.5954,2.83800,1.24300,0.0000,1.59300,0.54820,0.80910,2.2760
ENSG00000198786.2,10280.0000,14280.00000,6258.00000,6983.00000,10920.00000,12560.00000,14200.00000,7985.00000,11240.00000,11350.0000,...,10430.00000,7660.00000,18860.0000,6213.00000,4777.00000,11210.0000,6843.00000,13270.00000,4408.00000,14710.0000
ENSG00000198727.2,26010.0000,33250.00000,35060.00000,18360.00000,31320.00000,37060.00000,38790.00000,31520.00000,28310.00000,29780.0000,...,26400.00000,31510.00000,36380.0000,30270.00000,23330.00000,21030.0000,28210.00000,30740.00000,28450.00000,36240.0000
ENSG00000210195.2,6.3410,1.03100,1.83900,0.48780,2.34700,0.69500,2.58700,0.60470,0.00000,2.2290,...,125.60000,4.22300,3.7260,0.42280,0.00000,2.2570,3.32200,2.94000,0.72330,47.3100


In [133]:
df = pd.read_csv('variance_dataset.csv')