In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

## Collecting expression data per transcript

In [2]:
# Per-transcript GTEx expression table, stored as a tab-separated file.
# The file is too large to be hosted in the GitHub repository, so it has to be obtained separately.
gtex_table_path = '/home/bioinf/gnomad-variants/raw_data/expression_levels_data/GTEx_table.csv'
expression_full_annot = pd.read_csv(gtex_table_path, sep='\t')
# Number of rows that were loaded.
expression_full_annot.shape[0]

199324

In [3]:
# Preview the first five rows of the expression table.
expression_full_annot.head(5)

Unnamed: 0,transcript_id,gene_id,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,...,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
0,ENST00000373020.8,ENSG00000000003.14,26.32,3.95,13.23,30.15,6.6,38.7,29.3,19.25,...,5.73,16.51,17.06,8.28,3.96,68.85,30.73,5.05,1.67,32.91
1,ENST00000494424.1,ENSG00000000003.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENST00000496771.5,ENSG00000000003.14,2.53,0.48,1.98,4.18,0.38,2.96,11.2,1.25,...,0.8,0.88,0.96,1.03,0.73,28.48,0.89,0.56,0.5,2.14
3,ENST00000612152.4,ENSG00000000003.14,0.47,0.21,0.15,1.23,0.31,1.26,1.41,0.56,...,0.23,0.33,0.59,0.45,0.24,5.59,0.2,0.2,0.14,0.99
4,ENST00000614008.4,ENSG00000000003.14,0.23,0.24,1.32,1.15,0.74,0.0,5.49,0.0,...,0.3,0.0,0.0,0.18,0.45,4.23,0.0,0.42,0.0,0.39


### Adding sample annotation

In [4]:
# Sample annotation table, stored as a comma-separated file.
sample_annotation_path = '/home/bioinf/gnomad-variants/raw_data/expression_levels_data/sample_names_annotation.csv'
name_annot = pd.read_csv(sample_annotation_path, sep=',')

In [5]:
# Preview the first five rows of the sample annotation.
name_annot.head(5)

Unnamed: 0,SAMPID,SMTS,SMTSD
0,GTEX-1117F-0003-SM-58Q7G,Blood,Whole Blood
1,GTEX-1117F-0003-SM-5DWSB,Blood,Whole Blood
2,GTEX-1117F-0003-SM-6WBT7,Blood,Whole Blood
3,GTEX-1117F-0011-R10a-SM-AHZ7F,Brain,Brain - Frontal Cortex (BA9)
4,GTEX-1117F-0011-R10b-SM-CYKQ8,Brain,Brain - Frontal Cortex (BA9)


In [6]:
# Lookup table: sample ID (SAMPID column) -> SMTS annotation value.
# If a SAMPID occurs more than once, the last occurrence wins.
new_column_names = {
    sample_id: smts_value
    for sample_id, smts_value in zip(name_annot['SAMPID'], name_annot['SMTS'])
}

In [7]:
# Number of distinct column labels in the expression table.
expression_full_annot.columns.nunique(dropna=False)

17384

### Calculating median expression for each transcript in each tissue type

In [8]:
# Replace every sample-ID column label with its SMTS value in one pass.
# `DataFrame.rename` ignores mapping keys that are not present among the
# columns, so no explicit membership check is required. A single call also
# avoids rebuilding the whole column index once per sample, which is what a
# per-column `rename(..., inplace=True)` loop does.
# After this step many columns share the same label (one per SMTS value).
# NOTE(review): equivalent to renaming one key at a time as long as no SMTS
# value is itself a SAMPID key of the mapping — confirm for other inputs.
expression_full_annot = expression_full_annot.rename(columns=new_column_names)

In [9]:
# Collapse every group of columns sharing the same label into a single column
# holding the row-wise median of that group. Columns whose label occurs only
# once are kept untouched and stay in their original order.
column_labels = expression_full_annot.columns

# Labels occurring more than once, in order of first appearance. Using the
# index order (rather than iterating over a `set`) makes the order of the
# resulting median columns reproducible between kernel sessions.
repeated_labels = column_labels[column_labels.duplicated(keep=False)].unique()

# Row-wise median of each repeated label; every value is a 1-D array with one
# entry per row of the table.
median_by_label = {
    label: np.median(
        expression_full_annot.loc[:, column_labels == label].to_numpy(), axis=1
    )
    for label in repeated_labels
}

# Reuse the table's own index so the medians line up with their rows even if
# the index is not the default 0..n-1 range.
median_columns = pd.DataFrame(median_by_label, index=expression_full_annot.index)

# One concat: untouched columns first, then the median columns appended at the end.
expression_full_annot = pd.concat(
    [expression_full_annot.loc[:, ~column_labels.isin(repeated_labels)], median_columns],
    axis=1,
)

# Number of labels that were collapsed.
counter = len(repeated_labels)
print(counter)
display(expression_full_annot.head())

30


Unnamed: 0,transcript_id,gene_id,Stomach,Thyroid,Colon,Small Intestine,Pancreas,Pituitary,Spleen,Prostate,...,Vagina,Blood,Adipose Tissue,Breast,Skin,Liver,Adrenal Gland,Nerve,Bladder,Testis
0,ENST00000373020.8,ENSG00000000003.14,8.56,17.25,10.49,13.8,6.535,33.73,7.16,16.02,...,22.33,0.04,24.06,27.65,9.4,21.13,15.265,24.34,12.37,42.75
1,ENST00000494424.1,ENSG00000000003.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.97
2,ENST00000496771.5,ENSG00000000003.14,0.58,1.12,2.06,1.1,0.29,22.57,0.61,1.88,...,2.31,0.0,1.77,2.0,0.47,0.4,0.465,3.87,1.96,4.73
3,ENST00000612152.4,ENSG00000000003.14,0.39,0.71,0.65,0.64,0.295,1.81,0.34,0.84,...,0.83,0.0,0.96,1.2,0.33,0.57,0.485,1.55,0.64,0.6
4,ENST00000614008.4,ENSG00000000003.14,0.11,0.1,0.45,0.48,0.0,2.72,0.23,0.22,...,0.5,0.0,0.78,0.52,0.11,0.0,0.06,0.82,0.23,48.55


### Adding max value of median per transcript

In [11]:
# Row-wise maximum over every column after the first two.
expression_values = expression_full_annot.iloc[:, 2:]
max_median = expression_values.max(axis='columns')

In [12]:
# Build the summary table: the first two columns of the expression table plus
# the per-row maximum computed earlier.
# The labels are assigned directly instead of through `new_column_names`, so
# the SAMPID -> SMTS dictionary stored under that name is not overwritten by a
# list (which would break re-running the column-renaming cell).
max_median_expr = expression_full_annot.iloc[:, :2].copy()
max_median_expr.columns = ['ID_transcript', 'ID_gene']
max_median_expr['Max_median_expression'] = max_median

max_median_expr.head()

Unnamed: 0,ID_transcript,ID_gene,Max_median_expression
0,ENST00000373020.8,ENSG00000000003.14,47.1
1,ENST00000494424.1,ENSG00000000003.14,17.97
2,ENST00000496771.5,ENSG00000000003.14,22.57
3,ENST00000612152.4,ENSG00000000003.14,3.51
4,ENST00000614008.4,ENSG00000000003.14,48.55


### Saving data to file

In [13]:
#max_median_expr.to_csv('/home/bioinf/gnomad-variants/Grishchenko/processed_data/tables/max_tissue_median_expr.tsv', sep='\t', index=False)