In [1]:
import os

while os.getcwd()[-len('mcrc-cetuximab-analysis'):] != 'mcrc-cetuximab-analysis':
    os.chdir('..')
os.getcwd() 

'/home/max/mcrc-cetuximab-analysis'

In [2]:
import pandas as pd
import numpy as np
from collections import Counter

In [None]:
# Parse info on gene names, lengths from a GTF file
# no need to rerun

def parse_gtf(gtf_path):
    ensemble_gene_id_to_length = Counter()
    ensemble_gene_id_to_version = dict()
    ensemble_gene_id_to_hgnc = dict()
    ensemble_gene_id_to_biotype = dict()
    with open(gtf_path, "r") as f:
        for li, line in enumerate(f):
            # print(line)
            if line.startswith("#"):
                continue
            
            parts = line.strip().split()
            chromosome = parts[0]
            entry_source_0 = parts[1]
            entry_type = parts[2]
            start = int(parts[3])
            end = int(parts[4])
            info = dict()
            for i in range(8, len(parts) - 1, 2):
                key = parts[i]
                val = parts[i + 1][1:-2]
                if key == 'gene_name' and val == 'havana':
                    print(key, val, i)
                info[key] = val
            
            if entry_type == 'gene' and 'gene_id' in info and 'gene_name' in info:
                ensemble_gene_id_to_hgnc[info['gene_id']] = info['gene_name']
                if 'gene_biotype' in info:
                    ensemble_gene_id_to_biotype[info['gene_id']] = info['gene_biotype']
            elif entry_type == 'exon':
                version = info['gene_version']
                ensemble_gene_id = info['gene_id']
                ensemble_gene_id_to_version[ensemble_gene_id] = version
                ensemble_gene_id_to_length[ensemble_gene_id] += end - start + 1
                    
    return ensemble_gene_id_to_hgnc, ensemble_gene_id_to_length, ensemble_gene_id_to_version, ensemble_gene_id_to_biotype

ensemble_gene_id_to_hgnc, ensemble_gene_id_to_length, \
 ensemble_gene_id_to_version, ensemble_gene_id_to_biotype = parse_gtf('data/Homo_sapiens.GRCh38.113.gtf')

cnt = Counter()
hgnc_to_ensemble_set = dict()
for key in ensemble_gene_id_to_hgnc:
# for key in ensemble_genes_common:
    hgnc = ensemble_gene_id_to_hgnc[key]
    cnt[hgnc] += 1
    if cnt[hgnc] == 1:
        hgnc_to_ensemble_set[hgnc] = {key}
    else:
        hgnc_to_ensemble_set[hgnc].add(key)


In [10]:
fpkms_df = pd.read_csv("raw/GSE183984_ASAN_RNASEQ_FPKM_ensg.csv", index_col=0).T
raw_counts_df = pd.read_csv("raw/GSE183984_ASAN_RNASEQ_raw_counts_ensg.csv", index_col=0).T
data_ensemble_genes = list(fpkms_df.columns)

tpms_df = fpkms_df.T.div(fpkms_df.sum(axis=1)).T * 10 ** 6

In [11]:
def process_columns(df, shrink_to_bg=False):
    print('Initial df cols: ', len(df.columns))
    df = df.drop(columns=list(set(df.columns) - set(ensemble_gene_id_to_hgnc.keys())))
    print('After dropping cols not annotated with HGNC name in GTF: ', len(df.columns))
    
    renamer = dict()
    for egene in ensemble_gene_id_to_hgnc:
        if ensemble_gene_id_to_biotype[egene] == 'protein_coding':
            renamer[egene] = ensemble_gene_id_to_hgnc[egene]

    df = df.rename(columns=renamer)
    df = df.drop(columns=[col for col in df.columns if col.startswith('ENSG')])
    
    print('After renaming to HGNC (and deleting non-protein coding egenes): ', len(df.columns))

    if shrink_to_bg:
        df = df[list(common_hgncs.intersection(df.columns))]
        print('After shrinking to BG HGNCs that are in the table: ', len(df.columns))
    
    mask = df.columns.duplicated()
    for i in range(len(mask)):
        if mask[i]:
            j = 0
            while df.columns[j] != df.columns[i]:
                j += 1
            df[df.columns[j]] += df[df.columns[i]]

    df = df.loc[:,~df.columns.duplicated()]

    print('After summing columns with the same HGNC name: ', len(df.columns))

    return df

raw_counts_df = process_columns(raw_counts_df)
tpms_df = process_columns(tpms_df)

# Rescale TPMs after deleting and summing genes
tpms_df = tpms_df.div(tpms_df.sum(axis=1), axis='index') * 10 ** 6
print(tpms_df.sum(axis=1))

Initial df cols:  58735
After dropping cols not annotated with HGNC name in GTF:  42148
After renaming to HGNC (and deleting non-protein coding egenes):  19394
After summing columns with the same HGNC name:  19388
Initial df cols:  58735
After dropping cols not annotated with HGNC name in GTF:  42148
After renaming to HGNC (and deleting non-protein coding egenes):  19394
After summing columns with the same HGNC name:  19388
18R176_0020    1000000.0
18R177_0021    1000000.0
18R178_0022    1000000.0
18R296_0008    1000000.0
18R301_0002    1000000.0
                 ...    
18R309_0015    1000000.0
18R310_0016    1000000.0
18R311_0018    1000000.0
18R318_0005    1000000.0
18R319_0014    1000000.0
Length: 113, dtype: float64


In [12]:
import pandas as pd

def extend_id(id):
    fp, sp = id.split('_')
    while len(sp) < 4:
        sp = '0' + sp
    return fp + '_' + sp

# Load annotation
ann = pd.read_csv("data/ann.csv", index_col=0)
ann_unfiltered = pd.read_csv("data/ann_unfiltered.csv", index_col=0)

tpms_df.index = tpms_df.index.map(extend_id)
raw_counts_df.index = raw_counts_df.index.map(extend_id)

# Leave only samples from annotation

tpms_df_unfiltered = tpms_df.loc[ann_unfiltered.index]
raw_counts_df_unfiltered = raw_counts_df.loc[ann_unfiltered.index]

tpms_df = tpms_df.loc[ann.index]
raw_counts_df = raw_counts_df.loc[ann.index]

log_tpms_df = tpms_df.apply(lambda x: np.log2(1 + x))
log_tpms_df_unfiltered = tpms_df_unfiltered.apply(lambda x: np.log2(1 + x))

In [13]:
log_tpms_df.to_csv('data/log_tpms_from_fpkm_hgnc_filtered_by_ann.csv', index=True)
raw_counts_df.to_csv('data/raw_counts_hgnc_filtered_by_ann.csv', index=True)

log_tpms_df_unfiltered.to_csv('data/log_tpms_from_fpkm_hgnc_unfiltered.csv', index=True)
raw_counts_df_unfiltered.to_csv('data/raw_counts_hgnc_unfiltered.csv', index=True)

In [14]:
our_hgncs = set(log_tpms_df.columns)
bg_hgncs = set(open('data/gene_lists/bg_gnames.txt').read().strip().split())

print('Excess genes compared to BG list: ', len(our_hgncs - bg_hgncs))
print('Missing genes from BG list: ', len(bg_hgncs - our_hgncs))

common_hgncs = our_hgncs.intersection(bg_hgncs)
print('Common genes: ', len(common_hgncs))

for key in our_hgncs:
    if len(hgnc_to_ensemble_set[key]) > 1 and key is not None:
        print(key)
        print([(egene, ensemble_gene_id_to_length[egene], ensemble_gene_id_to_biotype[egene], ensemble_gene_id_to_version[egene]) for egene in hgnc_to_ensemble_set[key]])

Excess genes compared to BG list:  1444
Missing genes from BG list:  2118
Common genes:  17944
DHRSX
[('ENSG00000169084', 6400, 'protein_coding', '15'), ('ENSG00000292338', 6400, 'protein_coding', '1')]
ASMT
[('ENSG00000196433', 4358, 'protein_coding', '13'), ('ENSG00000292336', 4358, 'protein_coding', '1')]
IL9R
[('ENSG00000124334', 6876, 'protein_coding', '19'), ('ENSG00000292373', 6876, 'protein_coding', '2')]
SPATA13
[('ENSG00000182957', 41488, 'protein_coding', '16'), ('ENSG00000228741', 1721, 'lncRNA', '2')]
CSF2RA
[('ENSG00000292357', 28693, 'protein_coding', '2'), ('ENSG00000198223', 28693, 'protein_coding', '19')]
SIGLEC5
[('ENSG00000268500', 5713, 'protein_coding', '8'), ('ENSG00000105501', 553, 'lncRNA', '13')]
PINX1
[('ENSG00000258724', 1753, 'protein_coding', '1'), ('ENSG00000254093', 6770, 'protein_coding', '9')]
PDE4C
[('ENSG00000285188', 5979, 'protein_coding', '1'), ('ENSG00000105650', 27984, 'protein_coding', '23')]
GTPBP6
[('ENSG00000292358', 7181, 'protein_coding', 

In [15]:
print('ENS genes with known HGNC name: ', len(ensemble_gene_id_to_hgnc.keys()))
print('ENS genes in raw data: ', len(data_ensemble_genes))
print('ENS genes with known HGNC that are not in raw data: ', len(set(ensemble_gene_id_to_hgnc.keys()) - set(data_ensemble_genes)))
print('ENS genes in raw data for which we don\'t know HGNC: ', len(set(data_ensemble_genes) - set(ensemble_gene_id_to_hgnc.keys())))

ensemble_genes_common = list(set(data_ensemble_genes).intersection(ensemble_gene_id_to_hgnc.keys()))
print('ENS genes from data with known HGNC: ', len(ensemble_genes_common))

ENS genes with known HGNC name:  42745
ENS genes in raw data:  58735
ENS genes with known HGNC that are not in raw data:  597
ENS genes in raw data for which we don't know HGNC:  16587
ENS genes from data with known HGNC:  42148


In [16]:
log_tpms_df.max(axis=1).max()

15.321326451111348