# Data pre-processing

## Setup -CHECK SPELLING MISTAKES

Import relevant libraries

In [1]:
# Import libs
import os.path
from os import path
import gzip
import pandas as pd
import numpy as np
import urllib.request
from gtfparse import read_gtf
from biomart import BiomartServer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

Create function for printing out rows and cols

In [2]:
def print_rowcol(pretext, df):
    template = pretext + ': {} rows x {} cols'
    print(template.format(str(df.shape[0]), str(df.shape[1])))

Setup paths - main data path is folder inside the code repository where the data dowloaded from download-data has been stored

In [3]:
data_path = '../data'
gtf_path = data_path + '/Mus_musculus.GRCm38.99.gtf'
tpm_yang_path = data_path + '/GSE90848_Ana6_basal_hair_bulb_TPM.txt'
tpm_yang_path2 = data_path + '/GSE90848_Tel_Ana1_Ana2_bulge_HG_basal_HB_TPM.txt'
tpm_joost_path = data_path + '/GSE67602_Joost_et_al_expression.txt'
tpm_ghahramani_path = data_path + '/GSE99989_NCA_BCatenin_TPM_matrix.csv'
gene_tsv_path = data_path + '/gene_names.tsv'
gtf_data_path = data_path + '/gtf.txt'

Set the main thresholds for the pre-processing
- min_num_genes_in_cell_exp - is the min number of genes that need to have expression > **min_ltpm_exp** for it to be considered a valid cell
- min_num_cells_for_gene_exp - is the min number of cells that need to have an expression for that gene > **min_ltpm_exp** for it to be considered a valid gene

In [4]:
# Thresholds
min_num_genes_in_cell_exp = 1000
min_num_cells_for_gene_exp = 500
min_ltpm_exp = 1

# Train/Test
test_data_size = 500

## Load and process gene data

Load GTF data and show some data

In [5]:
if path.exists(gtf_data_path) is False:
    df_gtf = read_gtf(gtf_path)
    df_gtf.to_csv(gtf_data_path)

df_gtf = pd.read_csv(gtf_data_path)
print_rowcol('Loaded GTF', df_gtf)
df_gtf

  interactivity=interactivity, compiler=compiler, result=result)


Loaded GTF: 1868330 rows x 27 cols


Unnamed: 0.1,Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,gene_id,...,transcript_source,transcript_biotype,tag,transcript_support_level,exon_number,exon_id,exon_version,ccds_id,protein_id,protein_version
0,0,1,havana,gene,3073253,3074322,,+,0,ENSMUSG00000102693,...,,,,,,,,,,
1,1,1,havana,transcript,3073253,3074322,,+,0,ENSMUSG00000102693,...,havana,TEC,basic,,,,,,,
2,2,1,havana,exon,3073253,3074322,,+,0,ENSMUSG00000102693,...,havana,TEC,basic,,1.0,ENSMUSE00001343744,1.0,,,
3,3,1,ensembl,gene,3102016,3102125,,+,0,ENSMUSG00000064842,...,,,,,,,,,,
4,4,1,ensembl,transcript,3102016,3102125,,+,0,ENSMUSG00000064842,...,ensembl,snRNA,basic,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1868325,1868325,JH584295.1,ensembl,CDS,708,752,,-,2,ENSMUSG00000095742,...,ensembl,protein_coding,basic,5.0,5.0,,,,ENSMUSP00000137004,1.0
1868326,1868326,JH584295.1,ensembl,exon,565,633,,-,0,ENSMUSG00000095742,...,ensembl,protein_coding,basic,5.0,6.0,ENSMUSE00000997159,1.0,,,
1868327,1868327,JH584295.1,ensembl,CDS,565,633,,-,2,ENSMUSG00000095742,...,ensembl,protein_coding,basic,5.0,6.0,,,,ENSMUSP00000137004,1.0
1868328,1868328,JH584295.1,ensembl,exon,66,109,,-,0,ENSMUSG00000095742,...,ensembl,protein_coding,basic,5.0,7.0,ENSMUSE00001007635,1.0,,,


Calculate feature lengths

In [6]:
df_gtf.insert(6,"feature_len", df_gtf.end - df_gtf.start)
df_gtf

INFO:numexpr.utils:NumExpr defaulting to 6 threads.


Unnamed: 0.1,Unnamed: 0,seqname,source,feature,start,end,feature_len,score,strand,frame,...,transcript_source,transcript_biotype,tag,transcript_support_level,exon_number,exon_id,exon_version,ccds_id,protein_id,protein_version
0,0,1,havana,gene,3073253,3074322,1069,,+,0,...,,,,,,,,,,
1,1,1,havana,transcript,3073253,3074322,1069,,+,0,...,havana,TEC,basic,,,,,,,
2,2,1,havana,exon,3073253,3074322,1069,,+,0,...,havana,TEC,basic,,1.0,ENSMUSE00001343744,1.0,,,
3,3,1,ensembl,gene,3102016,3102125,109,,+,0,...,,,,,,,,,,
4,4,1,ensembl,transcript,3102016,3102125,109,,+,0,...,ensembl,snRNA,basic,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1868325,1868325,JH584295.1,ensembl,CDS,708,752,44,,-,2,...,ensembl,protein_coding,basic,5.0,5.0,,,,ENSMUSP00000137004,1.0
1868326,1868326,JH584295.1,ensembl,exon,565,633,68,,-,0,...,ensembl,protein_coding,basic,5.0,6.0,ENSMUSE00000997159,1.0,,,
1868327,1868327,JH584295.1,ensembl,CDS,565,633,68,,-,2,...,ensembl,protein_coding,basic,5.0,6.0,,,,ENSMUSP00000137004,1.0
1868328,1868328,JH584295.1,ensembl,exon,66,109,43,,-,0,...,ensembl,protein_coding,basic,5.0,7.0,ENSMUSE00001007635,1.0,,,


Filter GTF data for exons, 3' and 5' UTRs

In [7]:
df_gtf = df_gtf[(df_gtf.feature=='exon') | (df_gtf.feature=='three_prime_utr') | (df_gtf.feature=='five_prime_utr')]
print_rowcol('Filtered GTF for exons and UTR', df_gtf)
df_gtf

Filtered GTF for exons and UTR: 1025718 rows x 28 cols


Unnamed: 0.1,Unnamed: 0,seqname,source,feature,start,end,feature_len,score,strand,frame,...,transcript_source,transcript_biotype,tag,transcript_support_level,exon_number,exon_id,exon_version,ccds_id,protein_id,protein_version
2,2,1,havana,exon,3073253,3074322,1069,,+,0,...,havana,TEC,basic,,1.0,ENSMUSE00001343744,1.0,,,
5,5,1,ensembl,exon,3102016,3102125,109,,+,0,...,ensembl,snRNA,basic,,1.0,ENSMUSE00000522066,1.0,,,
8,8,1,havana,exon,3213609,3216344,2735,,-,0,...,havana,processed_transcript,,1.0,1.0,ENSMUSE00000858910,1.0,,,
9,9,1,havana,exon,3205901,3207317,1416,,-,0,...,havana,processed_transcript,,1.0,2.0,ENSMUSE00000866652,1.0,,,
11,11,1,havana,exon,3213439,3215632,2193,,-,0,...,havana,processed_transcript,,1.0,1.0,ENSMUSE00000863980,1.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1868320,1868320,JH584295.1,ensembl,exon,986,1009,23,,-,0,...,ensembl,protein_coding,basic,5.0,3.0,ENSMUSE00001085113,1.0,,,
1868322,1868322,JH584295.1,ensembl,exon,972,981,9,,-,0,...,ensembl,protein_coding,basic,5.0,4.0,ENSMUSE00001070510,1.0,,,
1868324,1868324,JH584295.1,ensembl,exon,708,752,44,,-,0,...,ensembl,protein_coding,basic,5.0,5.0,ENSMUSE00001015770,1.0,,,
1868326,1868326,JH584295.1,ensembl,exon,565,633,68,,-,0,...,ensembl,protein_coding,basic,5.0,6.0,ENSMUSE00000997159,1.0,,,


Aggregate the feature lengths to find a transcript length for each gene

In [8]:
df_gtf_transcript_len = pd.DataFrame(df_gtf.groupby(['gene_id']).sum()['feature_len'])
df_gtf_transcript_len

Unnamed: 0_level_0,feature_len
gene_id,Unnamed: 1_level_1
ENSMUSG00000000001,5447
ENSMUSG00000000003,2240
ENSMUSG00000000028,7202
ENSMUSG00000000031,8690
ENSMUSG00000000037,34434
...,...
ENSMUSG00000118636,237
ENSMUSG00000118637,170
ENSMUSG00000118638,2398
ENSMUSG00000118639,146


Check it makes sense

In [9]:
df_gtf_transcript_len[df_gtf_transcript_len.index.isin(['ENSMUSG00000051951'])]

Unnamed: 0_level_0,feature_len
gene_id,Unnamed: 1_level_1
ENSMUSG00000051951,12457


In [10]:
df_gtf[df_gtf.gene_id.isin(['ENSMUSG00000051951'])]

Unnamed: 0.1,Unnamed: 0,seqname,source,feature,start,end,feature_len,score,strand,frame,...,transcript_source,transcript_biotype,tag,transcript_support_level,exon_number,exon_id,exon_version,ccds_id,protein_id,protein_version
8,8,1,havana,exon,3213609,3216344,2735,,-,0,...,havana,processed_transcript,,1.0,1.0,ENSMUSE00000858910,1.0,,,
9,9,1,havana,exon,3205901,3207317,1416,,-,0,...,havana,processed_transcript,,1.0,2.0,ENSMUSE00000866652,1.0,,,
11,11,1,havana,exon,3213439,3215632,2193,,-,0,...,havana,processed_transcript,,1.0,1.0,ENSMUSE00000863980,1.0,,,
12,12,1,havana,exon,3206523,3207317,794,,-,0,...,havana,processed_transcript,,1.0,2.0,ENSMUSE00000867897,1.0,,,
14,14,1,ensembl_havana,exon,3670552,3671498,946,,-,0,...,ensembl_havana,protein_coding,"CCDS,basic",1.0,1.0,ENSMUSE00000485541,3.0,CCDS14803,,
17,17,1,ensembl_havana,exon,3421702,3421901,199,,-,0,...,ensembl_havana,protein_coding,"CCDS,basic",1.0,2.0,ENSMUSE00000449517,3.0,CCDS14803,,
19,19,1,ensembl_havana,exon,3214482,3216968,2486,,-,0,...,ensembl_havana,protein_coding,"CCDS,basic",1.0,3.0,ENSMUSE00000448840,2.0,CCDS14803,,
22,22,1,ensembl_havana,five_prime_utr,3671349,3671498,149,,-,0,...,ensembl_havana,protein_coding,"CCDS,basic",1.0,,,,CCDS14803,,
23,23,1,ensembl_havana,three_prime_utr,3214482,3216021,1539,,-,0,...,ensembl_havana,protein_coding,"CCDS,basic",1.0,,,,CCDS14803,,


In [11]:
del df_gtf

Convert gene lengths to kilobases

In [12]:
df_gtf_transcript_len = df_gtf_transcript_len / 1000
df_gtf_transcript_len

Unnamed: 0_level_0,feature_len
gene_id,Unnamed: 1_level_1
ENSMUSG00000000001,5.447
ENSMUSG00000000003,2.240
ENSMUSG00000000028,7.202
ENSMUSG00000000031,8.690
ENSMUSG00000000037,34.434
...,...
ENSMUSG00000118636,0.237
ENSMUSG00000118637,0.170
ENSMUSG00000118638,2.398
ENSMUSG00000118639,0.146


## Resolve Gene names

Load gene names

In [13]:
df_gene_names = pd.read_csv(gene_tsv_path, sep='\t', header=None)
df_gene_names.columns = ['gene_id', "gene_name"]
df_gene_names.index = df_gene_names.gene_id
df_gene_names = df_gene_names.drop('gene_id', axis=1)
print_rowcol('Loaded gene names', df_gene_names)
df_gene_names

Loaded gene names: 56289 rows x 1 cols


Unnamed: 0_level_0,gene_name
gene_id,Unnamed: 1_level_1
ENSMUSG00000064372,mt-Tp
ENSMUSG00000064371,mt-Tt
ENSMUSG00000064370,mt-Cytb
ENSMUSG00000064369,mt-Te
ENSMUSG00000064368,mt-Nd6
...,...
ENSMUSG00000082803,Gm26460
ENSMUSG00000118095,Gm50435
ENSMUSG00000106792,Hmgb1-ps4
ENSMUSG00000107003,2410152P15Rik


In [14]:
df_gene_name2id = pd.read_csv(gene_tsv_path, sep='\t', header=None)
df_gene_name2id.columns = ['gene_id', "gene_name"]
df_gene_name2id.index = df_gene_name2id.gene_name
df_gene_name2id = df_gene_name2id.drop('gene_name', axis=1)
print_rowcol('Loaded gene names', df_gene_name2id)
df_gene_name2id

Loaded gene names: 56289 rows x 1 cols


Unnamed: 0_level_0,gene_id
gene_name,Unnamed: 1_level_1
mt-Tp,ENSMUSG00000064372
mt-Tt,ENSMUSG00000064371
mt-Cytb,ENSMUSG00000064370
mt-Te,ENSMUSG00000064369
mt-Nd6,ENSMUSG00000064368
...,...
Gm26460,ENSMUSG00000082803
Gm50435,ENSMUSG00000118095
Hmgb1-ps4,ENSMUSG00000106792
2410152P15Rik,ENSMUSG00000107003


## Load in RNA-Seq data

### Yang

Read the yang1 data set and look at the data

In [15]:
df_tpm_1 = pd.read_csv(tpm_yang_path, sep='\t')
print_rowcol('Loaded Yang1', df_tpm_1)
df_tpm_1

Loaded Yang1: 46609 rows x 385 cols


Unnamed: 0,Gene_id,P1-3-A1,P1-3-A10,P1-3-A11,P1-3-A12,P1-3-A2,P1-3-A3,P1-3-A4,P1-3-A5,P1-3-A6,...,P2-1-H11,P2-1-H12,P2-1-H2,P2-1-H3,P2-1-H4,P2-1-H5,P2-1-H6,P2-1-H7,P2-1-H8,P2-1-H9
0,ENSMUSG00000000001_Gnai3,0.0,17.60,7.02,17.50,0.00,38.31,38.42,47.10,13.19,...,45.27,5.70,13.80,15.89,19.42,3.21,5.65,20.33,16.72,33.57
1,ENSMUSG00000000003_Pbsn,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,ENSMUSG00000000028_Cdc45,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,8.07,0.00,0.00,0.00,2.55,0.00
3,ENSMUSG00000000031_H19,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,112.13,...,0.00,6.80,0.00,4.14,0.00,0.00,0.00,0.00,0.00,0.00
4,ENSMUSG00000000037_Scml2,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.47,0.00,0.00,0.00,0.00,0.00,0.40,0.00,1.32,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46604,ERCC-00164,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46605,ERCC-00165,0.0,0.00,42.44,3.28,0.00,0.00,9.45,9.12,0.00,...,5.41,1.76,0.00,0.00,13.15,40.04,9.40,0.00,18.05,0.00
46606,ERCC-00168,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46607,ERCC-00170,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,2.11,0.00,0.00,0.00,0.00,0.00,0.00


Clean the yang1 dataset to get the gene names, call `.shape` to check we havent filtered anything

In [16]:
# Split gene ids on _ and load into a new data frame and set the columns
split_data = pd.DataFrame(df_tpm_1.Gene_id.str.split("_", expand=True))
split_data.columns = ["Gene_id", "Gene_name", "Gene_name2"]

# Fill in the NA's with blank strings
split_data["Gene_name"] = split_data.Gene_name.fillna('')
split_data["Gene_name2"] = split_data.Gene_name2.fillna('')

# Concatinate the strings that have split more than once back to their standard for e.g GENEID_GENENAME_SOMEMORENAME
split_data["Gene_name"] = split_data.apply(lambda x: x.Gene_name if x.Gene_name2 == '' else x.Gene_name + '_' + x.Gene_name2, axis=1)
print_rowcol('Filter check', split_data)

Filter check: 46609 rows x 3 cols


Write the gene names back into the main dataset, print out dataset to check we do indeed have gene names where they were available

In [17]:
# Insert the columns back into the main data array
df_tpm_1["Gene_id"] = split_data.Gene_id
df_tpm_1.insert(1,"Gene_name", split_data.Gene_name)
df_tpm_1

Unnamed: 0,Gene_id,Gene_name,P1-3-A1,P1-3-A10,P1-3-A11,P1-3-A12,P1-3-A2,P1-3-A3,P1-3-A4,P1-3-A5,...,P2-1-H11,P2-1-H12,P2-1-H2,P2-1-H3,P2-1-H4,P2-1-H5,P2-1-H6,P2-1-H7,P2-1-H8,P2-1-H9
0,ENSMUSG00000000001,Gnai3,0.0,17.60,7.02,17.50,0.00,38.31,38.42,47.10,...,45.27,5.70,13.80,15.89,19.42,3.21,5.65,20.33,16.72,33.57
1,ENSMUSG00000000003,Pbsn,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,ENSMUSG00000000028,Cdc45,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,8.07,0.00,0.00,0.00,2.55,0.00
3,ENSMUSG00000000031,H19,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,6.80,0.00,4.14,0.00,0.00,0.00,0.00,0.00,0.00
4,ENSMUSG00000000037,Scml2,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.47,0.00,0.00,0.00,0.00,0.00,0.40,0.00,1.32,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46604,ERCC-00164,,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46605,ERCC-00165,,0.0,0.00,42.44,3.28,0.00,0.00,9.45,9.12,...,5.41,1.76,0.00,0.00,13.15,40.04,9.40,0.00,18.05,0.00
46606,ERCC-00168,,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46607,ERCC-00170,,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,2.11,0.00,0.00,0.00,0.00,0.00,0.00


Set gene name to index after checking it is unique

In [18]:
df_tpm_1.Gene_id.is_unique

True

In [19]:
df_tpm_1.index = df_tpm_1['Gene_id']
df_tpm_1 = df_tpm_1.drop('Gene_id', axis=1)
df_tpm_1 = df_tpm_1.drop('Gene_name', axis=1)
df_tpm_1.head()

Unnamed: 0_level_0,P1-3-A1,P1-3-A10,P1-3-A11,P1-3-A12,P1-3-A2,P1-3-A3,P1-3-A4,P1-3-A5,P1-3-A6,P1-3-A7,...,P2-1-H11,P2-1-H12,P2-1-H2,P2-1-H3,P2-1-H4,P2-1-H5,P2-1-H6,P2-1-H7,P2-1-H8,P2-1-H9
Gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000000001,0.0,17.6,7.02,17.5,0.0,38.31,38.42,47.1,13.19,9.66,...,45.27,5.7,13.8,15.89,19.42,3.21,5.65,20.33,16.72,33.57
ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,8.07,0.0,0.0,0.0,2.55,0.0
ENSMUSG00000000031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,112.13,0.0,...,0.0,6.8,0.0,4.14,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.47,0.0,0.0,0.0,0.0,0.0,0.4,0.0,1.32,0.0


Load in Yang2 

In [20]:
df_tpm_2 = pd.read_csv(tpm_yang_path2, sep='\t')
print_rowcol('Loaded Yang2', df_tpm_2)
df_tpm_2

Loaded Yang2: 46609 rows x 399 cols


Unnamed: 0,Gene_id,819b_A1,819b_A10,819b_A11,819b_A2,819b_A3,819b_A4,819b_A5,819b_A6,819b_A7,...,920mat_G7,920mat_G8,920mat_G9,920mat_H11,920mat_H3,920mat_H4,920mat_H5,920mat_H6,920mat_H7,920mat_H8
0,ENSMUSG00000000001_Gnai3,51.73,26.72,29.77,68.41,148.98,55.76,0.61,44.96,29.86,...,20.73,12.29,9.18,14.41,10.34,2.98,0.67,28.88,23.65,21.83
1,ENSMUSG00000000003_Pbsn,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,ENSMUSG00000000028_Cdc45,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,10.69,0.00,0.00,0.00,0.00,9.33,0.00,0.00,1.41,0.00
3,ENSMUSG00000000031_H19,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,ENSMUSG00000000037_Scml2,13.75,0.00,0.00,4.76,0.00,0.00,6.73,0.00,6.97,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46604,ERCC-00164,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46605,ERCC-00165,131.21,90.43,13.88,58.42,86.68,75.96,76.02,380.24,9.93,...,72.18,82.65,3.33,106.09,0.00,60.24,0.00,0.00,75.92,0.00
46606,ERCC-00168,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46607,ERCC-00170,65.63,0.00,0.00,222.63,222.27,0.00,0.00,65.14,33.02,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [21]:
# Split gene ids on _ and load into a new data frame and set the columns
split_data = pd.DataFrame(df_tpm_2.Gene_id.str.split("_", expand=True))
split_data.columns = ["Gene_id", "Gene_name", "Gene_name2"]

# Fill in the NA's with blank strings
split_data["Gene_name"] = split_data.Gene_name.fillna('')
split_data["Gene_name2"] = split_data.Gene_name2.fillna('')

# Concatinate the strings that have split more than once back to their standard for e.g GENEID_GENENAME_SOMEMORENAME
split_data["Gene_name"] = split_data.apply(lambda x: x.Gene_name if x.Gene_name2 == '' else x.Gene_name + '_' + x.Gene_name2, axis=1)
print_rowcol('Filter check', split_data)

Filter check: 46609 rows x 3 cols


In [22]:
# Insert the columns back into the main data array
df_tpm_2["Gene_id"] = split_data.Gene_id
df_tpm_2.insert(1,"Gene_name", split_data.Gene_name)
df_tpm_2

Unnamed: 0,Gene_id,Gene_name,819b_A1,819b_A10,819b_A11,819b_A2,819b_A3,819b_A4,819b_A5,819b_A6,...,920mat_G7,920mat_G8,920mat_G9,920mat_H11,920mat_H3,920mat_H4,920mat_H5,920mat_H6,920mat_H7,920mat_H8
0,ENSMUSG00000000001,Gnai3,51.73,26.72,29.77,68.41,148.98,55.76,0.61,44.96,...,20.73,12.29,9.18,14.41,10.34,2.98,0.67,28.88,23.65,21.83
1,ENSMUSG00000000003,Pbsn,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,ENSMUSG00000000028,Cdc45,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,10.69,0.00,0.00,0.00,0.00,9.33,0.00,0.00,1.41,0.00
3,ENSMUSG00000000031,H19,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,ENSMUSG00000000037,Scml2,13.75,0.00,0.00,4.76,0.00,0.00,6.73,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46604,ERCC-00164,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46605,ERCC-00165,,131.21,90.43,13.88,58.42,86.68,75.96,76.02,380.24,...,72.18,82.65,3.33,106.09,0.00,60.24,0.00,0.00,75.92,0.00
46606,ERCC-00168,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46607,ERCC-00170,,65.63,0.00,0.00,222.63,222.27,0.00,0.00,65.14,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


Set gene name to index after checking it is unique

In [23]:
df_tpm_2.Gene_id.is_unique

True

In [24]:
df_tpm_2.index = df_tpm_2['Gene_id']
df_tpm_2 = df_tpm_2.drop('Gene_id', axis=1)
df_tpm_2 = df_tpm_2.drop('Gene_name', axis=1)
df_tpm_2.head()

Unnamed: 0_level_0,819b_A1,819b_A10,819b_A11,819b_A2,819b_A3,819b_A4,819b_A5,819b_A6,819b_A7,819b_A8,...,920mat_G7,920mat_G8,920mat_G9,920mat_H11,920mat_H3,920mat_H4,920mat_H5,920mat_H6,920mat_H7,920mat_H8
Gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000000001,51.73,26.72,29.77,68.41,148.98,55.76,0.61,44.96,29.86,71.02,...,20.73,12.29,9.18,14.41,10.34,2.98,0.67,28.88,23.65,21.83
ENSMUSG00000000003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000028,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.69,0.0,0.0,0.0,0.0,9.33,0.0,0.0,1.41,0.0
ENSMUSG00000000031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ENSMUSG00000000037,13.75,0.0,0.0,4.76,0.0,0.0,6.73,0.0,6.97,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Joost / Kasper

Load in data

In [25]:
df_rpk = pd.read_csv(tpm_joost_path, sep='\t')
print_rowcol('Loaded Kasper', df_rpk)
df_rpk

Loaded Kasper: 26024 rows x 1423 cols


Unnamed: 0,Gene\Cell,1772067055_A01,1772067055_A03,1772067055_A04,1772067055_A05,1772067055_A06,1772067055_A07,1772067055_A09,1772067055_B01,1772067055_B02,...,1772072285_C12,1772072285_D01,1772072285_D03,1772072285_D05,1772072285_D06,1772072285_D09,1772072285_D12,1772072285_E02,1772072285_E06,1772072285_G06
0,ERCC-00002,363,330,267,262,334,328,280,318,297,...,945,850,696,544,701,817,601,687,589,229
1,ERCC-00003,20,17,14,16,20,26,14,8,25,...,82,79,56,36,54,92,30,64,47,32
2,ERCC-00004,276,258,215,190,244,259,209,235,237,...,1000,925,755,667,745,1051,609,749,534,287
3,ERCC-00009,25,14,14,7,20,31,20,26,24,...,56,49,42,27,33,49,30,44,27,13
4,ERCC-00012,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26019,r_MurSAT1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
26020,r_HY5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26021,r_tRNA-Arg-CGA_,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26022,r_U14,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Convert from FPKM (Fragments Per Kilobase Million) to TPM (Transcripts Per Kilobase Million)

Rename gene/cell column

In [26]:
df_rpk.rename( columns={'Gene\Cell':'gene_name'}, inplace=True)
df_rpk.head()

Unnamed: 0,gene_name,1772067055_A01,1772067055_A03,1772067055_A04,1772067055_A05,1772067055_A06,1772067055_A07,1772067055_A09,1772067055_B01,1772067055_B02,...,1772072285_C12,1772072285_D01,1772072285_D03,1772072285_D05,1772072285_D06,1772072285_D09,1772072285_D12,1772072285_E02,1772072285_E06,1772072285_G06
0,ERCC-00002,363,330,267,262,334,328,280,318,297,...,945,850,696,544,701,817,601,687,589,229
1,ERCC-00003,20,17,14,16,20,26,14,8,25,...,82,79,56,36,54,92,30,64,47,32
2,ERCC-00004,276,258,215,190,244,259,209,235,237,...,1000,925,755,667,745,1051,609,749,534,287
3,ERCC-00009,25,14,14,7,20,31,20,26,24,...,56,49,42,27,33,49,30,44,27,13
4,ERCC-00012,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Set gene to index after checking unique

In [27]:
df_rpk.gene_name.is_unique

True

In [28]:
df_rpk.index = df_rpk['gene_name']
df_rpk.head()

Unnamed: 0_level_0,gene_name,1772067055_A01,1772067055_A03,1772067055_A04,1772067055_A05,1772067055_A06,1772067055_A07,1772067055_A09,1772067055_B01,1772067055_B02,...,1772072285_C12,1772072285_D01,1772072285_D03,1772072285_D05,1772072285_D06,1772072285_D09,1772072285_D12,1772072285_E02,1772072285_E06,1772072285_G06
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ERCC-00002,ERCC-00002,363,330,267,262,334,328,280,318,297,...,945,850,696,544,701,817,601,687,589,229
ERCC-00003,ERCC-00003,20,17,14,16,20,26,14,8,25,...,82,79,56,36,54,92,30,64,47,32
ERCC-00004,ERCC-00004,276,258,215,190,244,259,209,235,237,...,1000,925,755,667,745,1051,609,749,534,287
ERCC-00009,ERCC-00009,25,14,14,7,20,31,20,26,24,...,56,49,42,27,33,49,30,44,27,13
ERCC-00012,ERCC-00012,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Insert gene_id into data

In [29]:
df_rpk_merged = df_rpk
df_rpk_merged = df_rpk_merged.drop('gene_name', axis=1)
df_rpk_merged = df_gene_name2id.join(df_rpk_merged, lsuffix='', rsuffix='', how='inner')
df_rpk_merged.index = df_rpk_merged.gene_id
df_rpk_merged = df_rpk_merged.drop('gene_id', axis=1)
print_rowcol('Shape after merging gene ids', df_rpk_merged)
df_rpk_merged

Shape after merging gene ids: 21827 rows x 1422 cols


Unnamed: 0_level_0,1772067055_A01,1772067055_A03,1772067055_A04,1772067055_A05,1772067055_A06,1772067055_A07,1772067055_A09,1772067055_B01,1772067055_B02,1772067055_B03,...,1772072285_C12,1772072285_D01,1772072285_D03,1772072285_D05,1772072285_D06,1772072285_D09,1772072285_D12,1772072285_E02,1772072285_E06,1772072285_G06
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000109644,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000007777,1,1,0,0,0,0,3,0,0,0,...,0,0,0,0,1,0,0,0,0,2
ENSMUSG00000043644,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000042208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000020831,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000064365,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000064371,0,0,0,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
ENSMUSG00000064338,0,0,2,0,1,2,1,0,3,0,...,3,3,0,1,1,5,0,3,1,0
ENSMUSG00000064346,0,2,2,0,0,1,0,0,0,2,...,3,1,0,3,0,0,1,0,2,0


Get length of each gene into data frame

In [30]:
df_rpk_merged_len = df_gtf_transcript_len.join(df_rpk_merged, lsuffix='', rsuffix='', how='inner')
print_rowcol('Shape after merging gene feature_len', df_rpk_merged_len)
df_rpk_merged_len

Shape after merging gene feature_len: 21522 rows x 1423 cols


Unnamed: 0_level_0,feature_len,1772067055_A01,1772067055_A03,1772067055_A04,1772067055_A05,1772067055_A06,1772067055_A07,1772067055_A09,1772067055_B01,1772067055_B02,...,1772072285_C12,1772072285_D01,1772072285_D03,1772072285_D05,1772072285_D06,1772072285_D09,1772072285_D12,1772072285_E02,1772072285_E06,1772072285_G06
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000000001,5.447,1,0,1,0,0,0,1,0,0,...,4,1,3,0,0,1,0,2,0,0
ENSMUSG00000000003,2.240,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000028,7.202,0,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000031,8.690,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000000037,34.434,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000118401,2.185,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000118406,0.396,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSMUSG00000118491,4.836,0,0,0,2,0,2,0,0,0,...,2,2,3,0,0,1,0,1,0,1
ENSMUSG00000118506,6.805,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Sum the read counts per sample

In [31]:
df_scaling_factor = pd.DataFrame(df_rpk_merged_len.sum(axis=0) / 1000000)
df_scaling_factor.columns = ['scaling_factor']
df_scaling_factor = df_scaling_factor.drop('feature_len')
df_scaling_factor

Unnamed: 0,scaling_factor
1772067055_A01,0.004375
1772067055_A03,0.004793
1772067055_A04,0.005413
1772067055_A05,0.002196
1772067055_A06,0.003068
...,...
1772072285_D09,0.007320
1772072285_D12,0.003783
1772072285_E02,0.011444
1772072285_E06,0.003907


Divide the read counts by the length of each gene in kilobases. This gives you reads per kilobase (RPK)

In [32]:
df_rpk_merged_len = df_rpk_merged_len.iloc[:,1:].div(df_rpk_merged_len.feature_len, axis=0)
df_rpk_merged_len

Unnamed: 0_level_0,1772067055_A01,1772067055_A03,1772067055_A04,1772067055_A05,1772067055_A06,1772067055_A07,1772067055_A09,1772067055_B01,1772067055_B02,1772067055_B03,...,1772072285_C12,1772072285_D01,1772072285_D03,1772072285_D05,1772072285_D06,1772072285_D09,1772072285_D12,1772072285_E02,1772072285_E06,1772072285_G06
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000000001,0.183587,0.0,0.183587,0.000000,0.0,0.000000,0.183587,0.000000,0.0,0.0,...,0.734349,0.183587,0.550762,0.0,0.0,0.183587,0.0,0.367175,0.0,0.000000
ENSMUSG00000000003,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
ENSMUSG00000000028,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.416551,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
ENSMUSG00000000031,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
ENSMUSG00000000037,0.000000,0.0,0.000000,0.029041,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000118401,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
ENSMUSG00000118406,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
ENSMUSG00000118491,0.000000,0.0,0.000000,0.413565,0.0,0.413565,0.000000,0.000000,0.0,0.0,...,0.413565,0.413565,0.620347,0.0,0.0,0.206782,0.0,0.206782,0.0,0.206782
ENSMUSG00000118506,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000


Divide the RPK values by the “per million” scaling factor. This gives you TPM.

In [33]:
df_tpm_3 = df_rpk_merged_len.div(df_scaling_factor.scaling_factor, axis=1)
df_tpm_3

Unnamed: 0_level_0,1772067055_A01,1772067055_A03,1772067055_A04,1772067055_A05,1772067055_A06,1772067055_A07,1772067055_A09,1772067055_B01,1772067055_B02,1772067055_B03,...,1772072285_C12,1772072285_D01,1772072285_D03,1772072285_D05,1772072285_D06,1772072285_D09,1772072285_D12,1772072285_E02,1772072285_E06,1772072285_G06
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000000001,41.96281,0.0,33.915998,0.000000,0.0,0.000000,21.552864,0.000000,0.0,0.0,...,80.099169,31.137601,127.520696,0.0,0.0,25.080232,0.0,32.084463,0.0,0.000000
ENSMUSG00000000003,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
ENSMUSG00000000028,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,93.649046,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
ENSMUSG00000000031,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
ENSMUSG00000000037,0.00000,0.0,0.000000,13.224528,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000118401,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
ENSMUSG00000118406,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000
ENSMUSG00000118491,0.00000,0.0,0.000000,188.326471,0.0,34.369229,0.000000,0.000000,0.0,0.0,...,45.109613,70.143306,143.632182,0.0,0.0,28.248971,0.0,18.069072,0.0,80.648387
ENSMUSG00000118506,0.00000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000


## Ghahramani

Load in dataset

In [34]:
df_tpm_4 = pd.read_csv(tpm_ghahramani_path, sep=',')
print_rowcol('Loaded Ghahramani', df_tpm_4)
df_tpm_4

Loaded Ghahramani: 14145 rows x 295 cols


Unnamed: 0.1,Unnamed: 0,AG1neg_1,AG2pos_2,AG1neg_3,AG2pos_4,AG1neg_5,AG2pos_6,AG1neg_7,AG2pos_8,AG1neg_9,...,GHA501A86,GHA501A88,GHA501A89,GHA501A90,GHA501A93,GHA501A94,GHA501A95,GHA501A98,GHA501A99,GHA501A9
0,Gnai3,122.014549,7.484987,597.717004,643.657745,0.000000,331.238736,117.133604,468.887523,243.073894,...,61.411894,230.333860,98.040207,93.716396,673.177033,50.220174,303.137672,85.992001,14.915042,4.671418
1,Cdc45,0.000000,0.000000,0.000000,5.560498,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.958870,2.201127,1.425608,222.350975,10.621630,433.142296,1.590745,540.554230,2.841134,0.000000
2,Apoh,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,2.569181,0.000000,0.000000,174.856508
3,Narf,1.475207,7.818909,111.868532,117.207838,8.548119,34.838241,49.003077,50.385245,107.228871,...,7.230203,2.514732,1.954466,2.185207,8.604779,3.643098,219.721960,0.000000,0.000000,1.016629
4,Cav2,160.239035,278.810852,112.274304,41.820736,0.000000,28.127109,108.345577,65.333237,4.705999,...,2.163511,0.000000,40.207721,1.198789,241.109633,0.000000,0.598203,221.756227,59.831210,7.808018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14140,Exosc6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,37.295445,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
14141,Pde2a,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
14142,Flt3l,0.000000,0.609519,0.000000,0.000000,0.000000,0.000000,18.984288,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.020048,0.000000,0.000000,0.000000
14143,Rnf223,0.000000,0.000000,0.000000,11.030893,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Rename gene name column

In [35]:
df_tpm_4.rename( columns={'Unnamed: 0':'gene_name'}, inplace=True)
df_tpm_4.head()

Unnamed: 0,gene_name,AG1neg_1,AG2pos_2,AG1neg_3,AG2pos_4,AG1neg_5,AG2pos_6,AG1neg_7,AG2pos_8,AG1neg_9,...,GHA501A86,GHA501A88,GHA501A89,GHA501A90,GHA501A93,GHA501A94,GHA501A95,GHA501A98,GHA501A99,GHA501A9
0,Gnai3,122.014549,7.484987,597.717004,643.657745,0.0,331.238736,117.133604,468.887523,243.073894,...,61.411894,230.33386,98.040207,93.716396,673.177033,50.220174,303.137672,85.992001,14.915042,4.671418
1,Cdc45,0.0,0.0,0.0,5.560498,0.0,0.0,0.0,0.0,0.0,...,0.95887,2.201127,1.425608,222.350975,10.62163,433.142296,1.590745,540.55423,2.841134,0.0
2,Apoh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.569181,0.0,0.0,174.856508
3,Narf,1.475207,7.818909,111.868532,117.207838,8.548119,34.838241,49.003077,50.385245,107.228871,...,7.230203,2.514732,1.954466,2.185207,8.604779,3.643098,219.72196,0.0,0.0,1.016629
4,Cav2,160.239035,278.810852,112.274304,41.820736,0.0,28.127109,108.345577,65.333237,4.705999,...,2.163511,0.0,40.207721,1.198789,241.109633,0.0,0.598203,221.756227,59.83121,7.808018


Set gene to index after checking unique

In [36]:
df_tpm_4.gene_name.is_unique

True

In [37]:
df_tpm_4.index = df_tpm_4['gene_name']
df_tpm_4.head()

Unnamed: 0_level_0,gene_name,AG1neg_1,AG2pos_2,AG1neg_3,AG2pos_4,AG1neg_5,AG2pos_6,AG1neg_7,AG2pos_8,AG1neg_9,...,GHA501A86,GHA501A88,GHA501A89,GHA501A90,GHA501A93,GHA501A94,GHA501A95,GHA501A98,GHA501A99,GHA501A9
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gnai3,Gnai3,122.014549,7.484987,597.717004,643.657745,0.0,331.238736,117.133604,468.887523,243.073894,...,61.411894,230.33386,98.040207,93.716396,673.177033,50.220174,303.137672,85.992001,14.915042,4.671418
Cdc45,Cdc45,0.0,0.0,0.0,5.560498,0.0,0.0,0.0,0.0,0.0,...,0.95887,2.201127,1.425608,222.350975,10.62163,433.142296,1.590745,540.55423,2.841134,0.0
Apoh,Apoh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.569181,0.0,0.0,174.856508
Narf,Narf,1.475207,7.818909,111.868532,117.207838,8.548119,34.838241,49.003077,50.385245,107.228871,...,7.230203,2.514732,1.954466,2.185207,8.604779,3.643098,219.72196,0.0,0.0,1.016629
Cav2,Cav2,160.239035,278.810852,112.274304,41.820736,0.0,28.127109,108.345577,65.333237,4.705999,...,2.163511,0.0,40.207721,1.198789,241.109633,0.0,0.598203,221.756227,59.83121,7.808018


Insert gene into data

In [38]:
df_tpm_4 = df_tpm_4.drop('gene_name', axis=1)
df_tpm_4 = df_gene_name2id.join(df_tpm_4, lsuffix='', rsuffix='', how='inner')
df_tpm_4.index = df_tpm_4.gene_id
df_tpm_4 = df_tpm_4.drop('gene_id', axis=1)
print_rowcol('Shape after merging gene ids', df_tpm_4)
df_tpm_4.head()

Shape after merging gene ids: 14081 rows x 294 cols


Unnamed: 0_level_0,AG1neg_1,AG2pos_2,AG1neg_3,AG2pos_4,AG1neg_5,AG2pos_6,AG1neg_7,AG2pos_8,AG1neg_9,AG2pos_10,...,GHA501A86,GHA501A88,GHA501A89,GHA501A90,GHA501A93,GHA501A94,GHA501A95,GHA501A98,GHA501A99,GHA501A9
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000007777,428.888448,127.707832,25.495415,84.045924,27.923632,2.147246,235.262386,154.615303,360.786526,235.518628,...,137.416574,351.590275,100.556455,249.840059,261.627303,309.417973,0.0,427.928611,25.447842,0.0
ENSMUSG00000042208,0.0,2.373457,0.0,17.226978,0.0,0.0,4.808099,20.186837,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,120.909022,0.0,0.0,1.576499,0.0
ENSMUSG00000058706,3.12615,28.478409,5.513106,7.067672,9.057271,42.485137,24.780884,0.0,5.112709,10.485246,...,0.0,10.391618,2.32974,0.289421,1.051997,36.670875,4.91037,0.0,0.0,0.538591
ENSMUSG00000108680,3.12615,28.478409,5.513106,7.067672,9.057271,42.485137,24.780884,0.0,5.112709,10.485246,...,0.0,10.391618,2.32974,0.289421,1.051997,36.670875,4.91037,0.0,0.0,0.538591
ENSMUSG00000060512,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Create merged dataset

Create merged dataset from all subsets

In [39]:
df_tpm_combined = df_tpm_1.join(df_tpm_2, lsuffix='', rsuffix='_other', how='inner')
df_tpm_combined.shape

(46609, 782)

In [40]:
df_tpm_combined = df_tpm_combined.join(df_tpm_3, lsuffix='', rsuffix='_other', how='inner')
df_tpm_combined.shape

(21016, 2204)

In [41]:
df_tpm_combined = df_tpm_combined.join(df_tpm_4, lsuffix='', rsuffix='_other', how='inner')
df_tpm_combined.shape

(13227, 2498)

Create `log2(TPM+1)` dataset

In [42]:
df_ltpm_combined = np.log2(df_tpm_combined + 1)
df_ltpm_combined

Unnamed: 0,P1-3-A1,P1-3-A10,P1-3-A11,P1-3-A12,P1-3-A2,P1-3-A3,P1-3-A4,P1-3-A5,P1-3-A6,P1-3-A7,...,GHA501A86,GHA501A88,GHA501A89,GHA501A90,GHA501A93,GHA501A94,GHA501A95,GHA501A98,GHA501A99,GHA501A9
ENSMUSG00000000001,0.0,4.217231,3.003602,4.209453,0.000000,5.296824,5.300856,5.587965,3.826803,3.414136,...,5.963749,7.853833,6.629942,6.565542,9.396984,5.678640,8.248581,6.442811,3.992319,2.503710
ENSMUSG00000000028,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.970022,1.678580,1.278347,7.803169,3.538741,8.762024,1.373367,9.080962,1.941532,0.000000
ENSMUSG00000000049,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.835593,0.000000,0.000000,7.458255
ENSMUSG00000000056,0.0,3.109361,5.938286,5.093391,0.000000,0.000000,2.693766,4.627607,6.537141,3.842979,...,3.040928,1.813415,1.562897,1.671387,3.263752,2.215088,7.786086,0.000000,0.000000,1.011946
ENSMUSG00000000058,0.0,1.778209,2.601697,0.000000,7.143842,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.661527,0.000000,5.364843,1.136709,7.919517,0.000000,0.676451,7.799322,5.926740,3.138817
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000106918,0.0,5.926711,0.000000,4.521679,0.000000,0.000000,4.028569,0.000000,0.000000,5.384395,...,3.217381,4.115454,1.121270,3.643718,4.947408,6.036318,1.982202,4.864736,5.896306,4.078610
ENSMUSG00000107283,0.0,4.424922,2.443607,1.584963,0.000000,0.000000,4.503349,3.740928,5.351558,3.264536,...,0.000000,4.267812,4.150363,4.719657,6.472461,5.712798,1.906817,5.473054,6.433896,5.444845
ENSMUSG00000107417,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSMUSG00000107499,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Filter for genes and cells which match filtering criteria. Call `.shape` to check what was filtered

In [43]:
df_expression_filt_mask = df_ltpm_combined > min_ltpm_exp
df_ltpm_combined_genefilt = df_ltpm_combined[df_expression_filt_mask.sum(axis=1) > min_num_cells_for_gene_exp]
df_ltpm_combined_genefilt.shape

#df_expression_filt_mask = df_ltpm_combined > 0
#df_ltpm_combined_genefilt = df_ltpm_combined[df_expression_filt_mask.sum(axis=1) > 1000]
#df_ltpm_combined_genefilt.shape

(6807, 2498)

In [44]:
df_ltpm_combined_genecellfilt = df_ltpm_combined_genefilt \
    .T[(df_ltpm_combined_genefilt > min_ltpm_exp).sum(axis=0) > min_num_genes_in_cell_exp]
df_ltpm_combined_genecellfilt = df_ltpm_combined_genecellfilt.T
df_ltpm_combined_genecellfilt.shape

#df_ltpm_combined_genecellfilt = df_ltpm_combined_genefilt \
#    .T[(df_ltpm_combined_genefilt > 1).sum(axis=0) > 500]
#df_ltpm_combined_genecellfilt = df_ltpm_combined_genecellfilt.T
#df_ltpm_combined_genecellfilt.shape

(6807, 2298)

## Normalise data

Check max values

In [45]:
data_max = df_ltpm_combined_genecellfilt.max()
data_max = data_max.max()
print(data_max)

18.320341299017063


Normalise data

In [46]:
np_data = df_ltpm_combined_genecellfilt.T.values
scaler = MinMaxScaler()
print(scaler.fit(np_data))

# Check which dimension we are fitting to - if we are fitting to gene expression then should be equal to number of genes
print(scaler.data_max_.shape)

MinMaxScaler(copy=True, feature_range=(0, 1))
(6807,)


In [47]:
np_data_norm = np.transpose(scaler.transform(np_data))
np_data_norm.shape

(6807, 2298)

In [48]:
df_ltpm_combined_norm = pd.DataFrame(np_data_norm)
df_ltpm_combined_norm.columns = df_ltpm_combined_genecellfilt.columns
df_ltpm_combined_norm.index = df_ltpm_combined_genecellfilt.index
df_ltpm_combined_norm

Unnamed: 0,P1-3-A10,P1-3-A11,P1-3-A12,P1-3-A2,P1-3-A3,P1-3-A4,P1-3-A5,P1-3-A6,P1-3-A7,P1-3-A8,...,GHA501A86,GHA501A88,GHA501A89,GHA501A90,GHA501A93,GHA501A94,GHA501A95,GHA501A98,GHA501A99,GHA501A9
ENSMUSG00000000001,0.405649,0.288912,0.404901,0.000000,0.509494,0.509881,0.537498,0.368094,0.328401,0.470171,...,0.573644,0.755448,0.637724,0.631530,0.903882,0.546220,0.793419,0.619724,0.384015,0.240828
ENSMUSG00000000056,0.354969,0.677923,0.581469,0.000000,0.000000,0.307524,0.528294,0.746289,0.438720,0.318122,...,0.347157,0.207022,0.178423,0.190808,0.372595,0.252878,0.888871,0.000000,0.000000,0.115525
ENSMUSG00000000058,0.184017,0.269236,0.000000,0.739278,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.171943,0.000000,0.555179,0.117632,0.819548,0.000000,0.070002,0.807110,0.613327,0.324819
ENSMUSG00000000078,0.000000,0.000000,0.265407,0.000000,0.000000,0.049705,0.280146,0.000000,0.130612,0.076877,...,0.569237,0.538678,0.584073,0.598017,0.254595,0.694776,0.500197,0.189335,0.787864,0.565605
ENSMUSG00000000085,0.000000,0.557392,0.442670,0.000000,0.000000,0.000000,0.513107,0.662761,0.228048,0.000000,...,0.083766,0.501817,0.286331,0.000000,0.000000,0.000000,0.000000,0.000000,0.367558,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000101892,0.500886,0.000000,0.315071,0.000000,0.000000,0.427417,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.498924,0.148768,0.549690,0.506464,0.668826,0.623066,0.000000,0.817074,0.344403
ENSMUSG00000102976,0.426120,0.260469,0.425397,0.490006,0.504198,0.362395,0.373281,0.483482,0.383271,0.485597,...,0.490887,0.421085,0.400355,0.352684,0.356345,0.624218,0.459310,0.362824,0.124659,0.537819
ENSMUSG00000106864,0.457133,0.329494,0.197375,0.000000,0.000000,0.656869,0.320657,0.538464,0.800126,0.241875,...,0.378541,0.027108,0.325596,0.168844,0.110277,0.461184,0.112672,0.424511,0.222422,0.281608
ENSMUSG00000106918,0.736540,0.000000,0.561930,0.000000,0.000000,0.500649,0.000000,0.000000,0.669144,0.599597,...,0.399839,0.511447,0.139346,0.452822,0.614838,0.750162,0.246338,0.604564,0.732762,0.506868


Check new max

In [49]:
data_max = df_ltpm_combined_norm.max()
data_max = data_max.max()
print(data_max)

1.0


## Split train and test data sets

In [50]:
# Randomly select test and training data
train_features, test_features = train_test_split(df_ltpm_combined_norm.T, test_size=test_data_size)
train_features = train_features.T
test_features = test_features.T

print_rowcol('Created training dataset', train_features)
print_rowcol('Created test dataset', test_features)

Created training dataset: 6807 rows x 1798 cols
Created test dataset: 6807 rows x 500 cols


## Write data to file

Get the column and row names as a list

In [51]:
train_df_column_names = pd.DataFrame(list(train_features.columns.values))
train_df_row_names = pd.DataFrame(list(train_features.index.values))
test_df_column_names = pd.DataFrame(list(test_features.columns.values))
test_df_row_names = pd.DataFrame(list(test_features.index.values))

print(train_df_column_names.shape)
print(train_df_row_names.shape)
print(test_df_column_names.shape)
print(test_df_row_names.shape)

(1798, 1)
(6807, 1)
(500, 1)
(6807, 1)


Write the data to file

In [52]:
train_features.to_csv(data_path + '/tpm_combined.csv', index=False, header=False)
train_df_column_names.to_csv(data_path + '/tpm_combined_cols.csv', index=False, header=False)
train_df_row_names.to_csv(data_path + '/tpm_combined_rows.csv', index=False, header=False)

test_features.to_csv(data_path + '/tpm_combined_test.csv', index=False, header=False)
test_df_column_names.to_csv(data_path + '/tpm_combined_cols_test.csv', index=False, header=False)
test_df_row_names.to_csv(data_path + '/tpm_combined_rows_test.csv', index=False, header=False)