# Data pre-processing

## Setup -CHECK SPELLING MISTAKES

In [1]:
# Install a pip package in the current Jupyter kernel - REMOVE IN PROD
#import sys
#!{sys.executable} -m pip install gtfparse

Import relevant libraries

In [2]:
# Import libs
import os.path
from os import path
import gzip
import pandas as pd
import numpy as np
import urllib.request
from gtfparse import read_gtf
from biomart import BiomartServer

Setup paths - main data path is folder inside the code repository where the data dowloaded from download-data has been stored

In [3]:
data_path = '../data'
gtf_path_gz = data_path + '/Mus_musculus.GRCm38.99.gtf.gz'
gtf_path = data_path + '/Mus_musculus.GRCm38.99.gtf'
tpm_yang_path_gz = data_path + '/GSE90848_Ana6_basal_hair_bulb_TPM.txt.gz'
tpm_yang_path = data_path + '/GSE90848_Ana6_basal_hair_bulb_TPM.txt'
tpm_yang_path2_gz = data_path + '/GSE90848_Tel_Ana1_Ana2_bulge_HG_basal_HB_TPM.txt.gz'
tpm_yang_path2 = data_path + '/GSE90848_Tel_Ana1_Ana2_bulge_HG_basal_HB_TPM.txt'
gene_tsv_path = data_path + '/gene_names.tsv'

Set the main thresholds for the pre-processing
- min_num_genes_in_cell_exp - is the min number of genes that need to have expression > **min_ltpm_exp** for it to be considered a valid cell
- min_num_cells_for_gene_exp - is the min number of cells that need to have an expression for that gene > **min_ltpm_exp** for it to be considered a valid gene

In [4]:
# Thresholds
min_num_genes_in_cell_exp = 1000
min_num_cells_for_gene_exp = 10
min_ltpm_exp = 1

## Resolve Gene names

Load GTF data and show some data

In [5]:
#df_gtf = read_gtf(gtf_path)
#df_gtf

Filter GTF data for exons, 3' and 5' UTRs, call `.shape` to see how many rows we have

In [6]:
#df_gtf_filt = df_gtf[(df_gtf.feature=='exon') | (df_gtf.feature=='three_prime_utr') | (df_gtf.feature=='five_prime_utr')]
#df_gtf_filt.shape

## Load in RNA-Seq data

Read the yang1 data set and look at the data

In [7]:
df_tpm_1 = pd.read_csv(tpm_yang_path, sep='\t')
df_tpm_1

Unnamed: 0,Gene_id,P1-3-A1,P1-3-A10,P1-3-A11,P1-3-A12,P1-3-A2,P1-3-A3,P1-3-A4,P1-3-A5,P1-3-A6,...,P2-1-H11,P2-1-H12,P2-1-H2,P2-1-H3,P2-1-H4,P2-1-H5,P2-1-H6,P2-1-H7,P2-1-H8,P2-1-H9
0,ENSMUSG00000000001_Gnai3,0.0,17.60,7.02,17.50,0.00,38.31,38.42,47.10,13.19,...,45.27,5.70,13.80,15.89,19.42,3.21,5.65,20.33,16.72,33.57
1,ENSMUSG00000000003_Pbsn,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,ENSMUSG00000000028_Cdc45,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,8.07,0.00,0.00,0.00,2.55,0.00
3,ENSMUSG00000000031_H19,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,112.13,...,0.00,6.80,0.00,4.14,0.00,0.00,0.00,0.00,0.00,0.00
4,ENSMUSG00000000037_Scml2,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.47,0.00,0.00,0.00,0.00,0.00,0.40,0.00,1.32,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46604,ERCC-00164,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46605,ERCC-00165,0.0,0.00,42.44,3.28,0.00,0.00,9.45,9.12,0.00,...,5.41,1.76,0.00,0.00,13.15,40.04,9.40,0.00,18.05,0.00
46606,ERCC-00168,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46607,ERCC-00170,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,2.11,0.00,0.00,0.00,0.00,0.00,0.00


Clean the yang1 dataset to get the gene names, call `.shape` to check we havent filtered anything

In [8]:
# Split gene ids on _ and load into a new data frame and set the columns
split_data = pd.DataFrame(df_tpm_1.Gene_id.str.split("_", expand=True))
split_data.columns = ["Gene_id", "Gene_name", "Gene_name2"]

# Fill in the NA's with blank strings
split_data["Gene_name"] = split_data.Gene_name.fillna('')
split_data["Gene_name2"] = split_data.Gene_name2.fillna('')

# Concatinate the strings that have split more than once back to their standard for e.g GENEID_GENENAME_SOMEMORENAME
split_data["Gene_name"] = split_data.apply(lambda x: x.Gene_name if x.Gene_name2 == '' else x.Gene_name + '_' + x.Gene_name2, axis=1)
split_data.shape

(46609, 3)

Write the gene names back into the main dataset, print out dataset to check we do indeed have gene names where they were available

In [9]:
# Insert the columns back into the main data array
df_tpm_1["Gene_id"] = split_data.Gene_id
df_tpm_1.insert(1,"Gene_name", split_data.Gene_name)
df_tpm_1

Unnamed: 0,Gene_id,Gene_name,P1-3-A1,P1-3-A10,P1-3-A11,P1-3-A12,P1-3-A2,P1-3-A3,P1-3-A4,P1-3-A5,...,P2-1-H11,P2-1-H12,P2-1-H2,P2-1-H3,P2-1-H4,P2-1-H5,P2-1-H6,P2-1-H7,P2-1-H8,P2-1-H9
0,ENSMUSG00000000001,Gnai3,0.0,17.60,7.02,17.50,0.00,38.31,38.42,47.10,...,45.27,5.70,13.80,15.89,19.42,3.21,5.65,20.33,16.72,33.57
1,ENSMUSG00000000003,Pbsn,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,ENSMUSG00000000028,Cdc45,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,8.07,0.00,0.00,0.00,2.55,0.00
3,ENSMUSG00000000031,H19,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,6.80,0.00,4.14,0.00,0.00,0.00,0.00,0.00,0.00
4,ENSMUSG00000000037,Scml2,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.47,0.00,0.00,0.00,0.00,0.00,0.40,0.00,1.32,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46604,ERCC-00164,,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46605,ERCC-00165,,0.0,0.00,42.44,3.28,0.00,0.00,9.45,9.12,...,5.41,1.76,0.00,0.00,13.15,40.04,9.40,0.00,18.05,0.00
46606,ERCC-00168,,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46607,ERCC-00170,,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,2.11,0.00,0.00,0.00,0.00,0.00,0.00


In [10]:
df_tpm_1.Gene_id.is_unique

True

In [11]:
df_tpm_1.index = df_tpm_1['Gene_id']
df_tpm_1

Unnamed: 0_level_0,Gene_id,Gene_name,P1-3-A1,P1-3-A10,P1-3-A11,P1-3-A12,P1-3-A2,P1-3-A3,P1-3-A4,P1-3-A5,...,P2-1-H11,P2-1-H12,P2-1-H2,P2-1-H3,P2-1-H4,P2-1-H5,P2-1-H6,P2-1-H7,P2-1-H8,P2-1-H9
Gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000000001,ENSMUSG00000000001,Gnai3,0.0,17.60,7.02,17.50,0.00,38.31,38.42,47.10,...,45.27,5.70,13.80,15.89,19.42,3.21,5.65,20.33,16.72,33.57
ENSMUSG00000000003,ENSMUSG00000000003,Pbsn,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
ENSMUSG00000000028,ENSMUSG00000000028,Cdc45,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,8.07,0.00,0.00,0.00,2.55,0.00
ENSMUSG00000000031,ENSMUSG00000000031,H19,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,6.80,0.00,4.14,0.00,0.00,0.00,0.00,0.00,0.00
ENSMUSG00000000037,ENSMUSG00000000037,Scml2,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,1.47,0.00,0.00,0.00,0.00,0.00,0.40,0.00,1.32,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERCC-00164,ERCC-00164,,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
ERCC-00165,ERCC-00165,,0.0,0.00,42.44,3.28,0.00,0.00,9.45,9.12,...,5.41,1.76,0.00,0.00,13.15,40.04,9.40,0.00,18.05,0.00
ERCC-00168,ERCC-00168,,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
ERCC-00170,ERCC-00170,,0.0,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,2.11,0.00,0.00,0.00,0.00,0.00,0.00


NEXT DATASET

In [12]:
df_tpm_2 = pd.read_csv(tpm_yang_path2, sep='\t')
df_tpm_2

Unnamed: 0,Gene_id,819b_A1,819b_A10,819b_A11,819b_A2,819b_A3,819b_A4,819b_A5,819b_A6,819b_A7,...,920mat_G7,920mat_G8,920mat_G9,920mat_H11,920mat_H3,920mat_H4,920mat_H5,920mat_H6,920mat_H7,920mat_H8
0,ENSMUSG00000000001_Gnai3,51.73,26.72,29.77,68.41,148.98,55.76,0.61,44.96,29.86,...,20.73,12.29,9.18,14.41,10.34,2.98,0.67,28.88,23.65,21.83
1,ENSMUSG00000000003_Pbsn,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,ENSMUSG00000000028_Cdc45,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,10.69,0.00,0.00,0.00,0.00,9.33,0.00,0.00,1.41,0.00
3,ENSMUSG00000000031_H19,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,ENSMUSG00000000037_Scml2,13.75,0.00,0.00,4.76,0.00,0.00,6.73,0.00,6.97,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46604,ERCC-00164,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46605,ERCC-00165,131.21,90.43,13.88,58.42,86.68,75.96,76.02,380.24,9.93,...,72.18,82.65,3.33,106.09,0.00,60.24,0.00,0.00,75.92,0.00
46606,ERCC-00168,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46607,ERCC-00170,65.63,0.00,0.00,222.63,222.27,0.00,0.00,65.14,33.02,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [13]:
# Split gene ids on _ and load into a new data frame and set the columns
split_data = pd.DataFrame(df_tpm_2.Gene_id.str.split("_", expand=True))
split_data.columns = ["Gene_id", "Gene_name", "Gene_name2"]

# Fill in the NA's with blank strings
split_data["Gene_name"] = split_data.Gene_name.fillna('')
split_data["Gene_name2"] = split_data.Gene_name2.fillna('')

# Concatinate the strings that have split more than once back to their standard for e.g GENEID_GENENAME_SOMEMORENAME
split_data["Gene_name"] = split_data.apply(lambda x: x.Gene_name if x.Gene_name2 == '' else x.Gene_name + '_' + x.Gene_name2, axis=1)
split_data.shape

(46609, 3)

In [14]:
# Insert the columns back into the main data array
df_tpm_2["Gene_id"] = split_data.Gene_id
df_tpm_2.insert(1,"Gene_name", split_data.Gene_name)
df_tpm_2

Unnamed: 0,Gene_id,Gene_name,819b_A1,819b_A10,819b_A11,819b_A2,819b_A3,819b_A4,819b_A5,819b_A6,...,920mat_G7,920mat_G8,920mat_G9,920mat_H11,920mat_H3,920mat_H4,920mat_H5,920mat_H6,920mat_H7,920mat_H8
0,ENSMUSG00000000001,Gnai3,51.73,26.72,29.77,68.41,148.98,55.76,0.61,44.96,...,20.73,12.29,9.18,14.41,10.34,2.98,0.67,28.88,23.65,21.83
1,ENSMUSG00000000003,Pbsn,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
2,ENSMUSG00000000028,Cdc45,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,10.69,0.00,0.00,0.00,0.00,9.33,0.00,0.00,1.41,0.00
3,ENSMUSG00000000031,H19,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
4,ENSMUSG00000000037,Scml2,13.75,0.00,0.00,4.76,0.00,0.00,6.73,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46604,ERCC-00164,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46605,ERCC-00165,,131.21,90.43,13.88,58.42,86.68,75.96,76.02,380.24,...,72.18,82.65,3.33,106.09,0.00,60.24,0.00,0.00,75.92,0.00
46606,ERCC-00168,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
46607,ERCC-00170,,65.63,0.00,0.00,222.63,222.27,0.00,0.00,65.14,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [15]:
#Check gene column is unique before setting is as an index
df_tpm_2.Gene_id.is_unique

True

In [16]:
df_tpm_2.index = df_tpm_2['Gene_id']
df_tpm_2

Unnamed: 0_level_0,Gene_id,Gene_name,819b_A1,819b_A10,819b_A11,819b_A2,819b_A3,819b_A4,819b_A5,819b_A6,...,920mat_G7,920mat_G8,920mat_G9,920mat_H11,920mat_H3,920mat_H4,920mat_H5,920mat_H6,920mat_H7,920mat_H8
Gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000000001,ENSMUSG00000000001,Gnai3,51.73,26.72,29.77,68.41,148.98,55.76,0.61,44.96,...,20.73,12.29,9.18,14.41,10.34,2.98,0.67,28.88,23.65,21.83
ENSMUSG00000000003,ENSMUSG00000000003,Pbsn,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
ENSMUSG00000000028,ENSMUSG00000000028,Cdc45,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,10.69,0.00,0.00,0.00,0.00,9.33,0.00,0.00,1.41,0.00
ENSMUSG00000000031,ENSMUSG00000000031,H19,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
ENSMUSG00000000037,ENSMUSG00000000037,Scml2,13.75,0.00,0.00,4.76,0.00,0.00,6.73,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERCC-00164,ERCC-00164,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
ERCC-00165,ERCC-00165,,131.21,90.43,13.88,58.42,86.68,75.96,76.02,380.24,...,72.18,82.65,3.33,106.09,0.00,60.24,0.00,0.00,75.92,0.00
ERCC-00168,ERCC-00168,,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
ERCC-00170,ERCC-00170,,65.63,0.00,0.00,222.63,222.27,0.00,0.00,65.14,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


## Create merged dataset

Create merged dataset from all subsets

In [17]:
df_tpm_combined = df_tpm_1.join(df_tpm_2, lsuffix='', rsuffix='_other', how='inner')
df_tpm_combined.drop(['Gene_id_other', 'Gene_name_other'], axis=1)
df_tpm_combined.shape

(46609, 786)

Create data only set (this is easier for tensorflow to deal with)

In [18]:
df_tpm_combined_dataonly = df_tpm_combined.drop(['Gene_id', 'Gene_name'], axis=1)
df_tpm_combined_dataonly = df_tpm_combined_dataonly.select_dtypes(exclude=['object'])
df_tpm_combined_dataonly.shape

(46609, 782)

Create `log2(TPM+1)` dataset

In [19]:
df_ltpm_combined_dataonly = np.log2(df_tpm_combined_dataonly + 1)
df_ltpm_combined_dataonly

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


Unnamed: 0_level_0,P1-3-A1,P1-3-A10,P1-3-A11,P1-3-A12,P1-3-A2,P1-3-A3,P1-3-A4,P1-3-A5,P1-3-A6,P1-3-A7,...,920mat_G7,920mat_G8,920mat_G9,920mat_H11,920mat_H3,920mat_H4,920mat_H5,920mat_H6,920mat_H7,920mat_H8
Gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSMUSG00000000001,0.0,4.217231,3.003602,4.209453,0.000000,5.296824,5.300856,5.587965,3.826803,3.414136,...,4.441616,3.732269,3.347666,3.945795,3.503349,1.992768,0.739848,4.901108,4.623516,4.512859
ENSMUSG00000000003,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSMUSG00000000028,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,3.547203,0.000000,0.000000,0.000000,0.000000,3.368768,0.000000,0.000000,1.269033,0.000000
ENSMUSG00000000031,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6.821838,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ENSMUSG00000000037,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ERCC-00164,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ERCC-00165,0.0,0.000000,5.440952,2.097611,0.000000,0.000000,3.385431,3.339137,0.000000,5.599318,...,6.193378,6.386294,2.114367,6.742680,0.000000,5.936402,0.000000,0.000000,6.265287,0.000000
ERCC-00168,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
ERCC-00170,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Filter for genes and cells which match filtering criteria. Call `.shape` to check what was filtered

In [20]:
df_expression_filt_mask = df_ltpm_combined_dataonly > min_ltpm_exp
df_ltpm_combined_dataonly_genefilt = df_ltpm_combined_dataonly[df_expression_filt_mask.sum(axis=1) > min_num_cells_for_gene_exp]
df_ltpm_combined_dataonly_genefilt.shape

(19248, 782)

In [21]:
df_ltpm_combined_dataonly_genecellfilt = df_ltpm_combined_dataonly_genefilt \
    .T[(df_ltpm_combined_dataonly_genefilt > min_ltpm_exp).sum(axis=0) > min_num_genes_in_cell_exp]
df_ltpm_combined_dataonly_genecellfilt = df_ltpm_combined_dataonly_genecellfilt.T
df_ltpm_combined_dataonly_genecellfilt.shape

(19248, 771)

## Write data to file

Get the column and row names as a list

In [22]:
df_column_names = pd.DataFrame(list(df_ltpm_combined_dataonly_genecellfilt.columns.values))
df_row_names = pd.DataFrame(list(df_ltpm_combined_dataonly_genecellfilt.index.values))

print(df_column_names.shape)
print(df_row_names.shape)

(771, 1)
(19248, 1)


Write the data to file

In [23]:
df_ltpm_combined_dataonly_genecellfilt.to_csv(data_path + '/tpm_combined.csv', index=False, header=False)
df_column_names.to_csv(data_path + '/tpm_combined_cols.csv', index=False, header=False)
df_row_names.to_csv(data_path + '/tpm_combined_rows.csv', index=False, header=False)