In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Read in count data

In [2]:
df = pd.read_csv("merged.txt", sep="\t", index_col="Geneid")

In [3]:
df

Unnamed: 0_level_0,Chr,Start,End,Strand,Length,GM0h,GM30min,GM1h,GM2h,GM4h,GM6h,GM9h,GM12h,GM18h,GM24h,GM48h,GM72h
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
DDX11L1,chr1;chr1;chr1,11874;12613;13221,12227;12721;14409,+;+;+,1652,60,44,86,40,39,56,35,24,31,49,58,110
WASH7P,chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;chr1;c...,14362;14970;15796;16607;16858;17233;17606;1791...,14829;15038;15947;16765;17055;17368;17742;1806...,-;-;-;-;-;-;-;-;-;-;-,1769,40,26,66,41,24,52,27,35,27,35,46,71
MIR6859-3,chr1;chr15;chr16,17369;102513727;67052,17436;102513794;67119,-;+;-,204,0,0,0,0,0,0,0,0,0,0,0,0
MIR6859-2,chr1;chr15;chr16,17369;102513727;67052,17436;102513794;67119,-;+;-,204,0,0,0,0,0,0,0,0,0,0,0,0
MIR6859-4,chr1;chr15;chr16,17369;102513727;67052,17436;102513794;67119,-;+;-,204,0,0,0,0,0,0,0,0,0,0,0,0
MIR6859-1,chr1;chr15;chr16,17369;102513727;67052,17436;102513794;67119,-;+;-,204,0,0,0,0,0,0,0,0,0,0,0,0
MIR1302-11,chr1;chr15;chr19;chr9,30366;102500662;71973;30144,30503;102500799;72110;30281,+;-;+;+,552,0,0,0,0,0,0,0,0,0,0,0,0
MIR1302-9,chr1;chr15;chr19;chr9,30366;102500662;71973;30144,30503;102500799;72110;30281,+;-;+;+,552,0,0,0,0,0,0,0,0,0,0,0,0
MIR1302-2,chr1;chr15;chr19;chr9,30366;102500662;71973;30144,30503;102500799;72110;30281,+;-;+;+,552,0,0,0,0,0,0,0,0,0,0,0,0
MIR1302-10,chr1;chr15;chr19;chr9,30366;102500662;71973;30144,30503;102500799;72110;30281,+;-;+;+,552,0,0,0,0,0,0,0,0,0,0,0,0


Get on the raw counts

In [4]:
dfCounts = df.filter(regex="\d", axis=1)

Get the lengths of each gene

In [5]:
dfLength = df["Length"]

Calculate reads per kilobase by taking
```
counts / length
```
https://www.rna-seqblog.com/rpkm-fpkm-and-tpm-clearly-explained/

In [6]:
rpk = dfCounts.divide(dfLength, axis='index')

Count up all the RPK values in a sample and divide this number by 1,000,000. This is your “per million” scaling factor.

In [7]:
rpkmFactor = rpk.sum(axis=0) / 1e6

Divide the RPK values by the “per million” scaling factor. This gives you TPM.

In [9]:
dftpm = rpk / rpkmFactor

Filter out all the zeros

In [10]:
dftpm = dftpm[(dftpm.T != 0).any()]

In [11]:
dftpm.head

<bound method NDFrame.head of                    GM0h    GM30min       GM1h       GM2h       GM4h  \
Geneid                                                                
DDX11L1       19.029140  13.815192  13.875658  11.614311  13.417125   
WASH7P        11.847047   7.623595   9.944462  11.117305   7.710602   
OR4F5          0.000000   0.000000   0.000000   0.000000   0.000000   
LOC729737      3.062832   1.610862   1.704231   1.664916   2.284140   
LOC100133331   0.694821   0.404632   0.602989   0.523863   0.886708   
...                 ...        ...        ...        ...        ...   
TMSB4Y         0.000000   0.000000   0.000000   0.000000   0.000000   
NLGN4Y-AS1     0.000000   0.000000   0.000000   0.000000   0.000000   
CD24           3.081975   1.220465   1.149788   3.009701   2.897397   
LOC101929148   0.000000   0.000000   0.000000   0.000000   0.000000   
CSPG4P1Y       0.000000   0.326225   0.167636   0.301680   0.000000   

                   GM6h       GM9h      GM12h 

In [12]:
dftpm[dftpm > 0].dropna()

Unnamed: 0_level_0,GM0h,GM30min,GM1h,GM2h,GM4h,GM6h,GM9h,GM12h,GM18h,GM24h,GM48h,GM72h
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
DDX11L1,19.029140,13.815192,13.875658,11.614311,13.417125,12.292416,14.442639,8.687649,11.372491,13.710919,15.754171,17.221868
WASH7P,11.847047,7.623595,9.944462,11.117305,7.710602,10.659449,10.404579,11.831540,9.249961,9.145780,11.668301,10.380736
LOC729737,3.062832,1.610862,1.704231,1.664916,2.284140,1.987356,1.618926,1.529411,1.439271,1.604462,2.869068,2.315202
LOC100133331,0.694821,0.404632,0.602989,0.523863,0.886708,0.735493,0.638140,0.746392,0.614601,0.504841,0.910116,0.948290
MIR6723,1383.425637,915.006020,937.388271,905.446467,823.767415,1059.357624,1861.250533,1686.493989,1123.564219,1173.813123,1068.866508,915.415279
LOC100288069,26.750909,45.021899,30.717426,41.984337,36.479605,43.642110,42.263946,45.362499,43.320413,49.900488,79.532830,59.613859
FAM87B,5.390285,5.603216,3.976188,3.947910,5.554721,3.730723,5.259974,8.305554,8.417274,8.084687,7.848023,12.107173
LINC00115,43.760762,45.292508,31.369754,36.057277,47.037651,31.113719,47.620135,32.238413,24.849173,42.820781,44.633725,53.613540
LINC01128,12.654354,11.011020,8.718249,11.741109,7.755907,7.737179,8.195373,8.290261,8.926887,8.661314,11.518160,14.650635
SAMD11,26.258326,30.057657,18.680880,20.283662,21.362656,27.970781,25.356615,28.097096,16.610439,22.624013,20.204800,17.317007
