# Data Wrangling & Exploration

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import manifold
from sklearn.manifold import TSNE
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
from sklearn.decomposition import PCA

from time import time

# Import dataset

In [2]:
# Load the batch-corrected log2-CPM expression table (tab-separated).
# The first column of the file is used as the row index.
df = pd.read_csv(
    'batchcorrected_log2cpm.tsv',
    sep='\t',
    index_col=0,
)
df.head()  # full frame shape: (18053, 453)

Unnamed: 0,100_2,101_3,102_2,103_3,104_2,105_2,106_4,107_4,109_1,11_4,...,90_2,91_2,92_3,93_2,94_4,95_4,96_3,97_2,98_3,99_1
ENSG00000000003,4.224093,4.08811,4.10896,4.338494,4.144095,3.961678,4.068081,4.181222,4.177308,4.345819,...,4.095915,4.185599,3.984862,4.437899,4.258712,3.980472,4.310913,4.078538,4.171295,4.225876
ENSG00000000419,4.273573,4.278318,4.508365,4.53399,4.4516,4.437512,4.229555,4.359195,4.299986,4.569697,...,4.615619,4.413496,4.6231,4.781393,4.534774,4.570218,4.7226,4.492435,4.560072,4.415542
ENSG00000000457,4.716466,4.969749,4.852762,4.947713,4.882771,4.704028,4.675137,4.517319,4.740297,4.6725,...,4.700506,4.839435,4.763086,4.870351,4.87664,4.812464,4.812821,4.840323,4.880159,4.652706
ENSG00000000460,4.12617,4.528228,4.286024,4.31015,4.150902,4.265058,4.737984,4.102466,4.593532,4.613472,...,4.476497,4.568329,4.556887,4.323269,4.165587,4.667403,4.378113,4.334682,4.361387,3.685774
ENSG00000000938,1.171196,0.862429,1.540143,1.105556,1.375205,1.568452,1.638125,1.233508,1.497463,1.548793,...,1.454494,0.874062,1.199031,2.341509,1.732493,1.52306,1.136348,1.578141,0.897523,1.096682


In [3]:
# Load the gene annotation table (tab-separated), indexed by its first column.
anno = pd.read_csv(
    'AnnotationFile.tsv',
    sep='\t',
    index_col=[0],
)
anno.head()

Unnamed: 0_level_0,external_gene_name,chromosome_name,start_position,end_position,strand,gene_length,gene_biotype
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000000003,TSPAN6,X,100627109,100639991,-1,4535.0,protein_coding
ENSG00000000005,TNMD,X,100584802,100599885,1,1610.0,protein_coding
ENSG00000000419,DPM1,20,50934867,50958555,-1,1207.0,protein_coding
ENSG00000000457,SCYL3,1,169849631,169894267,-1,6883.0,protein_coding
ENSG00000000460,C1orf112,1,169662007,169854080,1,5967.0,protein_coding


# Removing X and Y chromosome genes

In [4]:
# Remove annotation rows whose chromosome_name is 'X' or 'Y'.
# One `isin` mask replaces the two sequential `!=` filters; rows with a
# missing chromosome_name are kept, exactly as the `!=` comparisons did.
anno = anno[~anno.chromosome_name.isin(['X', 'Y'])]
anno.head()

Unnamed: 0_level_0,external_gene_name,chromosome_name,start_position,end_position,strand,gene_length,gene_biotype
ensembl_gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENSG00000000419,DPM1,20,50934867,50958555,-1,1207.0,protein_coding
ENSG00000000457,SCYL3,1,169849631,169894267,-1,6883.0,protein_coding
ENSG00000000460,C1orf112,1,169662007,169854080,1,5967.0,protein_coding
ENSG00000000938,FGR,1,27612064,27635277,-1,3474.0,protein_coding
ENSG00000000971,CFH,1,196651878,196747504,1,8145.0,protein_coding


In [5]:
# Attach the annotation columns to the expression table by joining on the
# row index, then discard every row that contains a missing value in any
# column. Rows of `df` with no matching row in `anno` receive NaN annotation
# values from the join, so they are removed by the dropna step.
both = (
    df
    .join(anno)                    # (18053, 460)
    .dropna(axis=0, how='any')     # (17421, 460)
)
both.head()

Unnamed: 0,100_2,101_3,102_2,103_3,104_2,105_2,106_4,107_4,109_1,11_4,...,97_2,98_3,99_1,external_gene_name,chromosome_name,start_position,end_position,strand,gene_length,gene_biotype
ENSG00000000419,4.273573,4.278318,4.508365,4.53399,4.4516,4.437512,4.229555,4.359195,4.299986,4.569697,...,4.492435,4.560072,4.415542,DPM1,20,50934867.0,50958555.0,-1.0,1207.0,protein_coding
ENSG00000000457,4.716466,4.969749,4.852762,4.947713,4.882771,4.704028,4.675137,4.517319,4.740297,4.6725,...,4.840323,4.880159,4.652706,SCYL3,1,169849631.0,169894267.0,-1.0,6883.0,protein_coding
ENSG00000000460,4.12617,4.528228,4.286024,4.31015,4.150902,4.265058,4.737984,4.102466,4.593532,4.613472,...,4.334682,4.361387,3.685774,C1orf112,1,169662007.0,169854080.0,1.0,5967.0,protein_coding
ENSG00000000938,1.171196,0.862429,1.540143,1.105556,1.375205,1.568452,1.638125,1.233508,1.497463,1.548793,...,1.578141,0.897523,1.096682,FGR,1,27612064.0,27635277.0,-1.0,3474.0,protein_coding
ENSG00000000971,4.01548,3.843299,3.846015,3.877232,4.456291,4.251444,4.16741,3.808677,4.057762,4.302872,...,4.215,3.983356,4.045732,CFH,1,196651878.0,196747504.0,1.0,8145.0,protein_coding


In [6]:
# Annotation columns that were only needed for filtering; remove them so that
# only the expression columns remain.
annotation_columns = [
    'external_gene_name',
    'chromosome_name',
    'start_position',
    'end_position',
    'strand',
    'gene_length',
    'gene_biotype',
]
cleandf = both.drop(columns=annotation_columns)
cleandf.head()  # final: (17421, 453) vs original: (18053, 453)

Unnamed: 0,100_2,101_3,102_2,103_3,104_2,105_2,106_4,107_4,109_1,11_4,...,90_2,91_2,92_3,93_2,94_4,95_4,96_3,97_2,98_3,99_1
ENSG00000000419,4.273573,4.278318,4.508365,4.53399,4.4516,4.437512,4.229555,4.359195,4.299986,4.569697,...,4.615619,4.413496,4.6231,4.781393,4.534774,4.570218,4.7226,4.492435,4.560072,4.415542
ENSG00000000457,4.716466,4.969749,4.852762,4.947713,4.882771,4.704028,4.675137,4.517319,4.740297,4.6725,...,4.700506,4.839435,4.763086,4.870351,4.87664,4.812464,4.812821,4.840323,4.880159,4.652706
ENSG00000000460,4.12617,4.528228,4.286024,4.31015,4.150902,4.265058,4.737984,4.102466,4.593532,4.613472,...,4.476497,4.568329,4.556887,4.323269,4.165587,4.667403,4.378113,4.334682,4.361387,3.685774
ENSG00000000938,1.171196,0.862429,1.540143,1.105556,1.375205,1.568452,1.638125,1.233508,1.497463,1.548793,...,1.454494,0.874062,1.199031,2.341509,1.732493,1.52306,1.136348,1.578141,0.897523,1.096682
ENSG00000000971,4.01548,3.843299,3.846015,3.877232,4.456291,4.251444,4.16741,3.808677,4.057762,4.302872,...,3.140015,3.871984,3.694756,3.987447,4.604056,4.457524,3.684325,4.215,3.983356,4.045732


# Prepare for Modeling (using `ndf`)

In [7]:
# Transpose so that the former columns of `cleandf` become rows, and append a
# placeholder MGS_LEVEL column (all zeros) as the last column.
ndf = cleandf.transpose().assign(MGS_LEVEL=0)
ndf.head()

Unnamed: 0,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,ENSG00000001461,...,ENSG00000283536,ENSG00000283590,ENSG00000283617,ENSG00000283619,ENSG00000283623,ENSG00000283633,ENSG00000283662,ENSG00000283667,ENSG00000283674,MGS_LEVEL
100_2,4.273573,4.716466,4.12617,1.171196,4.01548,3.293096,5.294673,5.504096,5.377653,6.92251,...,3.068081,1.881444,1.346983,1.513299,0.79777,0.419013,0.843287,1.047956,1.928373,0
101_3,4.278318,4.969749,4.528228,0.862429,3.843299,3.390655,5.527211,5.846664,5.255156,7.03251,...,2.866956,1.795239,2.366877,1.177736,1.070189,1.483662,0.6371,1.125788,1.522671,0
102_2,4.508365,4.852762,4.286024,1.540143,3.846015,3.596859,4.948842,5.788883,5.145699,6.922071,...,2.886454,1.835291,2.017325,1.232213,0.841682,0.010696,0.723624,1.194329,1.378135,0
103_3,4.53399,4.947713,4.31015,1.105556,3.877232,3.448069,5.472893,5.687792,5.140274,6.873689,...,2.891162,1.961135,1.967287,1.947879,0.387205,0.712213,0.256273,1.322104,2.025008,0
104_2,4.4516,4.882771,4.150902,1.375205,4.456291,3.50159,5.159368,5.6467,5.06941,6.842389,...,3.128063,2.011594,1.55607,0.814551,1.100907,1.15577,0.919095,1.381362,1.788005,0


In [8]:
# Copy of `ndf` with its row labels moved out of the index into a regular
# column (named 'index' in the displayed output) and rows renumbered from 0.
# The former `new = pd.DataFrame()` initialisation was overwritten
# immediately and has been removed.
new = ndf.reset_index()
new.head()

Unnamed: 0,index,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,...,ENSG00000283536,ENSG00000283590,ENSG00000283617,ENSG00000283619,ENSG00000283623,ENSG00000283633,ENSG00000283662,ENSG00000283667,ENSG00000283674,MGS_LEVEL
0,100_2,4.273573,4.716466,4.12617,1.171196,4.01548,3.293096,5.294673,5.504096,5.377653,...,3.068081,1.881444,1.346983,1.513299,0.79777,0.419013,0.843287,1.047956,1.928373,0
1,101_3,4.278318,4.969749,4.528228,0.862429,3.843299,3.390655,5.527211,5.846664,5.255156,...,2.866956,1.795239,2.366877,1.177736,1.070189,1.483662,0.6371,1.125788,1.522671,0
2,102_2,4.508365,4.852762,4.286024,1.540143,3.846015,3.596859,4.948842,5.788883,5.145699,...,2.886454,1.835291,2.017325,1.232213,0.841682,0.010696,0.723624,1.194329,1.378135,0
3,103_3,4.53399,4.947713,4.31015,1.105556,3.877232,3.448069,5.472893,5.687792,5.140274,...,2.891162,1.961135,1.967287,1.947879,0.387205,0.712213,0.256273,1.322104,2.025008,0
4,104_2,4.4516,4.882771,4.150902,1.375205,4.456291,3.50159,5.159368,5.6467,5.06941,...,3.128063,2.011594,1.55607,0.814551,1.100907,1.15577,0.919095,1.381362,1.788005,0


In [9]:
# Derive the MGS_LEVEL label from each row label of `ndf`: the label is the
# final character of the ID (e.g. '100_2' -> '2').
# The IDs are read straight from `ndf.index` rather than from a separate
# reset_index() copy, so the labels cannot get out of step with the rows.
# NOTE(review): labels stay strings ('2', '3', ...), not integers, and this
# assumes the level is always a single trailing character — confirm.
mlist = ndf.index.str[-1].tolist()

# Write the labels, then move the column to the front of the frame.
ndf['MGS_LEVEL'] = mlist
level = ndf.pop('MGS_LEVEL')
ndf.insert(0, 'MGS_LEVEL', level)
ndf.head()

Unnamed: 0,MGS_LEVEL,ENSG00000000419,ENSG00000000457,ENSG00000000460,ENSG00000000938,ENSG00000000971,ENSG00000001036,ENSG00000001084,ENSG00000001167,ENSG00000001460,...,ENSG00000283529,ENSG00000283536,ENSG00000283590,ENSG00000283617,ENSG00000283619,ENSG00000283623,ENSG00000283633,ENSG00000283662,ENSG00000283667,ENSG00000283674
100_2,2,4.273573,4.716466,4.12617,1.171196,4.01548,3.293096,5.294673,5.504096,5.377653,...,1.145577,3.068081,1.881444,1.346983,1.513299,0.79777,0.419013,0.843287,1.047956,1.928373
101_3,3,4.278318,4.969749,4.528228,0.862429,3.843299,3.390655,5.527211,5.846664,5.255156,...,1.189183,2.866956,1.795239,2.366877,1.177736,1.070189,1.483662,0.6371,1.125788,1.522671
102_2,2,4.508365,4.852762,4.286024,1.540143,3.846015,3.596859,4.948842,5.788883,5.145699,...,0.915614,2.886454,1.835291,2.017325,1.232213,0.841682,0.010696,0.723624,1.194329,1.378135
103_3,3,4.53399,4.947713,4.31015,1.105556,3.877232,3.448069,5.472893,5.687792,5.140274,...,0.850755,2.891162,1.961135,1.967287,1.947879,0.387205,0.712213,0.256273,1.322104,2.025008
104_2,2,4.4516,4.882771,4.150902,1.375205,4.456291,3.50159,5.159368,5.6467,5.06941,...,0.647376,3.128063,2.011594,1.55607,0.814551,1.100907,1.15577,0.919095,1.381362,1.788005


In [10]:
from sklearn.model_selection import train_test_split

# Features are every column after the first; the target is the first column.
X = ndf.iloc[:, 1:]
y = ndf.iloc[:, 0]

# Hold out 10% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=0,
)

# Shapes of the two partitions; only the final expression is displayed.
train_shapes = (X_train.shape, y_train.shape)  # ((407, 17421), (407,))
test_shapes = (X_test.shape, y_test.shape)  # ((46, 17421), (46,))
test_shapes

((46, 17421), (46,))