<h2>t-SNE algorithm on the ClinVar dataset</h2>

<p>- Load the filtered/clean data</p>
<p>- Extract the classes column into variable 'Y'</p>
<p>- Drop the first column, which is meaningless</p>
<p>- Encode the categorical string attribute as numeric values (column: ANN[0].EFFECT)</p>
<p>- Fill NAs with the mean of each column</p>
<p>- Normalize the data</p>

In [16]:
import time
from pathlib import Path
import pandas as pd
from numpy import nan

# Directory that holds the input datasets
data_dir = Path('./data')

# Load the tab-separated, pre-filtered ClinVar table
data = pd.read_csv(data_dir.joinpath('clinvar_filtered'), sep="\t")

In [17]:
# Binary target: 1 where CLNSIG is exactly "Pathogenic", 0 for every other value.
# The comparison builds a new Series, so `data` is never written to (the previous
# `Y.loc[...] = ...` assigned through a column selection of `data`), and re-running
# this cell always yields the same result.
Y = (data['CLNSIG'] == 'Pathogenic').astype(int)
# Features: drop the first column (meaningless) and the classes column.
# The classes column is dropped by name so the cell does not depend on its position.
X_raw = data.drop(columns=[data.columns[0], 'CLNSIG'])
Y.unique()

array([1, 0])

In [18]:
# Replace each distinct string in column 'ANN[0].EFFECT' with an integer code
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
X_raw['ANN[0].EFFECT'] = le.fit_transform(X_raw['ANN[0].EFFECT'])

In [19]:
# Preview the first five feature rows (head() defaults to 5)
X_raw.head()

Unnamed: 0,ANN[0].EFFECT,dbNSFP_Polyphen2_HVAR_score,dbNSFP_Polyphen2_HDIV_score,dbNSFP_LRT_score,CADD,dbNSFP_phastCons20way_mammalian,dbNSFP_LRT_Omega,dbNSFP_MutationTaster_score,dbNSFP_VEST3_score,dbNSFP_DANN_score,dbNSFP_MetaSVM_score,dbNSFP_MetaLR_score,dbNSFP_integrated_fitCons_score,dbNSFP_GM12878_fitCons_score,Eigenraw,gnomAD_AF
0,121,,,0.351746,36.0,0.155,1.36847,1.0,,0.996902,,,0.72623,0.52208,0.337459,
1,25,,,,,,,,,,,,,,,
2,121,,,0.035538,35.0,0.004,2.42205,1.0,,0.978008,,,0.72623,0.52208,-0.234428,8e-06
3,141,,,,8.15,,,,,,,,,,,0.3437
4,65,0.006,0.004,0.72691,3.288,0.464,0.737577,1.0,0.159,0.434743,-0.9797,0.184,0.441713,0.218748,-1.109901,0.01214


In [None]:
# Preview the first five class labels (head() defaults to 5)
Y.head()

In [20]:
# Fill NA's in each score column with the mean of that column.
# The filled values are assigned back through the frame itself: calling
# fillna(inplace=True) on a column selection (X_raw[col]) operates on a temporary
# object, which pandas warns about and which leaves X_raw unchanged when
# copy-on-write is active.
score_columns = [
    'dbNSFP_Polyphen2_HVAR_score',
    'dbNSFP_Polyphen2_HDIV_score',
    'dbNSFP_LRT_score',
    'CADD',
    'dbNSFP_phastCons20way_mammalian',
    'dbNSFP_LRT_Omega',
    'dbNSFP_MutationTaster_score',
    'dbNSFP_VEST3_score',
    'dbNSFP_DANN_score',
    'dbNSFP_MetaSVM_score',
    'dbNSFP_MetaLR_score',
    'dbNSFP_integrated_fitCons_score',
    'dbNSFP_GM12878_fitCons_score',
    'Eigenraw',
    'gnomAD_AF',
]
# .mean() returns one value per column; fillna matches them to columns by name.
X_raw[score_columns] = X_raw[score_columns].fillna(X_raw[score_columns].mean())

In [21]:
# Min-max normalisation: subtract each column's minimum, then divide by its range
col_min = X_raw.min()
col_range = X_raw.max() - col_min
X = (X_raw - col_min) / col_range
X.head()

Unnamed: 0,ANN[0].EFFECT,dbNSFP_Polyphen2_HVAR_score,dbNSFP_Polyphen2_HDIV_score,dbNSFP_LRT_score,CADD,dbNSFP_phastCons20way_mammalian,dbNSFP_LRT_Omega,dbNSFP_MutationTaster_score,dbNSFP_VEST3_score,dbNSFP_DANN_score,dbNSFP_MetaSVM_score,dbNSFP_MetaLR_score,dbNSFP_integrated_fitCons_score,dbNSFP_GM12878_fitCons_score,Eigenraw,gnomAD_AF
0,0.852113,0.535111,0.62917,0.351746,0.36363,0.155,0.000176,1.0,0.520653,0.997163,0.458535,0.47437,0.864887,0.544675,0.780901,0.02839
1,0.176056,0.535111,0.62917,0.079251,0.15812,0.767854,0.000328,0.966243,0.520653,0.934826,0.458535,0.47437,0.733945,0.636285,0.745582,0.02839
2,0.852113,0.535111,0.62917,0.035538,0.353529,0.004,0.000311,1.0,0.520653,0.977557,0.458535,0.47437,0.864887,0.544675,0.653679,8e-06
3,0.992958,0.535111,0.62917,0.079251,0.082314,0.767854,0.000328,0.966243,0.520653,0.934826,0.458535,0.47437,0.733945,0.636285,0.745582,0.3437
4,0.457746,0.006,0.004,0.72691,0.033202,0.464,9.5e-05,1.0,0.159,0.413819,0.247277,0.184,0.526048,0.228215,0.458922,0.01214


In [22]:
# Turn the frame into a NumPy array and take a copy laid out in C order
X = X.to_numpy().copy(order='C')

In [23]:
# TSNE implementation: RAPIDS (cuML) version
from cuml.manifold import TSNE

# CPU alternative (scikit-learn):
#from sklearn.manifold import TSNE

# Estimator settings: two output components, fixed random_state
tsne_params = {'n_components': 2, 'random_state': 0}
tsne = TSNE(**tsne_params)

In [None]:
# Fit tSNE on the data and record how long the call takes (seconds, in `total`).
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() is a wall clock and can jump if the system time changes.
t0 = time.perf_counter()
X_2d = tsne.fit_transform(X)
t1 = time.perf_counter()
total = t1 - t0

In [None]:
# How to fix (clean) the data
# The first step is to select the required columns from the data in bash. The line below selects the required columns and redirects them to a file called "clinvar_selected"
# awk -F'\t' -vcols=CLNSIG,ANN\[0\]\.EFFECT,dbNSFP_Polyphen2_HVAR_score,dbNSFP_Polyphen2_HDIV_score,dbNSFP_LRT_score,CADD,dbNSFP_phastCons20way_mammalian,dbNSFP_LRT_Omega,dbNSFP_MutationTaster_score,dbNSFP_VEST3_score,dbNSFP_DANN_score,dbNSFP_MetaSVM_score,dbNSFP_MetaLR_score,dbNSFP_integrated_fitCons_score,dbNSFP_GM12878_fitCons_score,Eigenraw,gnomAD_AF '(NR==1){n=split(cols,cs,",");for(c=1;c<=n;c++){for(i=1;i<=NF;i++)if($(i)==cs[c])ci[c]=i}}{for(i=1;i<=n;i++)printf "%s" FS,$(ci[i]);printf "\n"}' clinvar_20170217_temp.tsv > clinvar_selected
# Then we need to apply various filters to the numeric value columns to remove "." placeholders etc., and to replace a leading "." with NaN
# The first two columns stay untouched, since one of them is the class column and the other is the categorical column
# For this I have written a script called "clean_data.sh"
# This script reads all the columns and cleans the data
# The resulting data is written to a new file called "clinvar_filtered"
# clinvar_filtered is then loaded in Jupyter Lab to perform the analysis
# The string literal below holds the text of that script for reference only; Python does not run it as a shell script.
'''
#!/bin/bash
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $1}' clinvar_selected) > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $2}' clinvar_selected) > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $3}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $4}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $5}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $6}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $7}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $8}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $9}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $10}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $11}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $12}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $13}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $14}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $15}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $16}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
oldoutput=$(< clinvar_filtered)
paste - <(awk -F " " '{print $17}' clinvar_selected | sed 's/\.\,//g' | sed 's/\,\.//g' | sed 's/^\./NaN/g' | awk -F ',' '{print $1}') > clinvar_filtered <<<"$oldoutput"
'''