# importing necessary libraries.

In [167]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier


# obtaining the data.

In [140]:
warnings.filterwarnings('ignore')
df=pd.read_csv('clinvar_conflicting.csv')
df.head()

Unnamed: 0,CHROM,POS,REF,ALT,AF_ESP,AF_EXAC,AF_TGP,CLNDISDB,CLNDISDBINCL,CLNDN,...,SIFT,PolyPhen,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62
0,1,1168180,G,C,0.0771,0.1002,0.1066,MedGen:CN169374,,not_specified,...,tolerated,benign,,,,,,1.053,-0.208682,2.0
1,1,1470752,G,A,0.0,0.0,0.0,"MedGen:C1843891,OMIM:607454,Orphanet:ORPHA9877...",,Spinocerebellar_ataxia_21|not_provided,...,deleterious_low_confidence,benign,,,,,,31.0,6.517838,-3.0
2,1,1737942,A,G,0.0,1e-05,0.0,"Human_Phenotype_Ontology:HP:0000486,MedGen:C00...",,Strabismus|Nystagmus|Hypothyroidism|Intellectu...,...,deleterious,probably_damaging,,,,,,28.1,6.061752,-1.0
3,1,2160305,G,A,0.0,0.0,0.0,"MedGen:C1321551,OMIM:182212,SNOMED_CT:83092002...",,Shprintzen-Goldberg_syndrome|not_provided,...,,,,,,,,22.5,3.114491,
4,1,2160305,G,T,0.0,0.0,0.0,"MedGen:C1321551,OMIM:182212,SNOMED_CT:83092002",,Shprintzen-Goldberg_syndrome,...,,,,,,,,24.7,4.766224,-3.0


# data exploration.

In [141]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65188 entries, 0 to 65187
Data columns (total 46 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CHROM               65188 non-null  object 
 1   POS                 65188 non-null  int64  
 2   REF                 65188 non-null  object 
 3   ALT                 65188 non-null  object 
 4   AF_ESP              65188 non-null  float64
 5   AF_EXAC             65188 non-null  float64
 6   AF_TGP              65188 non-null  float64
 7   CLNDISDB            65188 non-null  object 
 8   CLNDISDBINCL        167 non-null    object 
 9   CLNDN               65188 non-null  object 
 10  CLNDNINCL           167 non-null    object 
 11  CLNHGVS             65188 non-null  object 
 12  CLNSIGINCL          167 non-null    object 
 13  CLNVC               65188 non-null  object 
 14  CLNVI               27659 non-null  object 
 15  MC                  64342 non-null  object 
 16  ORIG

In [142]:
display(df.corr(numeric_only=True).head())

Unnamed: 0,POS,AF_ESP,AF_EXAC,AF_TGP,ORIGIN,SSR,CLASS,DISTANCE,STRAND,MOTIF_POS,MOTIF_SCORE_CHANGE,LoFtool,CADD_PHRED,CADD_RAW,BLOSUM62
POS,1.0,0.007003,0.007721,0.004225,0.013645,0.192398,0.012061,0.197942,-0.122896,,-1.0,0.257226,-0.005409,-0.009753,0.022451
AF_ESP,0.007003,1.0,0.85187,0.807741,-0.013563,-0.077253,-0.126538,0.295624,-0.011551,,,0.027343,-0.164169,-0.148118,0.039367
AF_EXAC,0.007721,0.85187,1.0,0.805634,-0.013432,0.048413,-0.121213,0.29184,-0.013358,,,0.027831,-0.154891,-0.140803,0.038996
AF_TGP,0.004225,0.807741,0.805634,1.0,-0.01445,-0.114526,-0.130077,0.040676,-0.014002,,,0.029447,-0.167057,-0.150989,0.043167
ORIGIN,0.013645,-0.013563,-0.013432,-0.01445,1.0,0.175675,0.004357,,-0.007564,,,-0.028363,0.06029,0.064609,-0.012681


# dropping the useless columns

In [143]:
df.drop(['CLNDISDBINCL','CLNDNINCL','CLNSIGINCL','CLNVI','SSR','INTRON','DISTANCE','BAM_EDIT','SIFT','PolyPhen','MOTIF_NAME','MOTIF_POS','HIGH_INF_POS','MOTIF_SCORE_CHANGE','BLOSUM62'],axis=1,inplace=True)

In [144]:
df.isnull().sum()

CHROM                   0
POS                     0
REF                     0
ALT                     0
AF_ESP                  0
AF_EXAC                 0
AF_TGP                  0
CLNDISDB                0
CLNDN                   0
CLNHGVS                 0
CLNVC                   0
MC                    846
ORIGIN                  0
CLASS                   0
Allele                  0
Consequence             0
IMPACT                  0
SYMBOL                 16
Feature_type           14
Feature                14
BIOTYPE                16
EXON                 8893
cDNA_position        8884
CDS_position         9955
Protein_position     9955
Amino_acids         10004
Codons              10004
STRAND                 14
LoFtool              4213
CADD_PHRED           1092
CADD_RAW             1092
dtype: int64

In [145]:
df['STRAND'].mode()

0   -1.0
Name: STRAND, dtype: float64

In [146]:
df['LoFtool'].fillna(int(df['LoFtool'].mean()), inplace=True)
df['CADD_PHRED'].fillna(int(df['CADD_PHRED'].mean()), inplace=True)
df['CADD_RAW'].fillna(int(df['CADD_RAW'].mean()), inplace=True)

In [147]:
df['STRAND'].fillna(df['STRAND'].mode()[0],inplace=True)

In [148]:
df.isnull().sum()

CHROM                   0
POS                     0
REF                     0
ALT                     0
AF_ESP                  0
AF_EXAC                 0
AF_TGP                  0
CLNDISDB                0
CLNDN                   0
CLNHGVS                 0
CLNVC                   0
MC                    846
ORIGIN                  0
CLASS                   0
Allele                  0
Consequence             0
IMPACT                  0
SYMBOL                 16
Feature_type           14
Feature                14
BIOTYPE                16
EXON                 8893
cDNA_position        8884
CDS_position         9955
Protein_position     9955
Amino_acids         10004
Codons              10004
STRAND                  0
LoFtool                 0
CADD_PHRED              0
CADD_RAW                0
dtype: int64

In [149]:
object_columns = df.select_dtypes(['object']).columns
object_columns
for c in df.columns:
    if c in object_columns:
        df[[c]]=df[[c]].astype('category')


In [150]:
cat_columns = df.select_dtypes(['category']).columns
print(cat_columns)
df[cat_columns] = df[cat_columns].apply(lambda x: x.cat.codes)
cat_columns

Index(['CHROM', 'REF', 'ALT', 'CLNDISDB', 'CLNDN', 'CLNHGVS', 'CLNVC', 'MC',
       'Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Feature_type', 'Feature',
       'BIOTYPE', 'EXON', 'cDNA_position', 'CDS_position', 'Protein_position',
       'Amino_acids', 'Codons'],
      dtype='object')


Index(['CHROM', 'REF', 'ALT', 'CLNDISDB', 'CLNDN', 'CLNHGVS', 'CLNVC', 'MC',
       'Allele', 'Consequence', 'IMPACT', 'SYMBOL', 'Feature_type', 'Feature',
       'BIOTYPE', 'EXON', 'cDNA_position', 'CDS_position', 'Protein_position',
       'Amino_acids', 'Codons'],
      dtype='object')

In [151]:
df.dtypes

CHROM                  int8
POS                   int64
REF                   int16
ALT                   int16
AF_ESP              float64
AF_EXAC             float64
AF_TGP              float64
CLNDISDB              int16
CLNDN                 int16
CLNHGVS               int32
CLNVC                  int8
MC                     int8
ORIGIN                int64
CLASS                 int64
Allele                int16
Consequence            int8
IMPACT                 int8
SYMBOL                int16
Feature_type           int8
Feature               int16
BIOTYPE                int8
EXON                  int16
cDNA_position         int16
CDS_position          int16
Protein_position      int16
Amino_acids           int16
Codons                int16
STRAND              float64
LoFtool             float64
CADD_PHRED          float64
CADD_RAW            float64
dtype: object

In [152]:
df.corr()['CLASS']

CHROM               0.002030
POS                 0.012061
REF                 0.000586
ALT                -0.000748
AF_ESP             -0.126538
AF_EXAC            -0.121213
AF_TGP             -0.130077
CLNDISDB            0.012407
CLNDN               0.023176
CLNHGVS            -0.013249
CLNVC               0.037792
MC                  0.005494
ORIGIN              0.004357
CLASS               1.000000
Allele              0.007442
Consequence        -0.001063
IMPACT              0.068912
SYMBOL              0.021568
Feature_type       -0.014732
Feature             0.001521
BIOTYPE            -0.013692
EXON               -0.012167
cDNA_position      -0.011147
CDS_position       -0.013195
Protein_position   -0.007403
Amino_acids        -0.017456
Codons             -0.018030
STRAND             -0.031551
LoFtool            -0.002226
CADD_PHRED         -0.037178
CADD_RAW           -0.052208
Name: CLASS, dtype: float64

In [153]:
a=['CLNDN','CLNVC','IMPACT','SYMBOL']#list of columns having negative values but may be relevant , interpreted from correlation

In [154]:
(df['SYMBOL'] < 0).sum()

16

In [155]:
# selecting only those columns which are positive As chi square cant handle negative values
li=df.columns[(df>=0).all()].tolist()
li
df_positive=df[li]

# feature selection

In [156]:
# using chi square for feature selection
data=df_positive.copy()

X = data.loc[:,data.columns!='CLASS']
y = data.loc[:,'CLASS']
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(X,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score']
print(featureScores.nlargest(8,'Score'))#printing the 8 best features

         Specs         Score
1          POS  4.362602e+08
9      CLNHGVS  1.243176e+05
8        CLNDN  5.431848e+04
7     CLNDISDB  1.306732e+04
16  CADD_PHRED  6.637613e+02
12      Allele  3.907656e+02
6       AF_TGP  2.560591e+02
4       AF_ESP  2.402746e+02


In [157]:
X = data[featureScores.sort_values(by='Score', ascending=False).iloc[:5,0]]
X=pd.concat([X,df[a]],ignore_index=True,sort=False,axis=1)
y = data['CLASS']
X

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,1168180,326,8960,8866,1.053,8960,6,2,200
1,1470752,467,8388,6615,31.000,8388,6,2,2135
2,1737942,1179,8449,750,28.100,8449,6,2,892
3,2160305,1826,8156,5993,22.500,8156,6,2,1889
4,2160305,1827,8154,5989,24.700,8154,6,2,1889
...,...,...,...,...,...,...,...,...,...
65183,154158201,64090,3915,3195,0.105,3915,6,1,707
65184,154159118,64093,9085,8932,0.002,9085,6,2,707
65185,154194886,64096,9085,8932,12.850,9085,6,1,707
65186,154490187,64097,6683,8221,0.130,6683,6,1,1711


In [172]:
# scaling the data
scaler = StandardScaler()
XScaled = scaler.fit_transform(X)
accs = []
skf = StratifiedKFold(n_splits=10,random_state=1, shuffle=True)

In [169]:
def logisticRegression(X,y):
    for train_index, test_index in skf.split(X,y):
        X_train = X.iloc[train_index]
        y_train = y[train_index]
        X_test = X.iloc[test_index]
        y_test = y[test_index]
        LRModel = LogisticRegression(penalty = 'l2',max_iter=500)
        LRModel.fit(X_train, y_train)
        Y_testPred = LRModel.predict(X_test)
        testAccuracy = accuracy_score(y_test, Y_testPred)
        accs.append(testAccuracy*100)
    print("Test Accuracy", testAccuracy*100)
    print(accs)
    print("AVG: ",sum(accs)/len(accs))

In [170]:

# using logistic regression model.
logisticRegression(X,y)

Test Accuracy 74.79674796747967
Test Accuracy 74.79674796747967
Test Accuracy 74.79674796747967
Test Accuracy 74.79674796747967
Test Accuracy 74.7814081914404
Test Accuracy 74.7814081914404
Test Accuracy 74.7814081914404
Test Accuracy 74.7814081914404
Test Accuracy 74.79288125191776
Test Accuracy 74.79288125191776
[74.79674796747967, 74.79674796747967, 74.79674796747967, 74.79674796747967, 74.7814081914404, 74.7814081914404, 74.7814081914404, 74.7814081914404, 74.79288125191776, 74.79288125191776]
AVG:  74.78983871395158
