Global Library/Module Importations

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Data Importation

In [2]:
df = pd.read_csv('data/GDSC/Preliminary_Matrix.csv', index_col = 'Unnamed: 0')
df.head()

Unnamed: 0,SZT2_mut,TCHH_mut,HRNR_mut,FLG2_mut,BAZ2B_mut,SCN9A_mut,ANK2_mut,RP1L1_mut,ABCA2_mut,EP400_mut,...,SERPINB7_del,TNFRSF11A_del,SERPINB12_del,SERPINB11_del,CTDP1_del,DSEL_del,SOCS6_del,VPS4B_del,ZNF516_del,Response
22RV1_PROSTATE,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A673_BONE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ALLSIL_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CORL23_LUNG,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DOV13_OVARY,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Split the dataframe into train test split. Make sure to keep random state consistently at 32 for repeatability. 

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
X = df.drop("Response", axis = 1)
y = df["Response"].astype(str)

Check for missing values

In [5]:
y[y == 'nan']

RPMI8402_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE    nan
Name: Response, dtype: object

In [6]:
print("Total number of missing values in X: ", X.isnull().sum().sum())

Total number of missing values in X:  0


Drop the sample 'RPMI8402_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE' from both X and y

In [7]:
X = X.drop('RPMI8402_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE', axis = 0)
y_new = y.drop('RPMI8402_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE', axis = 0)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y_new, test_size=0.3, random_state=32)


# Feature Selection

#### Recursive Feature Elimination

##### SMV 

In [30]:
# Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC
from sklearn import metrics



# create a base classifier used to evaluate a subset of attributes
svm = LinearSVC(max_iter = 10000)

svm_features_included = []
svm_scores = []

for i in range(2):#range(1, len(X.columns), 3) :

    if i == 0:
        ##### WITHOUT DATA LEAKAGE
        print("Without data leakage:\n")
        X_train, X_test, y_train, y_test = train_test_split(X, y_new, test_size=0.3, random_state=32)

        # keep the top n features
        svm_rfe = RFE(svm, 80)
        svm_rfe = svm_rfe.fit(X_train, y_train)

        # update X dataframe to include only selected features
        X_train = pd.DataFrame(svm_rfe.transform(X_train))
        X_test = pd.DataFrame(svm_rfe.transform(X_test))   
    
    elif i == 1:
    ##### WITH DATA LEAKAGE
        print("With data leakage:\n")    
        # keep the top n features
        svm_rfe = RFE(svm, 80)
        svm_rfe = svm_rfe.fit(X, y_new)

        # update X dataframe to include only selected features
        X_red = pd.DataFrame(svm_rfe.transform(X))

        # split into training and test
        X_train, X_test, y_train, y_test = train_test_split(X_red, y_new, test_size=0.3, random_state=32)

    
    # run regression
    svm.fit(X_train, y_train)

    svm_train_acc = svm.score(X_train, y_train)
    svm_test_acc = svm.score(X_test, y_test)
   
    svm_features_included.append(X.iloc[:, svm_rfe.support_].columns.values)
    svm_scores.append(svm_test_acc)

    print('Features used:', len(X_train.columns.values))
    
    print('SVM: The training accuracy is:', svm_train_acc)
    print('SVM: The testing accuracy is:', svm_test_acc)
    print("______________________________________\n")

Without data leakage:

Features used: 80
SVM: The training accuracy is: 1.0
SVM: The testing accuracy is: 0.4057971014492754
______________________________________

With data leakage:

Features used: 80
SVM: The training accuracy is: 1.0
SVM: The testing accuracy is: 0.8695652173913043
______________________________________



In [19]:
svm_features_included

[array(['FLG2_mut', 'DNAH2_mut', 'DNAH14_mut'], dtype=object),
 array(['FLG2_mut', 'GPR179_mut', 'INTS1_mut', 'DSCAM_mut', 'OTOGL_mut',
        'KALRN_mut'], dtype=object),
 array(['FLG2_mut', 'DNAH2_mut', 'DNAH14_mut', 'SYNE2_mut', 'GPR179_mut',
        'INTS1_mut', 'OTOGL_mut', 'KALRN_mut', 'TNR_mut'], dtype=object),
 array(['FLG2_mut', 'CACNA1B_mut', 'DNAH2_mut', 'PAPPA2_mut', 'ZNF638_mut',
        'DNAH14_mut', 'SYNE2_mut', 'GPR179_mut', 'INTS1_mut', 'OTOGL_mut',
        'KALRN_mut', 'TNR_mut'], dtype=object),
 array(['FLG2_mut', 'CACNA1B_mut', 'TP53_mut', 'DNAH2_mut', 'PAPPA2_mut',
        'ZNF638_mut', 'DNAH14_mut', 'SYNE2_mut', 'GPR179_mut', 'INTS1_mut',
        'RANBP2_mut', 'OTOGL_mut', 'KALRN_mut', 'TNR_mut', 'RTTN_del'],
       dtype=object),
 array(['FLG2_mut', 'TTN_mut', 'CACNA1B_mut', 'TP53_mut', 'DNAH2_mut',
        'PAPPA2_mut', 'ZNF638_mut', 'DNAH14_mut', 'SYNE2_mut',
        'GPR179_mut', 'INTS1_mut', 'RANBP2_mut', 'DSCAM_mut', 'OTOGL_mut',
        'KALRN_mut', 'TNR_m

In [None]:
sns.

In [11]:
max_score_svm = max(svm_scores)

for i in range(len(svm_scores)) :
    
    if svm_scores[i] == max_score_svm :
        
        feat = svm_features_included[i] 
        
        print('The maximum test accuracy is:', max_score_svm)
        print('The number of features included is:', len(feat))
        print('The included features are: \n', feat)


The maximum test accuracy is: 0.8985507246376812
The number of features included is: 114
The included features are: 
 ['FLG2_mut' 'TNRC6B_mut' 'RERE_mut' 'PDE4DIP_mut' 'NEB_mut' 'TTN_mut'
 'DOCK2_mut' 'DSP_mut' 'PCLO_mut' 'RELN_mut' 'BRAF_mut' 'KMT2C_mut'
 'CSMD3_mut' 'PLEC_mut' 'CACNA1B_mut' 'CDH23_mut' 'MKI67_mut' 'AHNAK_mut'
 'MYO7A_mut' 'KMT2D_mut' 'MYH6_mut' 'HERC2_mut' 'FBN1_mut' 'PKD1_mut'
 'CREBBP_mut' 'ANKRD11_mut' 'TP53_mut' 'DNAH2_mut' 'EVPL_mut' 'RNF213_mut'
 'LAMA1_mut' 'PTPRS_mut' 'FBN3_mut' 'MUC16_mut' 'MYO18B_mut' 'PAPPA2_mut'
 'KIAA1549_mut' 'PXDNL_mut' 'TENM4_mut' 'ZNF638_mut' 'ALMS1_mut'
 'DNAH7_mut' 'LRBA_mut' 'ADGRV1_mut' 'LRRK2_mut' 'HMCN1_mut' 'DNAH14_mut'
 'BPTF_mut' 'IGFN1_mut' 'KIF26B_mut' 'MAP2_mut' 'FRAS1_mut' 'FBN2_mut'
 'NAV3_mut' 'KIF26A_mut' 'CNTNAP4_mut' 'ZNF831_mut' 'VPS13B_mut'
 'CSMD1_mut' 'SYNE2_mut' 'CFAP65_mut' 'FSIP2_mut' 'FREM1_mut' 'CNTRL_mut'
 'ZFHX3_mut' 'BOD1L1_mut' 'ABCA13_mut' 'COL22A1_mut' 'GPR179_mut'
 'CNTNAP5_mut' 'INTS1_mut' 'RANBP2_m

The highest training accuracy is 100% with a corresponding test accuracy of 90% when 93 features are used.
From here, we will look at which features are retained to produce this accuracy.

The following code creates a new dataframe from the original but with only the features selected from above

In [12]:
svm_features = df.loc[:, feat]

In [13]:
svm_features.shape

(231, 120)

##### Logistic Regression

In [18]:
# Recursive Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# create a base classifier used to evaluate a subset of attributes
lr = LogisticRegression(solver = 'lbfgs', multi_class = 'auto', max_iter = 1000)

log_features_included = []
log_scores = []


## for i in range(len(columns)):
for i in range(1,len(X.columns),3) :

# keep the top n features
    log_rfe = RFE(lr, i)
    log_rfe = log_rfe.fit(X, y_new)
    

##update X dataframe to include only selected features
    X_train, X_test, y_train, y_test = train_test_split(X.iloc[:, log_rfe.support_], y_new, test_size=0.3, random_state=32)


## run regression
    lr.fit(X_train, y_train) 

    log_train_acc = lr.score(X_train, y_train)
    log_test_acc = lr.score(X_test, y_test)
    
    log_features_included.append(X.iloc[:, log_rfe.support_].columns.values)
    log_scores.append(log_test_acc)
    
    print('Features used:', len(X_train.columns.values))
    print('LOG: The training accuracy is:', log_train_acc)
    print('LOG: The testing accuracy is:', log_test_acc)


Features used: 1
LOG: The training accuracy is: 0.5590062111801242
LOG: The testing accuracy is: 0.6231884057971014
Features used: 4
LOG: The training accuracy is: 0.6459627329192547
LOG: The testing accuracy is: 0.5362318840579711
Features used: 7
LOG: The training accuracy is: 0.6708074534161491
LOG: The testing accuracy is: 0.5942028985507246
Features used: 10
LOG: The training accuracy is: 0.6521739130434783
LOG: The testing accuracy is: 0.6521739130434783
Features used: 13
LOG: The training accuracy is: 0.6832298136645962
LOG: The testing accuracy is: 0.6956521739130435
Features used: 16
LOG: The training accuracy is: 0.7453416149068323
LOG: The testing accuracy is: 0.6086956521739131
Features used: 19
LOG: The training accuracy is: 0.8012422360248447
LOG: The testing accuracy is: 0.6376811594202898
Features used: 22
LOG: The training accuracy is: 0.7950310559006211
LOG: The testing accuracy is: 0.6811594202898551
Features used: 25
LOG: The training accuracy is: 0.8012422360248447

Features used: 220
LOG: The training accuracy is: 1.0
LOG: The testing accuracy is: 0.6811594202898551
Features used: 223
LOG: The training accuracy is: 1.0
LOG: The testing accuracy is: 0.6956521739130435
Features used: 226
LOG: The training accuracy is: 1.0
LOG: The testing accuracy is: 0.6956521739130435
Features used: 229
LOG: The training accuracy is: 1.0
LOG: The testing accuracy is: 0.6811594202898551
Features used: 232
LOG: The training accuracy is: 1.0
LOG: The testing accuracy is: 0.6376811594202898
Features used: 235
LOG: The training accuracy is: 1.0
LOG: The testing accuracy is: 0.6376811594202898
Features used: 238
LOG: The training accuracy is: 1.0
LOG: The testing accuracy is: 0.6231884057971014
Features used: 241
LOG: The training accuracy is: 1.0
LOG: The testing accuracy is: 0.6086956521739131
Features used: 244
LOG: The training accuracy is: 1.0
LOG: The testing accuracy is: 0.6086956521739131
Features used: 247
LOG: The training accuracy is: 1.0
LOG: The testing ac

In [15]:
max_score = max(log_scores)

for i in range(len(log_scores)) :
    
    if log_scores[i] == max_score :
        
        feat = log_features_included[i] 
        
        print('The maximum test accuracy is:', max_score)
        print('The number of features included is:', len(feat))
        print('The included features are: \n', feat)


The maximum test accuracy is: 0.7971014492753623
The number of features included is: 159
The included features are: 
 ['TCHH_mut' 'FLG2_mut' 'SCN9A_mut' 'ANK2_mut' 'TNRC6B_mut' 'RERE_mut'
 'SPEN_mut' 'COL11A1_mut' 'PDE4DIP_mut' 'TTN_mut' 'PIK3CA_mut' 'DNAH5_mut'
 'DOCK2_mut' 'DSP_mut' 'LAMA2_mut' 'BRAF_mut' 'KMT2C_mut' 'CHD7_mut'
 'CSMD3_mut' 'SCRIB_mut' 'PLEC_mut' 'NOTCH1_mut' 'CUBN_mut' 'CDH23_mut'
 'MKI67_mut' 'MYO7A_mut' 'KMT2D_mut' 'ANO4_mut' 'ACACB_mut' 'BRCA2_mut'
 'MYH6_mut' 'HERC2_mut' 'FBN1_mut' 'PKD1_mut' 'CREBBP_mut' 'ANKRD11_mut'
 'TP53_mut' 'DNAH2_mut' 'EVPL_mut' 'LAMA1_mut' 'PTPRS_mut' 'FBN3_mut'
 'MUC16_mut' 'CHD6_mut' 'LAMA5_mut' 'COL20A1_mut' 'MYO18B_mut'
 'MT-CO3_mut' 'PAPPA2_mut' 'KIAA1549_mut' 'PXDNL_mut' 'UBR5_mut'
 'TENM4_mut' 'MT-ATP6_mut' 'ZNF638_mut' 'ALMS1_mut' 'XIRP2_mut'
 'DNAH7_mut' 'LRBA_mut' 'VCAN_mut' 'ADGRV1_mut' 'SDK1_mut' 'TG_mut'
 'VPS13A_mut' 'DNHD1_mut' 'LRRK2_mut' 'MACF1_mut' 'HMCN1_mut' 'DNAH14_mut'
 'THSD7B_mut' 'STARD9_mut' 'TLN2_mut' 'BPTF_mu

In [16]:
log_features = df.loc[:, feat]

In [17]:
log_features.shape

(231, 159)

#### Regularization

##### Lasso 