In [55]:
import csv
import numpy as np
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn import metrics
from termcolor import colored 
from sklearn.svm import LinearSVC
#Splitting data
from sklearn.model_selection import train_test_split
#Hyperparameter tuning using GridSearchCV for SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import cross_val_score

In [56]:
dataset= pd.read_csv("breast_cancer_genomic.csv")
dataset

Unnamed: 0,CLID,Class,Hs.2256,Hs.346950,Hs.256697,Hs.434053,Hs.194726,Hs.74624,Hs.1578,Hs.30743,...,Hs.118962,Hs.250822,Hs.82563,Hs.418533,Hs.433416,Hs.436348,Hs.388664,Hs.302690,Hs.293885,Hs.82109
0,Sample 115,0,-0.227,0.093,0.330,-0.672,0.197,3.433,2.044,-1.421,...,-0.813,3.024,0.272,-0.751,-1.259,0.240,-1.170,-0.070,-0.164,-0.680
1,Exp21630,0,-1.482,0.326,0.112,-0.649,-0.727,0.033,1.103,1.782,...,-1.215,0.379,0.376,-0.845,-0.611,-0.270,-1.038,-0.830,0.065,-0.711
2,Exp21626,0,-1.796,-0.346,3.006,-0.195,0.213,-0.628,0.488,1.429,...,-0.069,2.760,1.447,-0.712,0.564,0.730,-0.394,-1.313,-1.117,0.428
3,Exp21611,0,-3.011,-0.373,-0.161,1.019,6.353,0.147,0.330,-0.838,...,-0.071,-0.056,-0.855,-0.356,-0.313,0.306,0.178,0.083,-0.409,-0.167
4,Sample 39 >5 yr survival(132 months) age 46 e...,0,-0.437,3.585,-1.209,0.448,2.649,3.110,-0.695,-0.987,...,-0.682,0.129,0.683,-0.311,0.383,0.963,-0.848,-1.383,0.193,0.592
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,BC120A-BE,4,0.656,5.500,0.139,-0.961,-0.111,-0.113,0.090,-3.591,...,0.414,1.078,1.990,0.581,1.840,0.733,-0.292,-1.300,0.518,-0.701
245,Sample 60 <5 yr survival(59 months) age 45 e...,4,-0.505,4.908,0.505,0.355,-0.878,-0.539,-0.592,-2.076,...,-0.607,0.285,-0.324,0.531,-0.413,0.459,-0.360,-0.078,-0.164,0.749
246,BC213B-BE,4,0.377,-2.193,-0.475,0.228,-0.415,-1.059,-0.520,-2.844,...,-0.006,-1.010,0.796,-0.465,-0.723,-0.581,0.344,2.407,-0.167,1.042
247,Sample 22 >5 yr survival(106 months) age 45 e...,4,-0.088,0.063,0.234,0.196,-0.194,1.419,-1.024,-0.604,...,0.094,-1.571,0.302,-0.539,-0.326,-0.460,0.223,0.719,-0.828,0.052


In [57]:
# Prepare X and y with features and ground truth
X = dataset.drop(columns=["Class","CLID"])
y = dataset["Class"]

In [58]:
# now split the data into training data and test data (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [59]:
# create our model and fit it to our training data
clf = SVC(kernel='linear',C=1) # default values for kernel and C 
clf.fit(X_train, y_train) 
# make predictions on test data
predicted = clf.predict(X_test)
target_names = ['Luminal B','Luminal A','Normal-like','Basal-like','HER2+']

In [60]:
# print confusion matrix. 
# **Note** since the ground truth is not binary (has 3 labels), the confusion matrix is a 3x3 matrix
print(colored('confusion matrix:\n', 'green'), metrics.confusion_matrix(y_test, predicted))

# print classifier accuracy
print(colored('\naccuracy:', 'blue'), metrics.accuracy_score(y_test, predicted))

# print classification report (Precision, reall, and F1 score for each label, and average)
print(colored('\nclassification report:\n', 'green'),metrics.classification_report(y_test, predicted,target_names = target_names))

# print Accuracy per class
cm = metrics.confusion_matrix(y_test, predicted)
#Now the normalize the diagonal entries
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#The diagonal entries are the accuracies of each class
print(colored('\nAccuracy per class:', 'red'),cm.diagonal())

[32mconfusion matrix:
[0m [[ 6  1  0  0  0]
 [ 0 20  0  0  0]
 [ 0  0  2  0  0]
 [ 0  0  1 13  1]
 [ 0  0  0  0  6]]
[34m
accuracy:[0m 0.94
[32m
classification report:
[0m               precision    recall  f1-score   support

   Luminal B       1.00      0.86      0.92         7
   Luminal A       0.95      1.00      0.98        20
 Normal-like       0.67      1.00      0.80         2
  Basal-like       1.00      0.87      0.93        15
       HER2+       0.86      1.00      0.92         6

    accuracy                           0.94        50
   macro avg       0.90      0.94      0.91        50
weighted avg       0.95      0.94      0.94        50

[31m
Accuracy per class:[0m [0.85714286 1.         1.         0.86666667 1.        ]


Hyperparameter tuning using GridSearchCV

In [61]:
#Evaluate each tuned model by averaging 10 runs of stratified 10-fold cross-validation.
#Capture overall accuracy plus accuracy, precision, and recall by cancer type.
kernel = ['linear', 'rbf', 'poly']
parameters = {'kernel':('linear', 'rbf', 'poly'),'C':[1, 5, 10]}
model_parameters = {
    'par1': {
        'kernel' : ['linear'], 
         'C':[1, 5, 10]
    },
    'par2': {
        'kernel':['rbf'],
         'C':[1, 5, 10]
    },
    'par2': {
        'kernel':['poly'],
         'C':[1, 5, 10]
        
    }
}

svc = SVC()
cv = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)

for model_name, parameters in model_parameters.items():
    
    clf = GridSearchCV(svc, parameters, cv=cv,n_jobs=-1) # n_jobs -> number of parallel jobs
                                                   # -1 -> whatever the architecture allows
    clf.fit(X_train, y_train)

    print(colored('\nBest parameters:', 'green'), clf.best_params_,"\n") # print best parameters

    # make predictions on test data
    predicted = clf.predict(X_test)

    # print accuracy
    print(colored('\naccuracy:', 'green'), metrics.accuracy_score(y_test, predicted))

    # print precision and recall statistics
    print(colored('\nclassification report:\n', 'green'),metrics.classification_report(y_test, predicted,target_names = target_names))

    # print confusion matrix
    print(colored('confusion matrix:\n', 'green'),metrics.confusion_matrix(y_test, predicted))
    
    # print Accuracy per class
    cm = metrics.confusion_matrix(y_test, predicted)
    #Now the normalize the diagonal entries
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    #The diagonal entries are the accuracies of each class
    print(colored('\nAccuracy per class:', 'red'),cm.diagonal())

[32m
Best parameters:[0m {'C': 1, 'kernel': 'linear'} 

[32m
accuracy:[0m 0.94
[32m
classification report:
[0m               precision    recall  f1-score   support

   Luminal B       1.00      0.86      0.92         7
   Luminal A       0.95      1.00      0.98        20
 Normal-like       0.67      1.00      0.80         2
  Basal-like       1.00      0.87      0.93        15
       HER2+       0.86      1.00      0.92         6

    accuracy                           0.94        50
   macro avg       0.90      0.94      0.91        50
weighted avg       0.95      0.94      0.94        50

[32mconfusion matrix:
[0m [[ 6  1  0  0  0]
 [ 0 20  0  0  0]
 [ 0  0  2  0  0]
 [ 0  0  1 13  1]
 [ 0  0  0  0  6]]
[31m
Accuracy per class:[0m [0.85714286 1.         1.         0.86666667 1.        ]
[32m
Best parameters:[0m {'C': 5, 'kernel': 'poly'} 

[32m
accuracy:[0m 0.78
[32m
classification report:
[0m               precision    recall  f1-score   support

   Luminal B      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [62]:
# get r2 cross validation scores
#SVC
scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='r2')
    
print(scores)
print('Mean: ', np.mean(scores)) 
print('Standard deviation: ', np.std(scores))  

[0.20918367 0.23469388 0.35672515 0.35672515 0.59064327 0.21052632
 0.61988304 0.61988304 0.41520468 0.18970588]
Mean:  0.38031740766482025
Standard deviation:  0.1664347929133342


In [63]:
# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')

# Fit our data
Fit = model.fit(X_train,y_train)
y_pred = model.predict(X_test)
#print(y_pred)

# get r2 cross validation scores
scores = cross_val_score(model, X_train, y_train, cv=10, scoring='r2')
    
print(scores)
print('Mean: ', np.mean(scores)) 
print('Standard deviation: ', np.std(scores))  

[0.9744898  1.         0.97076023 0.97076023 1.         0.70760234
 0.94152047 0.9122807  0.94152047 0.97205882]
Mean:  0.9390993063892225
Standard deviation:  0.08129159955019517


In [64]:

#Confusion matrix for MLR
print(colored('confusion matrix:\n', 'green'), metrics.confusion_matrix(y_test, y_pred))

# print classifier accuracy
print(colored('\naccuracy:', 'blue'), metrics.accuracy_score(y_test, y_pred))

# print classification report (Precision, reall, and F1 score for each label, and average)
print(colored('\nclassification report:\n', 'green'),metrics.classification_report(y_test, y_pred,target_names = target_names))

# print Accuracy per class
cm = metrics.confusion_matrix(y_test, y_pred)
#Now the normalize the diagonal entries
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#The diagonal entries are the accuracies of each class
print(colored('\nAccuracy per class:', 'red'),cm.diagonal())


[32mconfusion matrix:
[0m [[ 6  1  0  0  0]
 [ 0 20  0  0  0]
 [ 0  0  2  0  0]
 [ 0  0  1 14  0]
 [ 0  0  0  0  6]]
[34m
accuracy:[0m 0.96
[32m
classification report:
[0m               precision    recall  f1-score   support

   Luminal B       1.00      0.86      0.92         7
   Luminal A       0.95      1.00      0.98        20
 Normal-like       0.67      1.00      0.80         2
  Basal-like       1.00      0.93      0.97        15
       HER2+       1.00      1.00      1.00         6

    accuracy                           0.96        50
   macro avg       0.92      0.96      0.93        50
weighted avg       0.97      0.96      0.96        50

[31m
Accuracy per class:[0m [0.85714286 1.         1.         0.93333333 1.        ]


In [65]:
from sklearn.linear_model import Ridge

# Train model with default alpha=1
ridge = Ridge(alpha=1)
ridge.fit(X_train, y_train)

# get cross val scores
scores = cross_val_score(ridge, X_train, y_train, cv=10, scoring='r2')

print(scores)
print('Mean: ', np.mean(scores)) 
print('Standard deviation: ', np.std(scores))  

[0.1834736  0.13684981 0.57944082 0.7303196  0.07983139 0.27748869
 0.46209443 0.5869003  0.4343926  0.25984838]
Mean:  0.37306396211852605
Standard deviation:  0.20671622503401277


In [66]:
grid = GridSearchCV(estimator=ridge, param_grid={'alpha':[0.01, 0.05, 0.1, 1, 5]}, scoring='r2', n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

# Now import the results into a dataframe 
df = pd.DataFrame(grid.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.016177,0.003931,0.011055,0.00275,0.01,{'alpha': 0.01},0.342128,0.543406,0.428053,0.494664,0.469967,0.455644,0.067937,5
1,0.011992,0.001842,0.008513,0.001304,0.05,{'alpha': 0.05},0.34319,0.544429,0.428883,0.495679,0.4704,0.456516,0.067914,4
2,0.009827,0.000887,0.008288,0.001516,0.1,{'alpha': 0.1},0.344505,0.545697,0.429912,0.496937,0.470938,0.457598,0.067887,3
3,0.009444,0.000836,0.006438,0.000582,1.0,{'alpha': 1},0.366015,0.566513,0.447131,0.517735,0.479994,0.475478,0.067645,2
4,0.008444,0.001462,0.004949,0.001126,5.0,{'alpha': 5},0.430343,0.630018,0.502988,0.581925,0.510585,0.531172,0.068893,1


In [67]:
df[['param_alpha', 'mean_test_score']]

Unnamed: 0,param_alpha,mean_test_score
0,0.01,0.455644
1,0.05,0.456516
2,0.1,0.457598
3,1.0,0.475478
4,5.0,0.531172


In [68]:
#Find the best score and the best parameter (alpha value)
print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

Best Score:  0.5311718672186765
Best Params:  {'alpha': 5}


In [69]:
from sklearn.linear_model import Lasso

# Train model with default alpha=1
lasso = Lasso(alpha=1)
lasso.fit(X_train, y_train)

# get cross val scores
scores = cross_val_score(lasso, X_train, y_train, cv=10, scoring='r2')

print(scores)
print('Mean: ', np.mean(scores)) 
print('Standard deviation: ', np.std(scores))  

[0.37606745 0.21609544 0.44678675 0.34438138 0.47402472 0.19697579
 0.31608458 0.47392758 0.15907405 0.4562517 ]
Mean:  0.3459669447526167
Standard deviation:  0.11440957771593507


In [70]:
grid = GridSearchCV(estimator=ridge, param_grid={'alpha':[1e-12, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 0.1, 1, 5, 10, 20, 50, 100, 500, 1000]}, scoring='r2', n_jobs=-1)
grid_result = grid.fit(X_train, y_train)

df = pd.DataFrame(grid.cv_results_)

print('Best Score: ', grid_result.best_score_)
print('Best Params: ', grid_result.best_params_)

  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,
  dual_coef = linalg.solve(K, y, sym_pos=True,


Best Score:  0.7147965236896334
Best Params:  {'alpha': 100}


In [71]:
df[['param_alpha', 'mean_test_score']]

Unnamed: 0,param_alpha,mean_test_score
0,0.0,0.455424
1,0.0,0.455424
2,0.0,0.455424
3,0.0001,0.455427
4,0.001,0.455446
5,0.01,0.455644
6,0.1,0.457598
7,1.0,0.475478
8,5.0,0.531172
9,10.0,0.573331


print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred,target_names=target_names))

In [72]:
#Confusion matrix for MLR
print(colored('confusion matrix:\n', 'green'), metrics.confusion_matrix(y_test, y_pred))

# print classifier accuracy
print(colored('\naccuracy:', 'blue'), metrics.accuracy_score(y_test, y_pred))

# print classification report (Precision, reall, and F1 score for each label, and average)
print(colored('\nclassification report:\n', 'green'),metrics.classification_report(y_test, y_pred,target_names = target_names))

# print Accuracy per class
cm = metrics.confusion_matrix(y_test, y_pred)
#Now the normalize the diagonal entries
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#The diagonal entries are the accuracies of each class
print(colored('\nAccuracy per class:', 'red'),cm.diagonal())

[32mconfusion matrix:
[0m [[ 6  1  0  0  0]
 [ 0 20  0  0  0]
 [ 0  0  2  0  0]
 [ 0  0  1 14  0]
 [ 0  0  0  0  6]]
[34m
accuracy:[0m 0.96
[32m
classification report:
[0m               precision    recall  f1-score   support

   Luminal B       1.00      0.86      0.92         7
   Luminal A       0.95      1.00      0.98        20
 Normal-like       0.67      1.00      0.80         2
  Basal-like       1.00      0.93      0.97        15
       HER2+       1.00      1.00      1.00         6

    accuracy                           0.96        50
   macro avg       0.92      0.96      0.93        50
weighted avg       0.97      0.96      0.96        50

[31m
Accuracy per class:[0m [0.85714286 1.         1.         0.93333333 1.        ]


In [73]:
# Build the model for Decision Tree
par = {'criterion' : ['gini'], 'random_state':[249], 'max_depth' : [None], 'min_samples_leaf':[1]}
#model.fit(X_train,y_train)


In [74]:
clf = GridSearchCV(DecisionTreeClassifier(), par, cv=10)
clf.fit(X_train,y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini'], 'max_depth': [None],
                         'min_samples_leaf': [1], 'random_state': [249]})

In [75]:
#Decision Trees Confusion Matrix
y_predd = clf.predict(X_test)

print(colored('confusion matrix:\n', 'green'), metrics.confusion_matrix(y_test, y_predd))

# print classifier accuracy
print(colored('\naccuracy:', 'blue'), metrics.accuracy_score(y_test, y_predd))

# print classification report (Precision, reall, and F1 score for each label, and average)
print(colored('\nclassification report:\n', 'green'),metrics.classification_report(y_test, y_predd,target_names = target_names))

# print Accuracy per class
cm = metrics.confusion_matrix(y_test, y_predd)
#Now the normalize the diagonal entries
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#The diagonal entries are the accuracies of each class
print(colored('\nAccuracy per class:', 'red'),cm.diagonal())

[32mconfusion matrix:
[0m [[ 3  3  0  0  1]
 [ 6 13  1  0  0]
 [ 0  1  1  0  0]
 [ 2  0  0 12  1]
 [ 0  0  0  2  4]]
[34m
accuracy:[0m 0.66
[32m
classification report:
[0m               precision    recall  f1-score   support

   Luminal B       0.27      0.43      0.33         7
   Luminal A       0.76      0.65      0.70        20
 Normal-like       0.50      0.50      0.50         2
  Basal-like       0.86      0.80      0.83        15
       HER2+       0.67      0.67      0.67         6

    accuracy                           0.66        50
   macro avg       0.61      0.61      0.61        50
weighted avg       0.70      0.66      0.68        50

[31m
Accuracy per class:[0m [0.42857143 0.65       0.5        0.8        0.66666667]


In [76]:
#Decision Trees Mean and Standard Deviation
scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='r2')
    
print(scores)
print('Mean: ', np.mean(scores)) 
print('Standard deviation: ', np.std(scores)) 

[0.41326531 0.74489796 0.73684211 0.64912281 0.73684211 0.15204678
 0.21052632 0.32748538 0.9122807  0.13382353]
Mean:  0.5017132993548297
Standard deviation:  0.272136523489935


In [77]:
f, pval = stats.ttest_ind(y_predd,y_pred)
print("Decision Tree vs MultinomialLogisticRegression")
print('F statistic: %.3f' % f)
print('p value: %.3f' % pval)
if pval <0.05:
  print("we reject null hypothesis")
else:
  print("we accept null hypothesis")

Decision Tree vs MultinomialLogisticRegression
F statistic: -0.446
p value: 0.657
we accept null hypothesis


In [78]:
f, pval = stats.ttest_ind(y_pred,predicted)
print("MLR vs SVC")
print('F statistic: %.3f' % f)
print('p value: %.3f' % pval)
if pval <0.05:
  print("we reject null hypothesis")
else:
  print("we accept null hypothesis")

MLR vs SVC
F statistic: 1.080
p value: 0.283
we accept null hypothesis


In [79]:
f, pval = stats.ttest_ind(y_predd,predicted)
print("Decision Tree vs SVC")
print('F statistic: %.3f' % f)
print('p value: %.3f' % pval)
if pval <0.05:
  print("we reject null hypothesis")
else:
  print("we accept null hypothesis")

Decision Tree vs SVC
F statistic: 0.555
p value: 0.580
we accept null hypothesis
