In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# from imblearn.over_sampling import SMOTE #for SMOTE -> install package using: conda install -c conda-forge imbalanced-learn 
from scipy import stats, integrate
import matplotlib.pyplot as plt
import ggplot
import scipy
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.preprocessing import OneHotEncoder
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

import pylab as pl
from itertools import cycle
from sklearn import cross_validation
from sklearn.svm import SVC

features_list = ['age','sex','cp','trestbps','chol','fbs','restecg','thalach','exang','oldpeak','slope','ca','thal','class']
dataset1=pd.read_csv("Heart_Disease_Data.csv")

dataset1 = dataset1.convert_objects(convert_numeric=True)
dataset1.astype('float')

dataset1 = dataset1.fillna(value=0)

# based on https://pdfs.semanticscholar.org/daa0/f01f96a89fcfc5f41a2da67fb2a8966900ab.pdf
Genetic_Based_Decision = dataset1[['cp','trestbps', 'restecg', 'thalach', 'ca', 'thal']]

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools


In [2]:
# SVM requires that each data instance is represented as a vector of real numbers
# If you already have numeric dtypes (int8|16|32|64,float64,boolean) you can convert it to another "numeric" dtype using Pandas .astype() method. Demo: In [90]: df = pd.DataFrame(np.random.randint(10**5,10**7,(5,3)),columns=list('abc'), dtype=np.int64) In [91]: df Out[91]: a b c 0 9059440 9590567 2076918 1 5861102 4566089 1947323 2 6636568 162770 2487991 3 6794572 5236903 5628779 4 470121 4044395 4546794 In [92]: df.dtypes Out[92]: a int64 b int64 c int64 dtype: object In [93]: df['a'] = df['a'].astype(float) In [94]: df.dtypes Out[94]: a float64 b int64 c int64 dtype: object It won't work for object (string) dtypes, that can't be converted to numbers: In [95]: df.loc[1, 'b'] = 'XXXXXX' In [96]: df Out[96]:...
# Just make everything numeric for ease
dataset1 = dataset1.convert_objects(convert_numeric=True)
dataset1 = dataset1.astype('float')

# Two variables are discrete/ordinal: ca (number of major vessels colored by fluoroscopy) and num (diagnosis of heart disease)
# Three can be directly viewed as 1 hot (because binary): 'sex':'male', 'fbs':'fasting blood sugar', 'exang':'exercise induced angina'

# which leaves 4 for one-hot encoding. problem is that the values aren't unique, so have to manually
# make extra columns:

dataset1["cp"] = dataset1["cp"].replace([1,2,3,4], ["typical angina", "atypical angina", "non-angina", "asymptomatic angina"])
dataset1["restecg"] = dataset1["restecg"].replace([0,1,2], ["normalresecg", "ST-T wave abnormality", "left ventricular hypertrophy"])
dataset1["slop"] = dataset1["slop"].replace([1,2,3], ["upsloping", "flat", "downsloping"])
dataset1["thal"] = dataset1["thal"].replace([3,6,7], ["normalthal", "fixed defect", "reversible defect"])

x = dataset1[['cp', 'restecg', 'slop', 'thal']]
for column in ['cp', 'restecg', 'slop', 'thal']:
    one_hot = pd.get_dummies(dataset1[column])
    dataset1 = dataset1.drop(column, axis=1)
    dataset1 = dataset1.join(one_hot)

  after removing the cwd from sys.path.


In [3]:
### Extract features and labels from dataset for local testing:
dataset1.dropna(inplace=True, axis=0, how="any")
Y=dataset1["pred_attribute"]
dataset1 = dataset1.drop("pred_attribute", axis=1)
X=dataset1

In [4]:
# evaluate the model by splitting into train and test sets  #Edit by ryan, we aim to do 3 traditional sets in the end, this first split is 80/20
features_train, features_test, labels_train, labels_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [5]:
import collections

list1 = []
for i in labels_train:
    list1.append(i)
counter=collections.Counter(list1)
print(counter)

list2 = []
for i in labels_test:
    list2.append(i)
counter=collections.Counter(list2)
print(counter)

Counter({0.0: 129, 1.0: 42, 3.0: 31, 2.0: 30, 4.0: 10})
Counter({0.0: 35, 1.0: 13, 2.0: 6, 3.0: 4, 4.0: 3})


In [6]:
# Check
print(len(features_train)/(len(features_train)+ len(features_test)))

0.7986798679867987


We have an relatively small dataset. Therefore, we should do our feature selection based on a cross-
validated set. We will check this assumption by comparing the scores on a cross-validated set vs the simple split.

In [7]:
features_train_cross, features_test_cross, labels_train_cross, labels_test_cross = train_test_split(X, Y, test_size=0.2, random_state=0)

### SMOTE for SVM - Balancing only on the training set, not the validation set  [This is for the traditional training -not the cross validated one]

In [93]:
#further divide the 'traditional' non-cross set into training 80/20  for pure training and cross validation  
features_train_notoversampled, features_validate, labels_train_notoversampled, labels_validate = train_test_split(features_train, labels_train, test_size = .2, random_state=0)

sm = SMOTE(random_state=0, ratio = 1.0, kind= 'svm' )
#x_train_res, y_train_res = sm.fit_sample(x_train, y_train)
features_train_oversampled, labels_train_oversampled = sm.fit_sample(features_train_notoversampled, labels_train_notoversampled)

#re-enter into original variables
##features_train = features_train_oversampled
##labels_train = labels_train_oversampled

#Below 2 lines if we want to want to force the array back into dataframe    
##features_train = pd.DataFrame(features_train_oversampled,columns=["age","sex","cp","trestbps","chol","fbs","restecg","thalach","exang","oldpeak","slop","ca","thal"])
##labels_train = pd.DataFrame(labels_train_oversampled,columns=["pred_attribute"])



In [8]:
#SVC Models are only any good when the data is scaled. Lets scale the data and build the model
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing.data import QuantileTransformer
scaler = MinMaxScaler()
Standard_scaler = StandardScaler()
Robust_scaler = preprocessing.RobustScaler(quantile_range=(25, 75))
Quantile_scalar = preprocessing.QuantileTransformer(output_distribution='normal')

features_train = Standard_scaler.fit_transform(features_train)
features_test = Standard_scaler.transform(features_test)

features_train

array([[-1.13185208,  0.67015058, -1.27885728, ..., -0.27487371,
         0.99176941, -0.84635221],
       [ 0.07286213,  0.67015058,  1.57668306, ..., -0.27487371,
        -1.0082989 ,  1.1815412 ],
       [-0.03665734,  0.67015058, -0.70774921, ..., -0.27487371,
        -1.0082989 ,  1.1815412 ],
       ...,
       [-2.11752735, -1.49220195,  0.32024531, ..., -0.27487371,
         0.99176941, -0.84635221],
       [-0.47473524,  0.67015058,  1.00557499, ..., -0.27487371,
        -1.0082989 ,  1.1815412 ],
       [ 0.51094003, -1.49220195,  2.37623435, ..., -0.27487371,
         0.99176941, -0.84635221]])

In [9]:
from sklearn.metrics import accuracy_score
from sklearn import grid_search
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.metrics import mean_squared_error

def checkmetrics(pred, labels_test, name):
    print('The accuracy is of a', name, 'is: ', accuracy_score(pred, labels_test))
    # print 'if everyone had 0 score: ', float(float(len(pred))-float(numberpoi))/float(len(pred))
    matrix = confusion_matrix(labels_test, pred)
#  print('There are', matrix[0][0], 'healthy people correctly identified vs', matrix[2][2] +matrix[3][3] +matrix[4][4] +matrix[1][1], 'sick ones. See:\n', matrix)
    print(matrix)
    print(classification_report(pred, labels_test))
    final_mse = mean_squared_error(labels_test, pred)
    final_rmse = np.sqrt(final_mse)
    print('mean square error', final_rmse)



## Feature selection?

In [10]:
# Feature selection using RFECV to pick best features,
from sklearn.svm import SVR
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RandomizedLasso
from sklearn.feature_selection import RFECV
rlasso = RandomizedLasso(alpha=0.025)
names = features_list
rlasso.fit(features_train, labels_train)
 
print("Features sorted by their score using lasso:")
print(sorted(zip(map(lambda x: round(x, 4), rlasso.scores_), 
                 names), reverse=True))

#use linear regression as the model
lr = LinearRegression()
#rank all features, i.e continue the elimination until the last one
rfe = RFE(lr, n_features_to_select=1)
rfe.fit(X,Y)
 
print("Features sorted by their score using Linear Regression:")
print(sorted(zip(map(lambda x: round(x, 4), rfe.ranking_), names)))

Features sorted by their score using lasso:
[(0.495, 'oldpeak'), (0.46, 'exang'), (0.45, 'thalach'), (0.41, 'restecg'), (0.245, 'fbs'), (0.005, 'ca'), (0.0, 'trestbps'), (0.0, 'thal'), (0.0, 'slope'), (0.0, 'sex'), (0.0, 'cp'), (0.0, 'class'), (0.0, 'chol'), (0.0, 'age')]
Features sorted by their score using Linear Regression:
[(1, 'oldpeak'), (2, 'class'), (6, 'exang'), (7, 'restecg'), (8, 'sex'), (9, 'thal'), (10, 'thalach'), (14, 'ca'), (18, 'slope'), (19, 'chol'), (20, 'cp'), (21, 'age'), (22, 'fbs'), (23, 'trestbps')]




## Linear kernel

In [11]:
parameters ={'C': [0.1,0.2,0.5,1,2,3,4,5], 
             'gamma': [0.00001,0.01,0.05,0.1,0.2,0.5,1,2,3,4,5], 
             "class_weight": ['balanced', None]}
SVM = svm.SVC(kernel="linear")
grid_search = GridSearchCV(SVM, parameters, cv=10)
grid_search.fit(features_train, labels_train)

print("The train score:", str(grid_search.score(features_train, labels_train)), "with parameters:", grid_search.best_params_)

pred = grid_search.predict(features_test)
checkmetrics(pred, labels_test, 'No SMOTE - sq hinge - Validate - support vector machine linear')

# Compare with one-versus all:
parameters ={'C': [0.1,0.2,0.5,1,2,3,4,5],
             "class_weight": ['balanced', None], 
             'gamma': [0.00001,0.01,0.05,0.1,0.2,0.5,1,2,3,4,5], 
             'decision_function_shape': ['ovo', 'ovr']}
SVM = svm.SVC(kernel="linear")
grid_search = GridSearchCV(SVM, parameters, cv=10)
grid_search.fit(features_train, labels_train)

print("The train score for ovo:", str(grid_search.score(features_train, labels_train)), 'with parameters:', grid_search.best_params_)

pred = grid_search.predict(features_test)
checkmetrics(pred, labels_test, 'No SMOTE - sq hinge, one vs one - Validate - support vector machine linear')

The train score: 0.7396694214876033 with parameters: {'C': 4, 'class_weight': None, 'gamma': 1e-05}
The accuracy is of a No SMOTE - sq hinge - Validate - support vector machine linear is:  0.5737704918032787
[[31  2  1  1  0]
 [ 7  2  2  2  0]
 [ 4  1  1  0  0]
 [ 0  2  0  1  1]
 [ 0  1  0  2  0]]
             precision    recall  f1-score   support

        0.0       0.89      0.74      0.81        42
        1.0       0.15      0.25      0.19         8
        2.0       0.17      0.25      0.20         4
        3.0       0.25      0.17      0.20         6
        4.0       0.00      0.00      0.00         1

avg / total       0.67      0.57      0.61        61

mean square error 1.06355420218417
The train score for ovo: 0.7396694214876033 with parameters: {'C': 4, 'class_weight': None, 'decision_function_shape': 'ovo', 'gamma': 1e-05}
The accuracy is of a No SMOTE - sq hinge, one vs one - Validate - support vector machine linear is:  0.5737704918032787
[[31  2  1  1  0]
 [ 7  2  2  

Try different model (standard one vs. rest) loss not automatically being squared hinge:

In [12]:

parameters ={'C': [0.1,0.2,0.5,1,2,3,4,5],
            "class_weight": ['balanced', None]}
SVM = LinearSVC(loss="hinge")
grid_search = GridSearchCV(SVM, parameters, cv=10)
grid_search.fit(features_train, labels_train)

print("The train score: ", str(grid_search.score(features_train, labels_train)), 'with parameters:', grid_search.best_params_)

pred = grid_search.predict(features_test)
checkmetrics(pred, labels_test, 'No SMOTE - hinge - Validate - support vector machine linear')

# Compare with Cramer:
parameters ={'C': [0.1,0.2,0.5,1,2,3,4,5],
            "class_weight": ['balanced', None],
             'multi_class':['ovr', 'crammer_singer']}
SVM = LinearSVC(loss="hinge")
grid_search = GridSearchCV(SVM, parameters, cv=10)
grid_search.fit(features_train, labels_train)

print("The train score: ", str(grid_search.score(features_train, labels_train)), 'with parameters:', grid_search.best_params_)

pred = grid_search.predict(features_test)
checkmetrics(pred, labels_test, 'No SMOTE - hinge, one vs rest - Validate - support vector machine linear')

The train score:  0.6570247933884298 with parameters: {'C': 0.5, 'class_weight': None}
The accuracy is of a No SMOTE - hinge - Validate - support vector machine linear is:  0.5573770491803278
[[33  0  1  1  0]
 [ 8  0  1  3  1]
 [ 4  0  0  1  1]
 [ 0  1  0  1  2]
 [ 1  0  0  2  0]]
             precision    recall  f1-score   support

        0.0       0.94      0.72      0.81        46
        1.0       0.00      0.00      0.00         1
        2.0       0.00      0.00      0.00         2
        3.0       0.25      0.12      0.17         8
        4.0       0.00      0.00      0.00         4

avg / total       0.74      0.56      0.64        61

mean square error 1.201092398951751
The train score:  0.6446280991735537 with parameters: {'C': 0.1, 'class_weight': None, 'multi_class': 'ovr'}
The accuracy is of a No SMOTE - hinge, one vs rest - Validate - support vector machine linear is:  0.5737704918032787
[[33  0  1  1  0]
 [ 9  0  1  3  0]
 [ 4  0  1  1  0]
 [ 2  0  1  1  0]
 [ 1  0 

  'recall', 'true', average, warn_for)


## Nonlinear SVM Classification
Polynominal features

Note that when there are multiple features, Polynomial Regression is capable of finding relationships
between features (which is something a plain Linear Regression model cannot do). This is made possible
by the fact that PolynomialFeatures also adds all combinations of features up to the given degree. 

In [13]:
# HOW GRIDSEARCH SHOULD WORK
parameters ={'C': [0.1,0.2,0.5,1,2,3,4,5], 
             'gamma': [0.00001,0.01,0.05,0.1,0.2,0.5,1,2,3,4,5], 
             "class_weight": ['balanced', None],
             "degree": [1,2,3],
             "coef0": [1,10]}
SVM = svm.SVC(kernel="poly")
grid_search = GridSearchCV(SVM, parameters, cv=10)
grid_search.fit(features_train, labels_train)

print("The train score:", str(grid_search.score(features_train, labels_train)), "with parameters:", grid_search.best_params_)

pred = grid_search.predict(features_test)
checkmetrics(pred, labels_test, 'No SMOTE - sq hinge - Validate - support vector machine linear')

# No need to compare with one vs all.

The train score: 0.8016528925619835 with parameters: {'C': 0.5, 'class_weight': None, 'coef0': 10, 'degree': 2, 'gamma': 0.05}
The accuracy is of a No SMOTE - sq hinge - Validate - support vector machine linear is:  0.5901639344262295
[[30  3  1  1  0]
 [ 7  3  0  3  0]
 [ 4  0  2  0  0]
 [ 0  1  1  1  1]
 [ 0  1  0  2  0]]
             precision    recall  f1-score   support

        0.0       0.86      0.73      0.79        41
        1.0       0.23      0.38      0.29         8
        2.0       0.33      0.50      0.40         4
        3.0       0.25      0.14      0.18         7
        4.0       0.00      0.00      0.00         1

avg / total       0.66      0.59      0.62        61

mean square error 1.0558191598757127


## RBF Kernel

In [14]:
# Just like the polynomial features method, the similarity features method can be useful with any Machine
# Learning algorithm, but it may be computationally expensive to compute all the additional features,
# especially on large training sets. However, once again the kernel trick does its SVM magic: it makes it
# possible to obtain a similar result as if you had added many similarity features, without actually having to
# add them

parameters ={'C': [0.1,0.2,0.5,1,2,3,4,5], 
             'gamma': [0.00001,0.01,0.05,0.1,0.2,0.5,1,2,3,4,5], 
             "class_weight": ['balanced', None]}
SVM = svm.SVC(kernel="rbf")
grid_search = GridSearchCV(SVM, parameters, cv=10)
grid_search.fit(features_train, labels_train)

print("The train score:", str(grid_search.score(features_train, labels_train)), "with parameters:", grid_search.best_params_)

pred = grid_search.predict(features_test)
checkmetrics(pred, labels_test, 'No SMOTE - sq hinge - Validate - support vector machine linear')

# no need to compare with one vs all

The train score: 0.6487603305785123 with parameters: {'C': 1, 'class_weight': None, 'gamma': 0.01}
The accuracy is of a No SMOTE - sq hinge - Validate - support vector machine linear is:  0.6721311475409836
[[34  1  0  0  0]
 [ 8  5  0  0  0]
 [ 4  2  0  0  0]
 [ 0  2  0  2  0]
 [ 1  1  0  1  0]]
             precision    recall  f1-score   support

        0.0       0.97      0.72      0.83        47
        1.0       0.38      0.45      0.42        11
        2.0       0.00      0.00      0.00         0
        3.0       0.50      0.67      0.57         3
        4.0       0.00      0.00      0.00         0

avg / total       0.84      0.67      0.74        61

mean square error 1.0


  'recall', 'true', average, warn_for)
