In [None]:
import pandas as pd
import numpy as np
import importlib
import custom_implementations as custom_imp
import matplotlib.pyplot as plt

# Load the heart dataset; fields are separated by runs of whitespace.
# A raw string keeps the regex `\s+` intact (a plain '\s' is an invalid escape
# sequence that newer Python versions warn about).
dataset = pd.read_csv("../Datasets/heart.txt", sep=r'\s+')
# NOTE(review): 'maximum_heart_rate,exercise' and 'induced_angina' look like a
# mis-split of 'maximum_heart_rate' / 'exercise_induced_angina' -- confirm
# against the header row of heart.txt before renaming anything.
cols_of_interest = [
    'age',
    'chest_pain',
    'resting_blood_pressure',
    'serum_chol',
    'resting_electro_results',
    'maximum_heart_rate,exercise',
    'induced_angina',
    'oldpeak',
    'slope_peak_exercise_st',
    'num_major_vessels',
    'absent',
]
dataset = dataset[cols_of_interest]
# Not Absent: 0
# Absent: 1
# NOTE(review): the resampling cells further down select rows with
# absent == 1 as the majority and absent == 0 as the minority; check these two
# constants against the class counts printed below.
minority_class = 1
majority_class = 0
classes = 'absent'
cols = list(dataset.columns.values)
# Position of the label column inside the selected columns
class_index = cols.index(classes)

# Single-argument print(...) calls run unchanged on Python 2 and Python 3
print('%s %s' % ('Number of not absent: ', len(dataset[dataset[classes] == 0])))
print('%s %s' % ('Number of absent: ', len(dataset[dataset[classes] == 1])))

print(cols)

## Find best K for KNN

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Split into data and labels (the label is the last column).
# preprocessing.normalize rescales each sample (row) to unit norm; it does not
# standardise individual features.
x = preprocessing.normalize(dataset.iloc[:, :-1])
y = dataset.iloc[:, -1].values

# Fixed seed so the 75/25 split is reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1, test_size=0.25)

# Project helper that picks K; it receives the split and the full data set
k_best = custom_imp.knn_set_hyper_params(x_train, y_train, x_test, y_test, x, y)
# Single-argument print(...) runs unchanged on Python 2 and Python 3
print('%s %s' % ('Best K: ', k_best))

## KNN on imbalanced dataset

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Euclidean KNN (Minkowski distance with p=2) using the tuned K, trained on
# the training split
model = KNeighborsClassifier(p=2, metric='minkowski', n_neighbors=k_best)
model.fit(x_train, y_train)

# Predictions for the held-out split
y_pred = model.predict(x_test)

# 10-fold cross-validation over the full data set; report mean and 2 * std
scores = cross_val_score(model, x, y, cv=10)
cv_summary = (scores.mean(), scores.std() * 2)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % cv_summary)

# Display names for labels 0 and 1, and the label treated as positive
pos_label = 1
class_labels = ['Not Absent', 'Absent']

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Set SVM parameters

In [None]:
# Project helper that searches SVM hyperparameters on the current split; the
# result is read below as a dict with 'kernel', 'C' and (for rbf) 'gamma'
svm_best_params = custom_imp.svm_set_hyper_params(x_train, y_train, x_test, y_test, True)
# Single-argument print(...) runs unchanged on Python 2 and Python 3
print('%s %s' % ('Best SVM hyperparameters: ', svm_best_params))

## SVM on imbalanced dataset

In [None]:
from sklearn import svm
from sklearn import metrics

# Fit model
# C: penalty parameter of the error term, i.e. slack variable
# kernel (linear, rbf)
# gamma: kernel coefficient for rbf, poly, sigmoid
# tol: tolerance for stopping criterion
# max_iter: limit on epochs (-1 means no limit)
# random_state: seed when shuffling

kernel = svm_best_params['kernel']
if kernel == 'linear':
    # gamma is only passed for the rbf kernel
    model = svm.SVC(C=svm_best_params['C'], max_iter=-1, kernel='linear')
else:
    model = svm.SVC(gamma=svm_best_params['gamma'], C=svm_best_params['C'], max_iter=-1, kernel='rbf')
model.fit(x_train, y_train)

# Show SVM params (get_params must be called; printing the attribute only
# shows the bound method object)
print(model.get_params())

# Predict on testing data
y_pred = model.predict(x_test)
print("Accuracy: %f" % (metrics.accuracy_score(y_test, y_pred)))

# k cross fold validation
scores = cross_val_score(model, x, y, cv=10)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Get support vectors
print('Support vectors: \n%s' % (model.support_vectors_,))

print(y_pred)
custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Apply SMOTE to imbalanced dataset

In [None]:
## Use SMOTE on N% of the dataset with k neighbours
smote_percentages = [50,100,200,300]

x = dataset.iloc[:,:-1].values
x = preprocessing.normalize(x)
y = dataset.iloc[:,-1].values
# Rows with label 1 are treated as the majority here, rows with label 0 as the
# minority that gets oversampled.
# NOTE(review): confirm this against the class counts printed when the data
# set was loaded.
x_majority = preprocessing.normalize(dataset[dataset[classes] == 1].iloc[:,:-1].values)
x_minority = preprocessing.normalize(dataset[dataset[classes] == 0].iloc[:,:-1].values)
y_majority = dataset[dataset[classes] == 1].iloc[:,-1].values
y_minority = dataset[dataset[classes] == 0].iloc[:,-1].values

# Single-argument print(...) calls run unchanged on Python 2 and Python 3
print('%s %s' % ('Percentage to SMOTE: ', smote_percentages[3]))
smoted_samples = custom_imp.smote(x_minority, smote_percentages[3], k_best)
print('%s %s' % ('Number of synthetic samples SMOTEd: ', len(smoted_samples)))
updated_x_minority = np.concatenate((x_minority, smoted_samples), axis=0)
print('Number of minority after: %s' % len(updated_x_minority))

# Scatter feature column 3 against feature column 4 for both groups.
# Indexing with [:, i] selects a feature column; a bare [i] would select a
# single sample row instead.
plt.plot(x_majority[:, 3], x_majority[:, 4], 'g^',
         updated_x_minority[:, 3], updated_x_minority[:, 4], 'r*')

# Update x and y for smote
x_smote = np.concatenate((x, smoted_samples), axis=0)
# The synthetic samples are generated from x_minority, so they must carry the
# label of those rows rather than an unrelated constant.
smote_y = np.full(len(smoted_samples), y_minority[0], dtype=y.dtype)
y_smote = np.concatenate((y, smote_y), axis=0)

## KNN after SMOTE

In [None]:
# Reproducible 75/25 split of the SMOTE-augmented data
x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, random_state=1, test_size=0.25, shuffle=True)

# Tune K on the augmented data: the last two arguments are the full data set
# the split was drawn from (presumably used for validation inside the helper
# -- confirm against custom_implementations).
k_best = custom_imp.knn_set_hyper_params(x_train, y_train, x_test, y_test, x_smote, y_smote)
# Single-argument print(...) runs unchanged on Python 2 and Python 3
print('%s %s' % ('Best K: ', k_best))

# Create KNN and fit model
model = KNeighborsClassifier(n_neighbors=k_best, metric='minkowski', p=2)
model.fit(x_train, y_train)

# Predict on testing data
y_pred = model.predict(x_test)

# 10-cross fold validation on the augmented data. cross_val_score refits a
# copy of the estimator on the data it is given, so passing the original x, y
# would never involve the synthetic samples.
# NOTE(review): synthetic samples can fall into validation folds, so this
# score may be optimistic.
scores = cross_val_score(model, x_smote, y_smote, cv=10)
print("\n10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

class_labels = ['Not Absent', 'Absent']
pos_label = 1

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Set SVM parameters

In [None]:
# Re-run the project's SVM hyperparameter search on the current train/test split
svm_best_params = custom_imp.svm_set_hyper_params(x_train, y_train, x_test, y_test, True)
# print(...) with one argument runs unchanged on Python 2 and Python 3
print(svm_best_params)

## SVM after SMOTE

In [None]:
# Build the SVM from the tuned hyperparameters; gamma is only passed for rbf
kernel = svm_best_params['kernel']
if kernel == 'linear':
    model = svm.SVC(C=svm_best_params['C'], max_iter=-1, kernel='linear')
else:
    model = svm.SVC(gamma=svm_best_params['gamma'], C=svm_best_params['C'], max_iter=-1, kernel='rbf')
model.fit(x_train, y_train)

# Show SVM params (get_params must be called; printing the attribute only
# shows the bound method object)
print(model.get_params())

# Predict on testing data
y_pred = model.predict(x_test)
print("Accuracy: %f" % (metrics.accuracy_score(y_test, y_pred)))

# k cross fold validation on the SMOTE-augmented data. cross_val_score refits
# a copy of the estimator on the data it is given, so the original x, y would
# never involve the synthetic samples.
# NOTE(review): synthetic samples can fall into validation folds, so this
# score may be optimistic.
scores = cross_val_score(model, x_smote, y_smote, cv=10)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Get support vectors
print('Support vectors: \n%s' % (model.support_vectors_,))

print(y_pred)
custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## TOMEK Links on imbalanced dataset

In [None]:
# Declare column variables
classes = 'absent'
class_x = 1
class_y = 2
label_0 = 'Not Absent' # Change depending on which class 0 belongs to
label_1 = 'Absent' # Change depending on which class 1 belongs to

# Rows with label 1 are treated as the majority, rows with label 0 as the
# minority (raw, un-normalised features at this point)
x_majority = dataset[dataset[classes] == 1].iloc[:,:-1]
x_minority = dataset[dataset[classes] == 0].iloc[:,:-1]
y_majority = dataset[dataset[classes] == 1].iloc[:,-1]
y_minority = dataset[dataset[classes] == 0].iloc[:,-1]

# Single-argument print(...) calls run unchanged on Python 2 and Python 3
print('%s %s' % ('Number of majority before: ', len(x_majority)))

# Ask for the class-size difference to be removed, capped at the minority size
num_to_remove = len(x_majority) - len(x_minority)
if num_to_remove > len(x_minority):
    num_to_remove = len(x_minority)
tomek_dataset = custom_imp.tomek(x_majority.values, x_minority.values, y_majority.values, y_minority.values, num_to_remove, classes, 1)
tomek_df = pd.DataFrame(tomek_dataset)

# Count the same class (label 1) that was reported as the majority above.
# class_index is the label column's position, replacing the hard-coded 10.
# NOTE(review): assumes custom_imp.tomek keeps the original labels and column
# order -- confirm.
print('%s %s' % ('Number of majority after: ', len(tomek_df[tomek_df[class_index] == 1])))

# Split into data and labels
x_tomek = preprocessing.normalize(tomek_df.iloc[:,:-1].values)
y_tomek = tomek_df.iloc[:,-1].values

x_train, x_test, y_train, y_test = train_test_split(x_tomek,y_tomek,random_state=1, test_size=0.25, shuffle=True)
print('done')

## KNN after TOMEK Links

In [None]:
# Tune K on the Tomek-cleaned data: the last two arguments are the full data
# set the current split was drawn from (presumably used for validation inside
# the helper -- confirm against custom_implementations).
k_best = custom_imp.knn_set_hyper_params(x_train, y_train, x_test, y_test, x_tomek, y_tomek)
# Single-argument print(...) runs unchanged on Python 2 and Python 3
print('%s %s' % ('Best K: ', k_best))

# Create KNN and fit model
model = KNeighborsClassifier(n_neighbors=k_best, metric='minkowski', p=2)
model.fit(x_train, y_train)

# Predict on testing data
y_pred = model.predict(x_test)

# 10-cross fold validation on the Tomek-cleaned data. cross_val_score refits a
# copy of the estimator on the data it is given, so the original x, y would
# not reflect the removed samples.
scores = cross_val_score(model, x_tomek, y_tomek, cv=10)
print("\n10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

class_labels = ['Not Absent', 'Absent']
pos_label = 1

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Set SVM parameters

In [None]:
# Re-run the project's SVM hyperparameter search on the current train/test split
svm_best_params = custom_imp.svm_set_hyper_params(x_train, y_train, x_test, y_test, True)
# print(...) with one argument runs unchanged on Python 2 and Python 3
print(svm_best_params)

## SVM after TOMEK Links

In [None]:
# Build the SVM from the tuned hyperparameters; gamma is only passed for rbf
kernel = svm_best_params['kernel']
if kernel == 'linear':
    model = svm.SVC(C=svm_best_params['C'], max_iter=-1, kernel='linear')
else:
    model = svm.SVC(gamma=svm_best_params['gamma'], C=svm_best_params['C'], max_iter=-1, kernel='rbf')
model.fit(x_train, y_train)

# Show SVM params (get_params must be called; printing the attribute only
# shows the bound method object)
print(model.get_params())

# Predict on testing data
y_pred = model.predict(x_test)
print("Accuracy: %f" % (metrics.accuracy_score(y_test, y_pred)))

# k cross fold validation on the Tomek-cleaned data. cross_val_score refits a
# copy of the estimator on the data it is given, so the original x, y would
# not reflect the removed samples.
scores = cross_val_score(model, x_tomek, y_tomek, cv=10)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Get support vectors
print('Support vectors: \n%s' % (model.support_vectors_,))

print(y_pred)
custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## SMOTE + TOMEK Links on imbalanced dataset

In [None]:
# Declare column variables
classes = 'absent'
class_x = 1
class_y = 2
label_0 = 'Not Absent' # Change depending on which class 0 belongs to
label_1 = 'Absent' # Change depending on which class 1 belongs to

# Separate into features and labels depending upon classification label
# (label 1 rows are treated as the majority, label 0 rows as the minority)
x_majority = preprocessing.normalize(dataset[dataset[classes] == 1].iloc[:,:-1])
x_minority = preprocessing.normalize(dataset[dataset[classes] == 0].iloc[:,:-1])
y_majority = dataset[dataset[classes] == 1].iloc[:,-1]
y_minority = dataset[dataset[classes] == 0].iloc[:,-1]

# Single-argument print(...) calls run unchanged on Python 2 and Python 3
print('%s %s' % ('Number of majority before: ', len(x_majority)))
print('%s %s' % ('Number of minority before: ', len(x_minority)))

# NOTE(review): majority_class / minority_class are 0 / 1, while the rows
# passed as x_majority / x_minority carry labels 1 / 0 -- confirm how
# custom_imp.smote_tomek uses these two arguments before relying on the labels
# of the synthetic samples.
smote_tomek_dataset = custom_imp.smote_tomek(x_majority, x_minority, y_majority.values, y_minority.values, majority_class, minority_class, smote_percentages[3], k_best, classes)

smote_tomek_df = pd.DataFrame(smote_tomek_dataset)

x_smote_tomek = smote_tomek_df.iloc[:,:-1].values
y_smote_tomek = smote_tomek_df.iloc[:, len(dataset.columns)-1].values

# class_index is the label column's position, replacing the hard-coded 10
print('%s %s' % ('Number of majority after: ', len(smote_tomek_df[smote_tomek_df[class_index] == 1])))
print('%s %s' % ('Number of minority after: ', len(smote_tomek_df[smote_tomek_df[class_index] == 0])))

x_train, x_test, y_train, y_test = train_test_split(x_smote_tomek,y_smote_tomek,random_state=1, test_size=0.25, shuffle=True)

## KNN after SMOTE + TOMEK Links

In [None]:
# Tune K on the SMOTE + Tomek data: the last two arguments are the full data
# set the current split was drawn from (presumably used for validation inside
# the helper -- confirm against custom_implementations).
k_best = custom_imp.knn_set_hyper_params(x_train, y_train, x_test, y_test, x_smote_tomek, y_smote_tomek)
# Single-argument print(...) runs unchanged on Python 2 and Python 3
print('%s %s' % ('Best K: ', k_best))

# Create KNN and fit model
model = KNeighborsClassifier(n_neighbors=k_best, metric='minkowski', p=2)
model.fit(x_train, y_train)

# Predict on testing data
y_pred = model.predict(x_test)

# 10-cross fold validation on the resampled data. cross_val_score refits a
# copy of the estimator on the data it is given, so the original x, y would
# never involve the resampling.
# NOTE(review): synthetic samples can fall into validation folds, so this
# score may be optimistic.
scores = cross_val_score(model, x_smote_tomek, y_smote_tomek, cv=10)
print("\n10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

class_labels = ['Not Absent', 'Absent']
pos_label = 1

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Set SVM parameters

In [None]:
# Re-run the project's SVM hyperparameter search on the current train/test split
svm_best_params = custom_imp.svm_set_hyper_params(x_train, y_train, x_test, y_test, True)
# print(...) with one argument runs unchanged on Python 2 and Python 3
print(svm_best_params)

## SVM after SMOTE + TOMEK Links

In [None]:
# Build the SVM from the tuned hyperparameters; gamma is only passed for rbf
kernel = svm_best_params['kernel']
if kernel == 'linear':
    model = svm.SVC(C=svm_best_params['C'], max_iter=-1, kernel='linear')
else:
    model = svm.SVC(gamma=svm_best_params['gamma'], C=svm_best_params['C'], max_iter=-1, kernel='rbf')
model.fit(x_train, y_train)

# Show SVM params (get_params must be called; printing the attribute only
# shows the bound method object)
print(model.get_params())

# Predict on testing data
y_pred = model.predict(x_test)
print("Accuracy: %f" % (metrics.accuracy_score(y_test, y_pred)))

# k cross fold validation on the SMOTE + Tomek data. cross_val_score refits a
# copy of the estimator on the data it is given, so the original x, y would
# never involve the resampling.
# NOTE(review): synthetic samples can fall into validation folds, so this
# score may be optimistic.
scores = cross_val_score(model, x_smote_tomek, y_smote_tomek, cv=10)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Get support vectors
print('Support vectors: \n%s' % (model.support_vectors_,))

print(y_pred)
custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)