In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import custom_implementations as custom_imp
import importlib

dataset = pd.read_csv('../Datasets/indian_liver_patient_disease.csv',
                     index_col = False)

cols = list(dataset)

# Change classification values to 0,1
dataset['selector_field'] = dataset['selector_field'].apply({1:0, 2:1}.get)
dataset['gender'] = dataset['gender'].apply({'Male':0, 'Female':1}.get)

# Remove NaN
dataset = dataset[np.isfinite(dataset['ag_ratio_alb'])]

# Non liver patient: 1
# Liver patient: 0
minority_class = 1
majority_class = 0

# Imbalance dataset
# remove_n = 100
# minority_indices = dataset[dataset['selector_field'] == 1]
# drop_indices = np.random.choice(minority_indices.index, remove_n, replace=False)
# dataset = dataset.drop(drop_indices)

# Set normalise
normalise = True

# Get classification count
liver_patient_count = len(dataset[dataset['selector_field'] == 0])
non_liver_patient_count = len(dataset[dataset['selector_field'] == 1])

#print dataset
print "No. Features: ", len(dataset.columns)
print "Liver patient (0): ",liver_patient_count
print "Non liver patient (1): ",non_liver_patient_count
print "Total: ", liver_patient_count + non_liver_patient_count

print dataset

## Plot features

In [None]:
import matplotlib.pyplot as plt

# Scatter-plot direct_biliruin (x axis) against age (y axis) for both classes.
class_index = len(dataset.columns)

# Split the rows by class label. Features are every column except the last;
# the last column is taken as the label.
liver_rows = dataset[dataset['selector_field'] == 0]
non_liver_rows = dataset[dataset['selector_field'] == 1]
x_liver = liver_rows.iloc[:, 0:class_index-1]
x_non_liver = non_liver_rows.iloc[:, 0:class_index-1]
y_liver = liver_rows.iloc[:, -1]
y_non_liver = non_liver_rows.iloc[:, -1]

if(normalise):
    # normalize() returns plain ndarrays (sklearn's default rescales each
    # sample/row), so columns must now be picked by position.
    x_liver = preprocessing.normalize(x_liver)
    x_non_liver = preprocessing.normalize(x_non_liver)
    # Positions 3 and 0 mirror the 'direct_biliruin' / 'age' columns used in
    # the else branch. [:, i] selects a whole column; a bare x_liver[i] would
    # select a single row (one patient) instead.
    plt.plot(x_liver[:, 3], x_liver[:, 0], 'g^',
             x_non_liver[:, 3], x_non_liver[:, 0], 'r*')
else:
    plt.plot(x_liver['direct_biliruin'].values, x_liver['age'].values, 'g^',
             x_non_liver['direct_biliruin'].values, x_non_liver['age'].values, 'r*')

plt.ylabel('age')
plt.xlabel('direct_bil')
plt.title('direct_biliruin vs age')
plt.show()


## Split data into train/test

In [None]:
from sklearn.model_selection import train_test_split
# Feature matrix: every column except the last. Label vector: the last column.
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

if normalise:
    x = preprocessing.normalize(x)

# Hold out 25% of the rows for testing; the fixed seed keeps the shuffled
# split repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, shuffle=True, random_state=1)

## Find K for KNN

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Split into data and labels
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, len(dataset.columns)-1].values

if(normalise):
    x = preprocessing.normalize(x)

x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=1, test_size=0.25, shuffle=True)

k_best = custom_imp.knn_set_hyper_params(x_train, y_train, x_test, y_test, x, y)
print "Best K: ", k_best

## KNN on imbalanced dataset

In [None]:
# KNN with the selected K; Minkowski metric with p=2
model = KNeighborsClassifier(n_neighbors=k_best, metric='minkowski', p=2)
model.fit(x_train, y_train)

# Predictions for the held-out test split
y_pred = model.predict(x_test)

# 10-fold cross validation over the full x, y; report mean and 2 * std
scores = cross_val_score(model, x, y, cv=10)
cv_summary = (scores.mean(), scores.std() * 2)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % cv_summary)

# Display names for class 0 and class 1; class 1 is the positive label
class_labels = ['Liver', 'Non-Liver']
pos_label = 1

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Set SVM Parameters

In [None]:
svm_best_params = custom_imp.svm_set_hyper_params(x_train, y_train, x_test, y_test, True)
print svm_best_params

## SVM on imbalanced dataset

In [None]:
from sklearn import svm

# Fit model
# C: penalty parameter of the error term, i.e. slack variable
# kernel (linear, rbf)
# gamma: kernel coefficient for rbf, poly, sigmoid
# tol: tolerance for stopping criterion
# max_iter: limit on eopochs
# random_state: seed when shuffling

kernel = svm_best_params['kernel']
if(kernel == 'linear'):
    model = svm.SVC(C=svm_best_params['C'], max_iter=-1, kernel='linear')
else:
    model = svm.SVC(gamma=svm_best_params['gamma'], C=svm_best_params['C'], max_iter=-1, kernel='rbf')
model.fit(x_train, y_train)

# Show SVM params
print model.get_params

# Predict on testing data
y_pred = model.predict(x_test)
print ("Accuracy: %f" %(metrics.accuracy_score(y_test, y_pred)))

# k cross fold validation
scores = cross_val_score(model, x, y, cv=10)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Get support vectors
print 'Support vectors: \n', model.support_vectors_

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Apply SMOTE

In [None]:
## Use SMOTE on N% of the dataset with k neighbours
smote_percentages = [50,100,200,300]

print 'Percentage to SMOTE: ', smote_percentages[3]
smoted_samples = custom_imp.smote(x_non_liver, smote_percentages[3], k_best)
print 'Number of synthetic samples SMOTEd: ', len(smoted_samples)
updated_x_non_liver = np.concatenate((x_non_liver, smoted_samples), axis=0)
print 'Number of minority after: %s' % len(updated_x_non_liver)

if(normalise):
    plt.plot([x_liver[0]], [x_liver[3]], 'g^', [updated_x_non_liver[0]], [updated_x_non_liver[3]], 'r*')
else:
    plt.plot([x_liver['age'].values], [x_liver['direct_biliruin'].values], 'g^', [updated_x_non_liver['age'].values], [updated_x_non_liver['direct_biliruin'].values], 'r*')

# Update x and y for smote
x_smote = np.concatenate((x, smoted_samples), axis=0)
smote_y = np.full((len(smoted_samples)), minority_class)
y_smote = np.concatenate((y, smote_y), axis=0)

## KNN after SMOTE

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_smote,y_smote,random_state=1, test_size=0.25, shuffle=True)

k_best = custom_imp.knn_set_hyper_params(x_train, y_train, x_test, y_test, x, y)
print 'Best K: ', k_best

# Create KNN and fit model
model = KNeighborsClassifier(n_neighbors=k_best, metric='minkowski', p=2)
model.fit(x_train, y_train)

# Predict on testing data
y_pred = model.predict(x_test)

# 10-cross fold validation
scores = cross_val_score(model, x, y, cv=10)
print("\n10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

class_labels = ['Benign', 'Malignant']
pos_label = 1

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Set SVM Hyperparameters

In [None]:
svm_best_params = custom_imp.svm_set_hyper_params(x_train, y_train, x_test, y_test, True)
print svm_best_params

## SVM after SMOTE

In [None]:
kernel = svm_best_params['kernel']
if(kernel == 'linear'):
    model = svm.SVC(C=svm_best_params['C'], max_iter=-1, kernel='linear')
else:
    model = svm.SVC(gamma=svm_best_params['gamma'], C=svm_best_params['C'], max_iter=-1, kernel='rbf')
model.fit(x_train, y_train)

# Show SVM params
print model.get_params

# Predict on testing data
y_pred = model.predict(x_test)
print ("Accuracy: %f" %(metrics.accuracy_score(y_test, y_pred)))

# k cross fold validation
scores = cross_val_score(model, x, y, cv=10)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Get support vectors
print 'Support vectors: \n', model.support_vectors_

print y_pred
custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Apply Tomek Links

In [None]:
# Declare column variables
classes = 'selector_field'
class_x = 0
class_y = 3
label_0 = 'Liver' # Change depending on which class 0 belongs to
label_1 = 'Non-liver' # Change depending on which class 1 belongs to

num_to_remove = len(x_liver) - len(x_non_liver)
if(num_to_remove > len(x_non_liver)):
    num_to_remove = len(x_non_liver)
    
tomek_dataset = custom_imp.tomek(x_liver, x_non_liver, y_liver.values, y_non_liver.values, num_to_remove, classes, 1)
tomek_df = pd.DataFrame(tomek_dataset)

# Split into data and labels
x_tomek = tomek_df.iloc[:, :-1].values
y_tomek = tomek_df.iloc[:, len(dataset.columns)-1].values

print sum(tomek_df[10] == 1)
print sum(tomek_df[10] == 0)

x_train, x_test, y_train, y_test = train_test_split(x_tomek,y_tomek,random_state=1, test_size=0.25, shuffle=True)

## KNN after Tomek

In [None]:
k_best = custom_imp.knn_set_hyper_params(x_train, y_train, x_test, y_test, x, y)
print 'Best K: ', k_best

# Create KNN and fit model
model = KNeighborsClassifier(n_neighbors=k_best, metric='minkowski', p=2)
model.fit(x_train, y_train)

# Predict on testing data
y_pred = model.predict(x_test)

# 10-cross fold validation
scores = cross_val_score(model, x, y, cv=10)
print("\n10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

class_labels = ['Benign', 'Malignant']
pos_label = 1

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## SVM after Tomek

In [None]:
svm_best_params = custom_imp.svm_set_hyper_params(x_train, y_train, x_test, y_test, True)
print svm_best_params

In [None]:
kernel = svm_best_params['kernel']
if(kernel == 'linear'):
    model = svm.SVC(C=svm_best_params['C'], max_iter=-1, kernel='linear')
else:
    model = svm.SVC(gamma=svm_best_params['gamma'], C=svm_best_params['C'], max_iter=-1, kernel='rbf')
model.fit(x_train, y_train)

# Show SVM params
print model.get_params

# Predict on testing data
y_pred = model.predict(x_test)
print ("Accuracy: %f" %(metrics.accuracy_score(y_test, y_pred)))

# k cross fold validation
scores = cross_val_score(model, x, y, cv=10)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Get support vectors
print 'Support vectors: \n', model.support_vectors_

print y_pred
custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Apply SMOTE + Tomek

In [None]:
# Declare column variables
classes = 'selector_field'
class_x = 0
class_y = 3
label_0 = 'Liver' # Change depending on which class 0 belongs to
label_1 = 'Non-liver' # Change depending on which class 1 belongs to

smote_tomek_dataset = custom_imp.smote_tomek(x_liver, x_non_liver, y_liver.values, y_non_liver.values, majority_class, minority_class, smote_percentages[3], k_best, classes)
smote_tomek_df = pd.DataFrame(tomek_dataset)

# Split into data and labels
x_smote_tomek = smote_tomek_df.iloc[:, :-1].values
y_smote_tomek = smote_tomek_df.iloc[:, len(dataset.columns)-1].values

print sum(tomek_df[10] == 1)
print sum(tomek_df[10] == 0)

x_train, x_test, y_train, y_test = train_test_split(x_smote_tomek,y_smote_tomek,random_state=1, test_size=0.25, shuffle=True)

## KNN after SMOTE + Tomek

In [None]:
k_best = custom_imp.knn_set_hyper_params(x_train, y_train, x_test, y_test, x, y)
print 'Best K: ', k_best

# Create KNN and fit model
model = KNeighborsClassifier(n_neighbors=k_best, metric='minkowski', p=2)
model.fit(x_train, y_train)

# Predict on testing data
y_pred = model.predict(x_test)

# 10-cross fold validation
scores = cross_val_score(model, x, y, cv=10)
print("\n10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

class_labels = ['Liver', 'Non-Liver']
pos_label = 1

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## SVM after SMOTE + Tomek

In [None]:
svm_best_params = custom_imp.svm_set_hyper_params(x_train, y_train, x_test, y_test, True)
print svm_best_params

In [None]:
kernel = svm_best_params['kernel']
if(kernel == 'linear'):
    model = svm.SVC(C=svm_best_params['C'], max_iter=-1, kernel='linear')
else:
    model = svm.SVC(gamma=svm_best_params['gamma'], C=svm_best_params['C'], max_iter=-1, kernel='rbf')
model.fit(x_train, y_train)

# Show SVM params
print model.get_params

# Predict on testing data
y_pred = model.predict(x_test)
print ("Accuracy: %f" %(metrics.accuracy_score(y_test, y_pred)))

# k cross fold validation
scores = cross_val_score(model, x, y, cv=10)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Get support vectors
print 'Support vectors: \n', model.support_vectors_

print y_pred
custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)