In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import custom_implementations as custom_imp
import importlib
#importlib.reload(custom_implementations)

# Load the mammographic-masses data. The 'severity' column is the class label.
dataset = pd.read_csv('../Datasets/mammographic_masses.txt')
cols = list(dataset)

# Benign: 0
# Malignant: 1

class_index = 5
minority_class = 1
majority_class = 0

# Set normalise
normalise = True

# Imbalance dataset: drop `remove_n` randomly chosen minority-class rows.
# The draw uses its own seeded generator so that the same rows are removed on
# every run; an unseeded draw made every downstream metric change between
# runs even though train_test_split is seeded.
remove_n = 250
random_seed = 1
# NOTE: despite its name, `minority_indices` holds the minority-class rows;
# only its index is used to choose what to drop.
minority_indices = dataset[dataset['severity'] == minority_class]
drop_indices = np.random.RandomState(random_seed).choice(
    minority_indices.index, remove_n, replace=False)
dataset = dataset.drop(drop_indices)

# Get classification count
severity_1_count = len(dataset[dataset['severity'] == minority_class])
severity_0_count = len(dataset[dataset['severity'] == majority_class])

# Single-argument print calls give the same output under Python 2 and 3.
print("No. Features:  %s" % len(dataset.columns))
print("Malignant (1):  %s" % severity_1_count)
print("Benign (0):  %s" % severity_0_count)
print("Total:  %s" % (severity_0_count + severity_1_count))

print(dataset)

# Select minority classes
#minority_samples = dataset[dataset['severity'] == minority_class]

### Plot features

In [None]:
import matplotlib.pyplot as plt

# Separate into features and labels depending upon classification label
benign_rows = dataset[dataset['severity'] == 0]
malignant_rows = dataset[dataset['severity'] == 1]
x_benign = benign_rows.iloc[:, 0:class_index]
x_malignant = malignant_rows.iloc[:, 0:class_index]
y_benign = benign_rows.iloc[:, -1]
y_malignant = malignant_rows.iloc[:, -1]

if normalise:
    x_benign = preprocessing.normalize(x_benign)
    x_malignant = preprocessing.normalize(x_malignant)
    # The normalised data is indexed positionally, so columns must be selected
    # with [:, j]. Plain [j] picks sample row j, which is what was plotted before.
    # NOTE(review): columns 1 and 2 are assumed to be 'age' and 'shape' -
    # confirm against the CSV header.
    plt.plot(x_benign[:, 1], x_benign[:, 2], 'g^',
             x_malignant[:, 1], x_malignant[:, 2], 'r*')
else:
    # Pass 1-D arrays directly; wrapping them in a list makes matplotlib
    # create one line object per point.
    plt.plot(x_benign['age'].values, x_benign['shape'].values, 'g^',
             x_malignant['age'].values, x_malignant['shape'].values, 'r*')

plt.ylabel('shape')
plt.xlabel('age')
plt.title('age vs shape')
plt.show()

## Find K for KNN

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.model_selection import cross_val_score

# Features are every column except the last; the last column is the label
label_position = len(dataset.columns) - 1
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, label_position].values

# Optionally normalise the feature matrix
if normalise:
    x = preprocessing.normalize(x)

# Hold out a quarter of the samples for testing (shuffled, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, shuffle=True, random_state=1)

# Ask the project helper for the neighbour count to use
# (its selection procedure is defined in custom_implementations).
k_best = custom_imp.knn_set_hyper_params(
    x_train, y_train, x_test, y_test, x, y)
print("Best K:  %s" % (k_best,))

## KNN on imbalanced dataset

In [None]:
# Build a Minkowski (p=2) KNN with the chosen neighbour count and train it;
# fit() returns the estimator itself, so construction and fitting are chained.
model = KNeighborsClassifier(n_neighbors=k_best, metric='minkowski', p=2).fit(x_train, y_train)

# Predictions for the held-out split
y_pred = model.predict(x_test)

# 10-fold cross-validation over the full (imbalanced) dataset
scores = cross_val_score(model, x, y, cv=10)
cv_mean = scores.mean()
cv_spread = scores.std() * 2
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (cv_mean, cv_spread))

# Display names for classes 0 and 1, and the label treated as positive
class_labels = ['Benign', 'Malignant']
pos_label = 1

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Set SVM parameters

In [None]:
# Obtain the SVM settings from the project helper; later cells read the
# 'kernel', 'C' and 'gamma' entries of the result.
# NOTE(review): the meaning of the trailing True flag is defined in
# custom_implementations - confirm there.
svm_best_params = custom_imp.svm_set_hyper_params(
    x_train, y_train, x_test, y_test, True)
print('Best SVM hyperparameters:  %s' % (svm_best_params,))

## SVM on imbalanced dataset

In [None]:
from sklearn import svm

# Fit model
# C: penalty parameter of the error term, i.e. slack variable
# kernel (linear, rbf)
# gamma: kernel coefficient for rbf, poly, sigmoid
# tol: tolerance for stopping criterion
# max_iter: limit on solver iterations (-1 = no limit)
# random_state: seed when shuffling

# Build the classifier from the tuned hyperparameters; gamma is only passed
# for the non-linear (rbf) kernel.
kernel = svm_best_params['kernel']
if kernel == 'linear':
    model = svm.SVC(C=svm_best_params['C'], max_iter=-1, kernel='linear')
else:
    model = svm.SVC(gamma=svm_best_params['gamma'], C=svm_best_params['C'], max_iter=-1, kernel='rbf')
model.fit(x_train, y_train)

# Show SVM params. get_params must be called: printing the attribute only
# shows the bound-method repr, not the parameter values.
print(model.get_params())

# Predict on testing data
y_pred = model.predict(x_test)
print("Accuracy: %f" % (metrics.accuracy_score(y_test, y_pred)))

# 10-fold cross-validation on the full (imbalanced) dataset
scores = cross_val_score(model, x, y, cv=10)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Get support vectors
print('Support vectors: \n%s' % (model.support_vectors_,))

print(y_pred)
custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Apply SMOTE

In [None]:
## Use SMOTE on N% of the dataset with k neighbours
smote_percentages = [50,100,200,300]
smote_percent = smote_percentages[3]

print('Percentage to SMOTE:  %s' % smote_percent)
smoted_samples = custom_imp.smote(x_malignant, smote_percent, k_best)
print('Number of synthetic samples SMOTEd:  %s' % len(smoted_samples))
updated_x_malignant = np.concatenate((x_malignant, smoted_samples), axis=0)
print('Number of minority after: %s' % len(updated_x_malignant))

# Plot benign samples against the enlarged minority class.
# np.concatenate always returns an ndarray, so both sets are addressed
# positionally: [:, j] selects feature column j, whereas the previous [j]
# selected sample row j, and string keys fail on an ndarray.
# NOTE(review): columns 1 and 2 are assumed to be 'age' and 'shape' -
# confirm against the CSV header.
benign_points = np.asarray(x_benign)
plt.plot(benign_points[:, 1], benign_points[:, 2], 'g^',
         updated_x_malignant[:, 1], updated_x_malignant[:, 2], 'r*')
plt.ylabel('shape')
plt.xlabel('age')
plt.title('age vs shape')
plt.show()

# Update x and y for smote: append the synthetic samples, all labelled as
# the minority class.
x_smote = np.concatenate((x, smoted_samples), axis=0)
smote_y = np.full((len(smoted_samples)), minority_class)
y_smote = np.concatenate((y, smote_y), axis=0)

## KNN after SMOTE

In [None]:
# Train/test split of the SMOTE-augmented data
x_train, x_test, y_train, y_test = train_test_split(x_smote,y_smote,random_state=1, test_size=0.25, shuffle=True)

# Tune k against the augmented data, mirroring the first tuning call where the
# last two arguments are the dataset the split was drawn from.
# NOTE(review): presumably the helper uses them for cross-validation - confirm
# in custom_implementations.
k_best = custom_imp.knn_set_hyper_params(x_train, y_train, x_test, y_test, x_smote, y_smote)
print('Best K:  %s' % (k_best,))

# Create KNN and fit model
model = KNeighborsClassifier(n_neighbors=k_best, metric='minkowski', p=2)
model.fit(x_train, y_train)

# Predict on testing data
y_pred = model.predict(x_test)

# 10-fold cross-validation on the SMOTE-augmented data. Scoring on the
# original x, y (as before) refits on the imbalanced data in every fold, so
# the reported figure did not reflect SMOTE at all.
scores = cross_val_score(model, x_smote, y_smote, cv=10)
print("\n10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

class_labels = ['Benign', 'Malignant']
pos_label = 1

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Set SVM Parameters

In [None]:
# Re-run the SVM hyperparameter helper on the current train/test split;
# later cells read the 'kernel', 'C' and 'gamma' entries of the result.
svm_best_params = custom_imp.svm_set_hyper_params(
    x_train, y_train, x_test, y_test, True)
print(svm_best_params)

## SVM after SMOTE

In [None]:
from sklearn import svm

# Build the classifier from the tuned hyperparameters; gamma is only passed
# for the non-linear (rbf) kernel.
kernel = svm_best_params['kernel']
if kernel == 'linear':
    model = svm.SVC(C=svm_best_params['C'], max_iter=-1, kernel='linear')
else:
    model = svm.SVC(gamma=svm_best_params['gamma'], C=svm_best_params['C'], max_iter=-1, kernel='rbf')
model.fit(x_train, y_train)

# Show SVM params. get_params must be called: printing the attribute only
# shows the bound-method repr, not the parameter values.
print(model.get_params())

# Predict on testing data
y_pred = model.predict(x_test)
print("Accuracy: %f" % (metrics.accuracy_score(y_test, y_pred)))

# 10-fold cross-validation on the SMOTE-augmented data. Scoring on the
# original x, y would refit on the imbalanced data in every fold and so
# would not reflect SMOTE.
scores = cross_val_score(model, x_smote, y_smote, cv=10)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Get support vectors
print('Support vectors: \n%s' % (model.support_vectors_,))

print(y_pred)
custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Apply Tomek Links

In [None]:
# Column / label settings for this section
classes = 'severity'
class_x = 1
class_y = 2
label_0 = 'Benign' # Change depending on which class 0 belongs to
label_1 = 'Malignant' # Change depending on which class 1 belongs to

# Remove at most as many samples as there are malignant samples:
# the class-size gap, capped at the malignant count.
num_to_remove = min(len(x_benign) - len(x_malignant), len(x_malignant))

# NOTE(review): the meaning of the trailing 1 is defined by custom_imp.tomek -
# confirm there.
tomek_dataset = custom_imp.tomek(
    x_benign, x_malignant,
    y_benign.values, y_malignant.values,
    num_to_remove, classes, 1)
tomek_df = pd.DataFrame(tomek_dataset)

# Features are all but the last column; the label sits at the same position
# as in `dataset`
label_position = len(dataset.columns) - 1
x_tomek = tomek_df.iloc[:, :-1].values
y_tomek = tomek_df.iloc[:, label_position].values

# Class counts after Tomek-link removal: label 1 first, then label 0
for label_value in (1, 0):
    print(sum(tomek_df[5] == label_value))

x_train, x_test, y_train, y_test = train_test_split(
    x_tomek, y_tomek, test_size=0.25, shuffle=True, random_state=1)
print('done')

## KNN after Tomek Links

In [None]:
# Tune k against the Tomek-cleaned data, mirroring the first tuning call where
# the last two arguments are the dataset the split was drawn from.
# NOTE(review): presumably the helper uses them for cross-validation - confirm
# in custom_implementations.
k_best = custom_imp.knn_set_hyper_params(x_train, y_train, x_test, y_test, x_tomek, y_tomek)
print('Best K:  %s' % (k_best,))

# Create KNN and fit model
model = KNeighborsClassifier(n_neighbors=k_best, metric='minkowski', p=2)
model.fit(x_train, y_train)

# Predict on testing data
y_pred = model.predict(x_test)

# 10-fold cross-validation on the Tomek-cleaned data. Scoring on the original
# x, y (as before) refits on the untouched data in every fold, so the reported
# figure did not reflect the Tomek-link removal.
scores = cross_val_score(model, x_tomek, y_tomek, cv=10)
print("\n10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

class_labels = ['Benign', 'Malignant']
pos_label = 1

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## SVM after Tomek Links

In [None]:
# Re-run the SVM hyperparameter helper on the Tomek train/test split;
# later cells read the 'kernel', 'C' and 'gamma' entries of the result.
svm_best_params = custom_imp.svm_set_hyper_params(
    x_train, y_train, x_test, y_test, True)
print(svm_best_params)

In [None]:
from sklearn import svm

# Build the classifier from the tuned hyperparameters; gamma is only passed
# for the non-linear (rbf) kernel.
kernel = svm_best_params['kernel']
if kernel == 'linear':
    model = svm.SVC(C=svm_best_params['C'], max_iter=-1, kernel='linear')
else:
    model = svm.SVC(gamma=svm_best_params['gamma'], C=svm_best_params['C'], max_iter=-1, kernel='rbf')
model.fit(x_train, y_train)

# Show SVM params. get_params must be called: printing the attribute only
# shows the bound-method repr, not the parameter values.
print(model.get_params())

# Predict on testing data
y_pred = model.predict(x_test)
print("Accuracy: %f" % (metrics.accuracy_score(y_test, y_pred)))

# 10-fold cross-validation on the Tomek-cleaned data. Scoring on the original
# x, y would refit on the untouched data in every fold and so would not
# reflect the Tomek-link removal.
scores = cross_val_score(model, x_tomek, y_tomek, cv=10)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Get support vectors
print('Support vectors: \n%s' % (model.support_vectors_,))

print(y_pred)
custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## Apply SMOTE + Tomek Links

In [None]:
# Column / label settings for this section
classes = 'severity'
class_x = 1
class_y = 2
label_0 = 'Benign' # Change depending on which class 0 belongs to
label_1 = 'Malignant' # Change depending on which class 1 belongs to

# Combined SMOTE + Tomek resampling via the project helper.
# NOTE(review): 300 looks like the SMOTE percentage (it matches the last
# entry of smote_percentages) - confirm against custom_imp.smote_tomek.
smote_tomek_dataset = custom_imp.smote_tomek(
    x_benign, x_malignant,
    y_benign.values, y_malignant.values,
    majority_class, minority_class,
    300, k_best, classes)

# Wrap the result in a frame, then separate features from the label, which
# sits at the same position as in `dataset`
smote_tomek_df = pd.DataFrame(smote_tomek_dataset)
label_position = len(dataset.columns) - 1

x_smote_tomek = smote_tomek_df.iloc[:, :-1].values
y_smote_tomek = smote_tomek_df.iloc[:, label_position].values

# Class counts after resampling: label 1 first, then label 0
for label_value in (1, 0):
    print(sum(smote_tomek_df[5] == label_value))

x_train, x_test, y_train, y_test = train_test_split(
    x_smote_tomek, y_smote_tomek, test_size=0.25, shuffle=True, random_state=1)

## KNN after SMOTE + Tomek

In [None]:
# Tune k against the SMOTE + Tomek data, mirroring the first tuning call where
# the last two arguments are the dataset the split was drawn from.
# NOTE(review): presumably the helper uses them for cross-validation - confirm
# in custom_implementations.
k_best = custom_imp.knn_set_hyper_params(x_train, y_train, x_test, y_test, x_smote_tomek, y_smote_tomek)
print('Best K:  %s' % (k_best,))

# Create KNN and fit model
model = KNeighborsClassifier(n_neighbors=k_best, metric='minkowski', p=2)
model.fit(x_train, y_train)

# Predict on testing data
y_pred = model.predict(x_test)

# 10-fold cross-validation on the SMOTE + Tomek data. Scoring on the original
# x, y (as before) refits on the untouched data in every fold, so the reported
# figure did not reflect the resampling.
scores = cross_val_score(model, x_smote_tomek, y_smote_tomek, cv=10)
print("\n10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

class_labels = ['Benign', 'Malignant']
pos_label = 1

custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)

## SVM after SMOTE + Tomek

In [None]:
# Re-run the SVM hyperparameter helper on the SMOTE + Tomek train/test split;
# later cells read the 'kernel', 'C' and 'gamma' entries of the result.
svm_best_params = custom_imp.svm_set_hyper_params(
    x_train, y_train, x_test, y_test, True)
print(svm_best_params)

In [None]:
# Build the classifier from the tuned hyperparameters; gamma is only passed
# for the non-linear (rbf) kernel.
kernel = svm_best_params['kernel']
if kernel == 'linear':
    model = svm.SVC(C=svm_best_params['C'], max_iter=-1, kernel='linear')
else:
    model = svm.SVC(gamma=svm_best_params['gamma'], C=svm_best_params['C'], max_iter=-1, kernel='rbf')
model.fit(x_train, y_train)

# Show SVM params. get_params must be called: printing the attribute only
# shows the bound-method repr, not the parameter values.
print(model.get_params())

# Predict on testing data
y_pred = model.predict(x_test)
print("Accuracy: %f" % (metrics.accuracy_score(y_test, y_pred)))

# 10-fold cross-validation on the SMOTE + Tomek data. Scoring on the original
# x, y would refit on the untouched data in every fold and so would not
# reflect the resampling.
scores = cross_val_score(model, x_smote_tomek, y_smote_tomek, cv=10)
print("10-Fold Cross Validation: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# Get support vectors
print('Support vectors: \n%s' % (model.support_vectors_,))

print(y_pred)
custom_imp.show_metrics(y_test, y_pred, class_labels, pos_label)