In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
sns.set_style("dark")

### Partition data

In [6]:
# Load dataset and preview
df = pd.read_csv("project_train.csv")

# Partition data set
train, test = train_test_split(df, test_size=0.2)

train_labels = train['Label']
test_labels = test['Label']

train = train.drop('Label', axis = 1)
test = test.drop('Label', axis = 1)


train_unnormalized = train
# Normalize data in sets
# train = (train - train.min()) / (train.max() - train.min())
# test = (test - test.min()) / (test.max() - test.min())

train = (train - train.mean()) / (train.std())
test = (test - test.mean()) / (test.std())

### k-Nearest Neighbour

In [10]:
# Apply k Nearest Neighbours to assign labels

mean_cv_acc = []
pred_acc = []
number_of_neighbours = range(2,64)

for k in number_of_neighbours:

    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(train, train_labels)

    # cross_validate
    cv_accuracy = cross_val_score(knn, train, train_labels, cv=5)

    mean_accuracy = np.mean(cv_accuracy)
    mean_cv_acc.append(mean_accuracy)

    predicted_labels = knn.predict(test)
    prediction_accuracy = np.sum(predicted_labels==test_labels) / len(test_labels)
    pred_acc.append(prediction_accuracy)
    print("\n"+ "k = " + str(k) +"\ncv acc:" + str(mean_accuracy) + "\npred acc: " + str(prediction_accuracy))

plt.plot(number_of_neighbours, mean_cv_acc, label='CV Accuracy')
plt.plot(number_of_neighbours, pred_acc, label='Validation Accuracy')
plt.ylim([0, 1])
plt.legend()
plt.tight_layout()
plt.xlabel('Number of Neighbours')
plt.ylabel('Prediction Accuracy')


TypeError: can only concatenate str (not &quot;int&quot;) to str

### Support Vector Machine

In [4]:
# Cross-validation and accuracy of the model
def accuracy(clf, data, target):
    results = cross_val_predict(clf, data ,target ,cv = 10)
    return metrics.accuracy_score(target,results)


p1 = Pipeline([('scaler',StandardScaler()),
                  ('clf', svm.SVC())
])

p2 = Pipeline([('scaler',StandardScaler()),
                ('clf', svm.SVC(kernel = 'linear'))               
])

p3 = Pipeline([('scaler', StandardScaler()),
                ('clf', svm.SVC(kernel = 'poly', degree=2))                
 ])

p4 = Pipeline([('min_max_scaler', MinMaxScaler()),
                 ('clf', svm.SVC())               
])

p5 = Pipeline([('min_max_scaler', MinMaxScaler()),
                 ('clf', svm.SVC(kernel= 'linear'))               
])

p6 = Pipeline([('min_max_scaler', MinMaxScaler()),
                  ('clf', svm.SVC(kernel='poly', degree=2))               
])

p7 = Pipeline([('Normalize', Normalizer()),
                  ('clf', svm.SVC() )    
])

p8 = Pipeline([('Normalize', Normalizer()),
                  ('clf', svm.SVC(kernel='linear') )    
])

p9 = Pipeline([('Normalize', Normalizer()),
                  ('clf', svm.SVC(kernel='poly', degree=2) )    
])

# list with pipelines
pipelines = []
pipelines.append(('Scaler_rbf', p1))
pipelines.append(('Scaler_linear', p2))
pipelines.append(('Scaler_poly', p3))
pipelines.append(('Min_max_scaler', p4))
pipelines.append(('Min_max_scaler_linear', p5))
pipelines.append(('Min_max_scaler_poly', p6))
pipelines.append(('Normalizer', p7))
pipelines.append(('Normalizer_linear', p8))
pipelines.append(('Normalizer_poly', p9))

# Function to train and predict models
def model_accuracy(clf_models, data, target):
    for model, pip in clf_models:
        cross_val = cross_val_predict(pip, data, target, cv = 10)
        accuracy = accuracy_score(target, cross_val)               
        print(f'Model: {model} - Accuracy : {accuracy:.4f}')

model_accuracy(pipelines, train_unnormalized, train_labels)

Model: Scaler_rbf - Accuracy : 0.8109
Model: Scaler_linear - Accuracy : 0.7876
Model: Scaler_poly - Accuracy : 0.8083
Model: Min_max_scaler - Accuracy : 0.7953
Model: Min_max_scaler_linear - Accuracy : 0.7798
Model: Min_max_scaler_poly - Accuracy : 0.7902
Model: Normalizer - Accuracy : 0.6788
Model: Normalizer_linear - Accuracy : 0.6684
Model: Normalizer_poly - Accuracy : 0.6813
