In [13]:
### Part 0: library imports and macro variables

import pandas as pd # used mainly for data handling and operations
import matplotlib.pyplot as plt # used to plot the k accuracy results
from sklearn import preprocessing, metrics # preprocessing tools for the normalisers
from sklearn.impute import SimpleImputer # imputer used during preprocessing
from sklearn.model_selection import train_test_split # split the training data into train and test sets
from sklearn.neighbors import KNeighborsClassifier # implementation of the knn algorithm

# Columns that hold string categories and therefore need conversion to
# numeric codes before they can be used by the classifier.
categorical_columns = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
    'poutcome',
]

# Macro variable: number of rows used to enumerate the final csv correctly.
SOLUTION_ROWS = 8238

# Turn off pandas' chained-assignment warning for the in-place column edits
# performed by the preprocessing helpers.
pd.set_option('mode.chained_assignment', None)

In [14]:
### Part 1: importing and preprocessing data

# 1.1 Import training and testing sets, create new csv files
training_set_file = pd.read_csv('Assignment3_TrainingSet.csv')
testing_set_file = pd.read_csv('Assignment3_TestingSet.csv')


# 1.2 Delete training rows whose 'marital' or 'job' value is 'unknown'.
# A row is kept only when BOTH fields are known, hence `&`. Combining the two
# masks with `|` would drop a row only when both fields are unknown at once,
# leaving 'unknown' values behind in either column.
has_known_marital = training_set_file['marital'] != 'unknown'
has_known_job = training_set_file['job'] != 'unknown'
# .copy() makes the filtered frame independent of the raw one, so the in-place
# preprocessing below never writes through a slice of training_set_file.
altered_training_set_file = training_set_file[has_known_marital & has_known_job].copy()

# No testing rows are removed. A copy is used so the raw testing frame is left
# untouched and this cell can be re-run without re-reading the csv.
altered_testing_set_file = testing_set_file.copy()
                                            

# 1.3 Define generic functions for value imputation and normalisation

def impute_values(column_name, dataset_file):
    """Replace 'unknown' entries of one column with that column's most frequent value.

    The imputer is fitted on ``dataset_file`` itself, and the column is
    overwritten in place; nothing is returned.

    Args:
        column_name: label of the column to impute.
        dataset_file: DataFrame holding that column; modified in place.
    """
    # Double brackets give a one-column DataFrame (2-D), which is what the
    # imputer is fitted on and later transforms.
    column_frame = dataset_file[[column_name]]
    mode_imputer = SimpleImputer(missing_values='unknown', strategy='most_frequent')
    mode_imputer = mode_imputer.fit(column_frame)
    imputed = mode_imputer.transform(column_frame)
    # Flatten the (n, 1) result back to 1-D before writing the column.
    dataset_file[column_name] = imputed.ravel()
    

def normalise_values(column_name, isMinMax, dataset_file):
    """Rescale one numeric column of ``dataset_file`` in place.

    Args:
        column_name: label of the column to rescale.
        isMinMax: truthy selects ``MinMaxScaler``; falsy selects
            ``StandardScaler``.
        dataset_file: DataFrame holding that column; modified in place.

    The scaler is fitted on the very frame it then transforms, and nothing is
    returned.
    """
    if isMinMax:
        scaler = preprocessing.MinMaxScaler()
    else:
        scaler = preprocessing.StandardScaler()

    column_frame = dataset_file[[column_name]]  # one-column DataFrame (2-D)
    scaler = scaler.fit(column_frame)
    scaled = scaler.transform(column_frame)
    # Flatten the (n, 1) result back to 1-D before writing the column.
    dataset_file[column_name] = scaled.ravel()
    
    
def categorical_to_numerical(dataset_file, columns=None):
    """Convert categorical columns of ``dataset_file`` to integer codes, in place.

    Args:
        dataset_file: DataFrame to convert; modified in place.
        columns: labels of the columns to treat as categorical. When omitted,
            the module-level ``categorical_columns`` list is used.

    Each listed column is cast to the pandas ``category`` dtype using the
    values found in ``dataset_file`` itself, then every ``category`` column of
    the frame is replaced by its integer codes. Nothing is returned.

    NOTE(review): codes are derived independently for each frame passed in, so
    two frames receive matching codes only if they contain the same set of
    values in a column -- confirm this holds for the training/testing pair.
    """
    if columns is None:
        columns = categorical_columns

    for column in columns:
        # Read from the frame being converted, not from a global frame:
        # pulling the column from another DataFrame would overwrite this
        # frame's data with index-aligned values from that other frame.
        dataset_file[column] = dataset_file[column].astype('category')

    # Includes any column that already had the category dtype on entry.
    cat_columns = dataset_file.select_dtypes(['category']).columns
    for column in cat_columns:
        dataset_file[column] = dataset_file[column].cat.codes


# 1.4 Impute 'unknown' values with each column's most frequent value.
# Testing rows cannot be removed, so 'marital' and 'job' are imputed there as
# well; on the training set only education, housing and loan are imputed.

for column in ('education', 'housing', 'loan', 'marital', 'job'):
    impute_values(column, altered_testing_set_file)

for column in ('education', 'housing', 'loan'):
    impute_values(column, altered_training_set_file)

# 1.5 Normalise consumer price index, consumer confidence index and duration
# on both data sets. Each entry is (column, isMinMax): True -> min-max
# scaling, False -> standard scaling.
# NOTE(review): normalise_values fits a fresh scaler on whichever frame it is
# given, so the testing set is scaled with its own statistics rather than the
# training set's -- confirm this is intended.

scaling_plan = (
    ('cons.price.idx', True),
    ('cons.conf.idx', True),
    ('duration', False),
)

for dataset in (altered_testing_set_file, altered_training_set_file):
    for column, use_min_max in scaling_plan:
        normalise_values(column, use_min_max, dataset)

# 1.6 Convert all categorical values to numerical on both data sets
# (testing set first, then training set).

for dataset in (altered_testing_set_file, altered_training_set_file):
    categorical_to_numerical(dataset)


In [15]:
### Part 2: training set partition and knn model generation


# Part 2.1: separate Final_Y (the target) from the rest of the columns
final_y_column = altered_training_set_file['Final_Y']
training_set = altered_training_set_file.drop(columns='Final_Y')


# Part 2.2: create 70/30 partition, shuffling first.
# random_state is pinned so the split -- and therefore the accuracy curve and
# the selected k -- is the same on every Restart & Run All.
x_train, x_final_y, y_train, y_final_y = train_test_split(
    training_set, final_y_column, shuffle=True, train_size=0.7, random_state=42)

# Part 2.3: find the best k. Fit knn for each candidate k and record its
# accuracy on the held-out 30% partition.
k_range = range(1, 150)
scores = {}  # k -> accuracy on the held-out partition

for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    y_prediction = knn.predict(x_final_y)
    scores[k] = metrics.accuracy_score(y_final_y, y_prediction)

# Accuracies in the same order as k_range, for plotting.
scores_list = [scores[k] for k in k_range]


# Part 2.4: k evaluation, plotting relationship between k and testing accuracy
fig, ax = plt.subplots()
ax.plot(k_range, scores_list)
ax.set_xlabel('Value of k for KNN')
ax.set_ylabel('Testing Accuracy')
plt.show()

# Part 2.5: storing ideal k as a variable for part 3.
# Despite its name, ideal_k holds the best ACCURACY; ideal_k_index holds the
# k that achieved it. The names are kept because later cells read them.
ideal_k = max(scores_list)
# Look the k up by score instead of `list index + 1`, which was only correct
# while k_range started at 1. Ties still resolve to the smallest k.
ideal_k_index = max(scores, key=scores.get)



In [174]:
### Part 3: ideal k model implementation and prediction

# Part 3.1: re-create the model using the ideal k and the entire training set
knn = KNeighborsClassifier(n_neighbors=ideal_k_index)
knn.fit(training_set, final_y_column)

# Part 3.2: use the preprocessed testing set to predict the values with the
# trained model
testing_set = altered_testing_set_file
predicted_final_y = knn.predict(testing_set)

# Part 3.3: pair each prediction with an ascending, 1-based row ID and put the
# results in a new csv file. The IDs are derived from the number of rows
# actually predicted rather than from a hard-coded row count, so the two
# columns always have matching lengths.
prediction_row_id = list(range(1, len(testing_set) + 1))

predicted_testing_set = pd.DataFrame()
predicted_testing_set['row ID'] = prediction_row_id
predicted_testing_set['Final_Y'] = predicted_final_y
# NOTE(review): to_csv also writes the DataFrame's 0-based index as an unnamed
# first column next to 'row ID'; pass index=False if that is not wanted.
predicted_testing_set.to_csv('Assignment3_Result.csv')
