# KNN

In [47]:
import os
import sys
import numpy as np
import time
import joblib
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
sys.path.append("..")
from data_preprocess import load_training_data

def remove_outliers(df, feature_names):
    clean_df = df.copy()
    for feature in feature_names:
        Q1 = clean_df[feature].quantile(0.1)
        Q3 = clean_df[feature].quantile(0.9)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        clean_df = clean_df[(clean_df[feature] >= lower_bound) & (clean_df[feature] <= upper_bound)]
    return clean_df

# Get the data
X_train, y_train = load_training_data('data/train.csv')

# Select only the specified features
# alcohol
# sulphates
# total sulfur dioxide
# volatile acidity
# density
# chlorides
# residual sugar
# fixed acidity
# pH
# citric acid
# free sulfur dioxide
features = ['alcohol', 'sulphates', 'total sulfur dioxide', 'volatile acidity']
X_train = X_train[features]

#X_train = remove_outliers(X_train, features)

# Standardize features instead of just normalizing
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Initialize variables
n_neighbors_values = [5, 10, 15, 20, 25, 50, 75, 100, 150]
#n_neighbors_values = [3, 5, 7]
weights_options = ['uniform', 'distance']
algorithm_options = ['auto', 'ball_tree', 'kd_tree', 'brute']
distance_options = [1,2]
results = {}
best_model = None
best_score = -np.inf
best_config = None

# Define 5-fold cross validation test harness
kfold = KFold(n_splits=5, shuffle=True)

# Perform 5-fold cross validation
for n_neighbors in n_neighbors_values:
    for weights in weights_options:
        for algorithm in algorithm_options:
            for p in distance_options:
                start_time = time.time()

                # Define the model
                model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, p=p)

                # Perform cross-validation manually to get the accuracy
                cv_results = []
                for train_index, test_index in kfold.split(X_train):
                    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
                    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
                    model.fit(X_train_fold, y_train_fold)
                    score = model.score(X_test_fold, y_test_fold)
                    cv_results.append(score)

                # Average cross-validation score
                average_score = np.mean(cv_results)
                end_time = time.time()
                elapsed_time = end_time - start_time
                config = (n_neighbors, weights, algorithm, p)

                # Update best model if current model is better
                if average_score > best_score:
                    best_score = average_score
                    best_model = model
                    best_config = config

                # Save the model
                # model_directory = "model_validation"
                # if not os.path.exists(model_directory):
                #     os.makedirs(model_directory)
                # model_filename = f"{model_directory}/knn_model_{n_neighbors}_{weights}_{algorithm}_{p}.joblib"
                # joblib.dump(model, model_filename)

                print("Config: {}\nCross-validation mean accuracy: {:.2f}%\nElapsed time: {:.2f} seconds\n".format(
                    config, average_score * 100, elapsed_time))

                results[config] = average_score


Config: (5, 'uniform', 'auto', 1)
Cross-validation mean accuracy: 57.20%
Elapsed time: 0.03 seconds

Config: (5, 'uniform', 'auto', 2)
Cross-validation mean accuracy: 59.61%
Elapsed time: 0.02 seconds

Config: (5, 'uniform', 'ball_tree', 1)
Cross-validation mean accuracy: 57.82%
Elapsed time: 0.03 seconds

Config: (5, 'uniform', 'ball_tree', 2)
Cross-validation mean accuracy: 57.91%
Elapsed time: 0.03 seconds

Config: (5, 'uniform', 'kd_tree', 1)
Cross-validation mean accuracy: 58.27%
Elapsed time: 0.02 seconds

Config: (5, 'uniform', 'kd_tree', 2)
Cross-validation mean accuracy: 59.87%
Elapsed time: 0.03 seconds

Config: (5, 'uniform', 'brute', 1)
Cross-validation mean accuracy: 56.84%
Elapsed time: 0.03 seconds

Config: (5, 'uniform', 'brute', 2)
Cross-validation mean accuracy: 57.91%
Elapsed time: 0.04 seconds

Config: (5, 'distance', 'auto', 1)
Cross-validation mean accuracy: 62.56%
Elapsed time: 0.01 seconds

Config: (5, 'distance', 'auto', 2)
Cross-validation mean accuracy: 62.37

## Model's paramemters with the best accuracy

In [48]:
# Output the best model details
print("Best Model Configuration: {}, Cross-validation mean accuracy: {:.2f}%".format(best_config, best_score * 100))
# Optionally save the best model
# best_model_filename = f"{model_directory}/best_knn_model.joblib"
# joblib.dump(best_model, best_model_filename)
best_model.fit(X_train, y_train)
# Find the best parameters
# best_params = results.loc[results['accuracy'].idxmax()]
# print(f"Best parameters: {best_params}")

# # Train the final model with the best parameters
# final_model = KNeighborsClassifier(n_neighbors=best_params['n_neighbors'], 
#                             weights=best_params['weights'], 
#                             algorithm=best_params['algorithm'], 
#                             p=best_params['p'], )

# final_model.fit(X_train, y_train)

# # Save the final model
joblib.dump(best_model, 'models/knn.joblib')

Best Model Configuration: (100, 'distance', 'brute', 1), Cross-validation mean accuracy: 67.29%


['models/knn.joblib']

## Acuracy on the test set

In [49]:
from data_preprocess import load_test_data
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import joblib

# Load the test data
X_test, y_test = load_test_data('data/test.csv')

features = ['alcohol', 'sulphates', 'total sulfur dioxide', 'volatile acidity']
X_test = X_test[features]

# Preprocess the test data
scaler = StandardScaler()

X_test = scaler.fit_transform(X_test)

# Load the trained model
# Assuming you want to load the best model previously saved
# model_directory = "model_validation"
# best_model_filename = f"{model_directory}/best_knn_model.joblib"
# model = joblib.load(best_model_filename)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data: {:.2f}%".format(accuracy * 100))


Accuracy on test data: 60.42%
