# KNN

In [31]:
import numpy as np
import time
import joblib
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from data_preprocess import load_training_data
from sklearn.metrics import f1_score

def remove_outliers(df, feature_names):
    '''
    For each of the features from the dataset, remove the outliers.
    
    Parameters:
    df: DataFrame
    feature_names: feature names

    Returns:
    cleaned_df: cleaned dataframe
    '''
    clean_df = df.copy()
    for feature in feature_names:
        Q1 = clean_df[feature].quantile(0.1)
        Q3 = clean_df[feature].quantile(0.9)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        clean_df = clean_df[(clean_df[feature] >= lower_bound) & (clean_df[feature] <= upper_bound)]
    return clean_df

# Get the data
X_train, y_train = load_training_data('data/train.csv')

# Select only the specified features
features = ['alcohol', 'sulphates', 'total sulfur dioxide', 'volatile acidity']
X_train = X_train[features]

#X_train = remove_outliers(X_train, features)

# Standardize features instead of just normalizing
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Initialize variables
n_neighbors_values = [5, 10, 15, 20, 25, 50, 75, 100, 150]
weights_options = ['uniform', 'distance']
algorithm_options = ['auto', 'ball_tree', 'kd_tree', 'brute']
distance_options = [1,2]
results = {}
best_model = None
best_score = -np.inf
best_config = None

# Define 5-fold cross validation test harness
kfold = KFold(n_splits=5, shuffle=True)

# Perform 5-fold cross validation
for n_neighbors in n_neighbors_values:
    for weights in weights_options:
        for algorithm in algorithm_options:
            for p in distance_options:
                start_time = time.time()

                # Define the model
                model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm, p=p)

                # Perform cross-validation manually to get the accuracy
                cv_results = []
                for train_index, test_index in kfold.split(X_train):
                    X_train_fold, X_test_fold = X_train[train_index], X_train[test_index]
                    y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
                    model.fit(X_train_fold, y_train_fold)
                    score = model.score(X_test_fold, y_test_fold)
                    cv_results.append(score)

                # Average cross-validation score
                average_score = np.mean(cv_results)
                end_time = time.time()
                elapsed_time = end_time - start_time
                config = (n_neighbors, weights, algorithm, p)

                # Update best model if current model is better
                if average_score > best_score:
                    best_score = average_score
                    best_model = model
                    best_config = config

                print("Config: {}\nCross-validation mean accuracy: {:.2f}%\nElapsed time: {:.2f} seconds\n".format(
                    config, average_score * 100, elapsed_time))

                results[config] = average_score


Config: (5, 'uniform', 'auto', 1)
Cross-validation mean accuracy: 58.00%
Elapsed time: 0.02 seconds

Config: (5, 'uniform', 'auto', 2)
Cross-validation mean accuracy: 58.89%
Elapsed time: 0.02 seconds

Config: (5, 'uniform', 'ball_tree', 1)
Cross-validation mean accuracy: 57.37%
Elapsed time: 0.02 seconds

Config: (5, 'uniform', 'ball_tree', 2)
Cross-validation mean accuracy: 58.27%
Elapsed time: 0.02 seconds

Config: (5, 'uniform', 'kd_tree', 1)
Cross-validation mean accuracy: 58.89%
Elapsed time: 0.02 seconds

Config: (5, 'uniform', 'kd_tree', 2)
Cross-validation mean accuracy: 57.64%
Elapsed time: 0.02 seconds

Config: (5, 'uniform', 'brute', 1)
Cross-validation mean accuracy: 58.00%
Elapsed time: 0.03 seconds

Config: (5, 'uniform', 'brute', 2)
Cross-validation mean accuracy: 57.91%
Elapsed time: 0.04 seconds

Config: (5, 'distance', 'auto', 1)
Cross-validation mean accuracy: 63.71%
Elapsed time: 0.01 seconds

Config: (5, 'distance', 'auto', 2)
Cross-validation mean accuracy: 63.36

## Model's paramemters with the best accuracy

In [32]:
# Output the best model details
print("Best Model Configuration: {}, Cross-validation mean accuracy: {:.2f}%".format(best_config, best_score * 100))
# Train the final model with the best parameters
best_model.fit(X_train, y_train)
# Save the final model
joblib.dump(best_model, 'models/knn.joblib')

Best Model Configuration: (20, 'distance', 'auto', 2), Cross-validation mean accuracy: 67.20%


['models/knn.joblib']

## Acuracy on the test set

In [35]:
from data_preprocess import load_test_data
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import joblib

# Load the test data
X_test, y_test = load_test_data('data/test.csv')

features = ['alcohol', 'sulphates', 'total sulfur dioxide', 'volatile acidity']
X_test = X_test[features]

# Preprocess the test data
scaler = StandardScaler()

X_test = scaler.fit_transform(X_test)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy on test data: {:.2f}%".format(accuracy * 100))


Accuracy on test data: 66.04%


In [37]:
# Calculate the F1 score
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the accuracy and F1 score
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

F1 Score: 0.6484396551724138
Accuracy: 0.6604166666666667
