## Imports

In [121]:
import pandas as pd
import warnings

# Silence every warning category for the rest of the session so cell outputs stay short.
# NOTE(review): this also hides any warnings raised by pandas / scikit-learn in the cells
# below; consider narrowing or removing the filter while debugging.
warnings.filterwarnings("ignore")

## Loading Data

In [122]:
# Read the training split and the validation split; the validation split is
# bound to `df_test` and serves as the held-out set in the cells below.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/data_train.csv', 'data/data_validation.csv')
)

In [123]:
# Separate the `price_range` label column from the remaining feature columns
# for both the training frame and the held-out frame.
X_train, y_train = df_train.drop(columns='price_range'), df_train['price_range']
X_test, y_test = df_test.drop(columns='price_range'), df_test['price_range']

# Utilities

In [124]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

def get_x_test_best_features(n_features, X_train, y_train, X_test):
    """Keep the ``n_features`` best-scoring columns according to ``f_classif``.

    The selector is fitted on the training data only. The test frame is then
    reduced to the same columns by name, so ``X_test`` may carry extra columns
    that the training frame does not have.

    Parameters
    ----------
    n_features : int
        Number of features to keep (forwarded to ``SelectKBest`` as ``k``).
    X_train : DataFrame
        Training features; must expose column names, since the fitted
        selector's ``feature_names_in_`` attribute is read.
    y_train : array-like
        Training labels.
    X_test : DataFrame
        Frame that is indexed with the selected column names.

    Returns
    -------
    tuple
        ``(X_train_selected, X_test_selected, selected_features)``: the output
        of ``SelectKBest.fit_transform`` on the training data, ``X_test``
        restricted to the selected columns, and the selected column names.
    """
    kbest = SelectKBest(f_classif, k=n_features)
    train_subset = kbest.fit_transform(X_train, y_train)

    # Resolve the surviving columns by name so the test frame can be subset the same way.
    chosen_columns = kbest.get_feature_names_out(input_features=kbest.feature_names_in_)
    return train_subset, X_test[chosen_columns], chosen_columns

# Hyperparameter Tuning - Find Best Parameters [Experiment with Scikit]

In [130]:
from sklearn.neighbors import KNeighborsClassifier

# Default-configured estimator; the search below builds its own instances and does not use it.
model = KNeighborsClassifier()

# Search space: every feature count from 1 up to the number of columns in X_train,
# k from 1 to 29, three distance metrics and both weighting schemes.
Ns_selected_feature = list(range(1, X_train.shape[1] + 1))
neighbors = list(range(1, 30))
# NOTE(review): scikit-learn documents 'jaccard' as a metric for boolean vectors;
# confirm it is meaningful for these features.
metrics = ['euclidean', 'manhattan', 'jaccard']
weights = ['distance', 'uniform']

# Best configuration found so far; `value` holds its score on the held-out split.
KNN_scikit = None
N_FEATURES_SELECTED = None
BEST_SELECTED_FEATURES = None
BEST_N_NEIGHBORS = None
BEST_METRIC = None
BEST_WEIGHT = None
value = -1

for n in Ns_selected_feature:
    X_train_selected, X_test_selected, selected_features = get_x_test_best_features(n, X_train, y_train, X_test)
    # The models are fitted on the array returned by the selector, so score them on an
    # array as well instead of a DataFrame (avoids scikit-learn's feature-name mismatch warning).
    X_test_array = X_test_selected.to_numpy()

    for neighbor in neighbors:
        for metric in metrics:
            for weight in weights:
                KNN = KNeighborsClassifier(n_neighbors=neighbor, metric=metric, weights=weight)
                KNN.fit(X_train_selected, y_train)
                KNN_score = KNN.score(X_test_array, y_test)
                # Strict '>' keeps the first configuration that reaches a given score.
                if KNN_score > value:
                    KNN_scikit = KNN
                    N_FEATURES_SELECTED = n
                    BEST_SELECTED_FEATURES = selected_features
                    BEST_N_NEIGHBORS = neighbor
                    BEST_METRIC = metric
                    BEST_WEIGHT = weight
                    value = KNN_score

# Report every tuned parameter, including the weighting scheme that later cells reuse.
print('BEST parameters: ', KNN_scikit, N_FEATURES_SELECTED, BEST_SELECTED_FEATURES, BEST_N_NEIGHBORS, BEST_METRIC, BEST_WEIGHT, value)

BEST parameters:  KNeighborsClassifier(metric='euclidean', n_neighbors=17, weights='distance') 4 ['battery_power' 'px_height' 'px_width' 'ram'] 17 euclidean 0.94


`KNN_scikit` is the best-scoring model found by the hyperparameter search above, which was run with scikit-learn's `KNeighborsClassifier`. It is kept for the performance evaluation below.

## KNN Scratch Model Training

In [131]:
from lib.knn import KNeighborsClassifier

# Report the configuration picked by the scikit-learn search before re-training with the
# from-scratch implementation, including the weighting scheme passed to the model below.
print("Training with KNN from Scratch with parameters: ")
print("Selected features count:", N_FEATURES_SELECTED)
print("Selected features:", BEST_SELECTED_FEATURES)
print("K nearest neighbors:", BEST_N_NEIGHBORS)
print("Metric:", BEST_METRIC)
print("Weight:", BEST_WEIGHT)

# Rebuild the selected train/test matrices for the winning feature count; the search loop
# leaves these variables at the values of its last iteration.
X_train_selected, X_test_selected, selected_features = get_x_test_best_features(N_FEATURES_SELECTED, X_train, y_train, X_test)
# Hand the scratch model a plain array rather than a DataFrame.
X_test_selected = X_test_selected.values
KNN_scratch = KNeighborsClassifier(n_neighbors=BEST_N_NEIGHBORS, metric=BEST_METRIC, weight=BEST_WEIGHT)
KNN_scratch.fit(X_train_selected, y_train)

Training with KNN from Scratch with parameters: 
Selected features count: 4
Selected features: ['battery_power' 'px_height' 'px_width' 'ram']
K nearest neighbors: 17
Metric: euclidean


## Performance Evaluation

In [132]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score

def evaluate_classifier_performance(prediction, y_test):
    """Print accuracy plus macro/micro F1, precision and recall for a set of predictions.

    Parameters
    ----------
    prediction : array-like
        Predicted labels, in the same order as ``y_test``.
    y_test : array-like
        Ground-truth labels.

    Returns
    -------
    None
        The scores are only printed, one per line, formatted to two decimals.
    """
    metrics = {'Accuracy Average': accuracy_score(y_test, prediction)}

    # Every averaged scorer gets zero_division=0 so F1 treats undefined scores
    # the same way precision and recall do.
    scorers = (('F1', f1_score), ('Precision', precision_score), ('Recall', recall_score))
    for label, scorer in scorers:
        for average in ('macro', 'micro'):
            metrics[f'{label} {average.capitalize()} Average'] = scorer(
                y_test, prediction, average=average, zero_division=0
            )

    for metric, value in metrics.items():
        print(f'{metric}: {value:.2f}')

### KNN Scikit Performance Evaluation

In [133]:
# Predict the held-out labels with the best scikit-learn model kept by the search cell.
# NOTE(review): X_test_selected is whatever the most recently executed cell left behind;
# by execution order that is the array built in the "KNN Scratch Model Training" cell,
# so that cell has to run before this one.
y_pred_by_scikit = KNN_scikit.predict(X_test_selected)

print('KNN Scikit performances:')
evaluate_classifier_performance(y_pred_by_scikit, y_test)

KNN Scikit performances:
Accuracy Average: 0.94
F1 Macro Average: 0.94
F1 Micro Average: 0.94
Precision Macro Average: 0.94
Precision Micro Average: 0.94
Recall Macro Average: 0.94
Recall Micro Average: 0.94


### KNN Scratch Performance Evaluation

In [134]:
# Predict the same held-out rows with the from-scratch model so its scores can be
# compared with the scikit-learn scores above.
# NOTE(review): relies on X_test_selected and KNN_scratch from the
# "KNN Scratch Model Training" cell.
y_pred_by_scratch = KNN_scratch.predict(X_test_selected)

print('KNN Scratch performances:')
evaluate_classifier_performance(y_pred_by_scratch, y_test)

distance
KNN Scratch performances:
Accuracy Average: 0.94
F1 Macro Average: 0.94
F1 Micro Average: 0.94
Precision Macro Average: 0.94
Precision Micro Average: 0.94
Recall Macro Average: 0.94
Recall Micro Average: 0.94


# Predictions on the Test Data

In [136]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from lib.knn import KNeighborsClassifier

# Refit on the training and validation files combined, using the parameters found by the
# search, then predict `price_range` for data_test.csv and write the submission file.
# NOTE(review): this rebinds df_train / df_test / X_train / y_train / X_test, so the
# earlier cells must be re-run from the top before they are used again.
df_train = pd.concat(
    [pd.read_csv(csv_path) for csv_path in ('data/data_train.csv', 'data/data_validation.csv')]
)
df_test = pd.read_csv('data/data_test.csv')

X_train, y_train = df_train.drop(columns='price_range'), df_train['price_range'].values
X_test = df_test  # same object as df_test, not a copy

X_train_selected, X_test_selected, selected_features = get_x_test_best_features(
    N_FEATURES_SELECTED, X_train, y_train, X_test
)
X_test_selected = X_test_selected.values

KNN = KNeighborsClassifier(n_neighbors=BEST_N_NEIGHBORS, metric=BEST_METRIC, weight=BEST_WEIGHT)
KNN.fit(X_train_selected, y_train)

# Store the predictions both as a variable and as a new column of the test frame.
predictions = KNN.predict(X_test_selected)
df_test['price_range'] = predictions
df_test.to_csv("submission.csv", columns=['id', 'price_range'], index=False)


distance


# Predictions on the Test Data [Parameter Tuning]

In [137]:
from lib.knn import KNeighborsClassifier

# Same pipeline as the previous submission cell, but with hand-picked settings:
# 4 selected features, 2 neighbours, euclidean distance, distance weighting.
df_train = pd.concat(
    [pd.read_csv(csv_path) for csv_path in ('data/data_train.csv', 'data/data_validation.csv')]
)
df_test = pd.read_csv('data/data_test.csv')

X_train, y_train = df_train.drop(columns='price_range'), df_train['price_range'].values
X_test = df_test  # same object as df_test, not a copy

X_train_selected, X_test_selected, _ = get_x_test_best_features(4, X_train, y_train, X_test)
X_test_selected = X_test_selected.values

WeightedKNN = KNeighborsClassifier(n_neighbors=2, metric='euclidean', weight='distance')
WeightedKNN.fit(X_train_selected, y_train)

# Store the predictions both as a variable and as a new column of the test frame.
weighted_predictions = WeightedKNN.predict(X_test_selected)
df_test['price_range'] = weighted_predictions
df_test.to_csv("submission3.csv", columns=['id', 'price_range'], index=False)

distance
