In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

def load_data(file_path, is_train=True):
    """Read a CSV file and return its contents as NumPy arrays.

    Args:
        file_path: Path of the CSV file to read.
        is_train: When True, the first column is split off as the label
            and the remaining columns are returned as features. When
            False, every column is returned as a feature.

    Returns:
        ``(X, y)`` when ``is_train`` is True, otherwise just ``X``.
    """
    frame = pd.read_csv(file_path)

    # Unlabelled data: the whole table is the feature matrix.
    if not is_train:
        return frame.values

    # Labelled data: column 0 is the target, the rest are features.
    labels = frame.iloc[:, 0].values
    features = frame.iloc[:, 1:].values
    return features, labels

def normalise(X):
    """Standardise each column of ``X`` to zero mean and unit variance.

    Columns whose standard deviation is exactly zero are only centred
    (mean subtracted), which avoids a division by zero.

    Args:
        X: Array-like of samples by features.

    Returns:
        A new floating-point array with the same shape as ``X``. The
        input is not modified.
    """
    X = np.asarray(X)
    # Integer (or boolean) input must be promoted first: writing the
    # standardised values back into an integer buffer would silently
    # truncate them.
    if not np.issubdtype(X.dtype, np.inexact):
        X = X.astype(np.float64)

    X_mean = X.mean(axis=0)
    X_std = X.std(axis=0)

    # Dividing constant columns by 1 leaves them as (X - mean).
    safe_std = np.where(X_std == 0, 1, X_std)

    X_normalised = X - X_mean
    X_normalised /= safe_std
    return X_normalised

def apply_pca(X_train, X_test, n_components):
    """Fit PCA on the training matrix and project both matrices with it.

    Args:
        X_train: Training features; the PCA is fitted on these.
        X_test: Test features; projected with the fitted PCA.
        n_components: Passed to ``PCA`` as ``n_components``.

    Returns:
        ``(X_train_pca, X_test_pca)``: the projected matrices.
    """
    reducer = PCA(n_components=n_components)
    # Fit on the training data only, then reuse the same projection for test.
    train_projected = reducer.fit_transform(X_train)
    return train_projected, reducer.transform(X_test)

def train_knn(X_train, y_train, n_neighbors=2):
    """Build a ``KNeighborsClassifier`` and fit it on the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        n_neighbors: Passed to the classifier as ``n_neighbors``.

    Returns:
        The fitted classifier.
    """
    # ``fit`` returns the estimator itself, so the call can be chained.
    return KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)

def predict_knn(knn, X_test):
    """Return the result of ``knn.predict`` for the rows of ``X_test``.

    Args:
        knn: An object exposing a ``predict`` method.
        X_test: Feature matrix to pass to ``predict``.

    Returns:
        Whatever ``knn.predict(X_test)`` returns.
    """
    predictions = knn.predict(X_test)
    return predictions

def evaluate_model(y_true, y_pred):
    """Compute the macro-averaged F1 score of the predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The value of ``f1_score(y_true, y_pred, average='macro')``.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    return macro_f1

# Locations of the TF-IDF feature files on Kaggle.
file_path_train = "/kaggle/input/mlprojectdataset/train_tfidf_features.csv"
file_path_test = "/kaggle/input/mlprojectdataset/test_tfidf_features.csv"
X_train, y_train = load_data(file_path_train, is_train=True)
X_test = load_data(file_path_test, is_train=False)

# Scale the test set with the TRAINING mean/std rather than its own.
# Standardising each split with its own statistics puts the two matrices
# on different scales, so the PCA projection and KNN distances learned on
# the training data would not line up with the test features.
# The statistics are taken before X_train is overwritten below.
train_mean = X_train.mean(axis=0)
train_std = X_train.std(axis=0)
# Constant training columns are only centred, mirroring normalise().
train_scale = np.where(train_std == 0, 1, train_std)
X_test = (X_test - train_mean) / train_scale
X_train = normalise(X_train)

# PCA dimensionalities to try; one prediction file is written per value.
pca_components = [2000, 1000, 500, 100]
results = {}

for n in pca_components:
    print(f'Applying PCA with {n} components...')

    # Apply PCA (fitted on the training data, reused for the test data)
    X_train_pca, X_test_pca = apply_pca(X_train, X_test, n_components=n)

    # Train KNN
    knn = train_knn(X_train_pca, y_train, n_neighbors=2)

    # Predict on test set
    y_test_pred = predict_knn(knn, X_test_pca)

    # Save the predictions to a CSV file
    prediction_file = f"KNN_Prediction_{n}.csv"
    pd.DataFrame(y_test_pred, columns=["label"]).to_csv(prediction_file, index=False)

# Submit the predictions to Kaggle to get the Macro F1 score