In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Sample data
# Replace with your dataset
df = pd.read_csv('data/train_tfidf_features.csv')

# Separate features and labels
X = df.drop(columns=['id', 'label'])
y = df['label']

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA(n_components=500)  # Reduce to 1000 dimensions, adjust as needed
X_pca = pca.fit_transform(X_scaled)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Train the k-NN classifier
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)

# Make predictions
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')

print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Macro F1 Score: {macro_f1:.2f}')

# Output PCA components for visualization (optional)
print('PCA Components:')
print(X_pca)


Accuracy: 64.24%
Macro F1 Score: 0.54
PCA Components:
[[-0.4318212  -0.18671266 -0.34890395 ... -0.66894625  0.98982921
  -0.06042008]
 [ 0.41632078 -0.02333871  0.11932892 ... -1.10194011  1.48708064
  -0.13697529]
 [ 0.17410117 -0.29091741  0.08949273 ...  0.22600783 -0.52867027
   0.76396896]
 ...
 [ 0.27836429 -0.2422294  -0.5417945  ... -0.41260028  2.59672937
  -1.20860023]
 [-0.10777522 -0.10739739 -0.03980328 ...  0.46834836 -0.36716416
  -0.30179735]
 [-0.4892356  -0.04338433 -0.0978672  ... -0.38149282  1.81966848
  -0.76159285]]
