In [1]:
import pandas as pd

# Load the dataset
file_path = "/kaggle/input/cf-data/codeforces_user_data_with_category_counts.csv"
df = pd.read_csv(file_path)

# Display basic information
#df.info()
#df.head()
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Drop 'User ID' since it's non-numeric
df = df.drop(columns=['User ID'])

# Define features and target
X = df.drop(columns=['Current Job Status'])  # Features
y = df['Current Job Status']  # Target variable

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split dataset (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.cluster import KMeans
import numpy as np

# Define KNN classifier
knn = KNeighborsClassifier(n_neighbors=4)

# Train the model
knn.fit(X_train, y_train)

# Predict on the test set
y_pred = knn.predict(X_test)

# Compute Inertia using K-Means (since KNN itself doesn't have inertia)
kmeans = KMeans(n_clusters=len(set(y_test)), random_state=42, n_init=10)
kmeans.fit(X_test)
inertia = kmeans.inertia_

# Compute Silhouette Score (measures cluster separation)
silhouette = silhouette_score(X_test, y_pred)

# Compute Adjusted Rand Index (measures clustering agreement)
ari = adjusted_rand_score(y_test, y_pred)

# Print evaluation metrics
print(f"Inertia (Within-Cluster Sum of Squares): {inertia:.2f}")
print(f"Silhouette Score: {silhouette:.2f}")
print(f"Adjusted Rand Index (ARI): {ari:.2f}")


Inertia (Within-Cluster Sum of Squares): 1560.46
Silhouette Score: 0.07
Adjusted Rand Index (ARI): 0.48


In [3]:
import numpy as np
from scipy.spatial.distance import euclidean

use_id = 3
# Select only topic columns (excluding ID, job status, etc.)
topic_columns = df.columns[12:]  # Problem-solving categories
df_topics = df[topic_columns]
# Define a sample target user (replace this with actual user data)
target_user = df_topics.iloc[use_id].values  # Taking the first user as an example

# Compute Euclidean distances to all users
df["Distance"] = df_topics.apply(lambda row: euclidean(row.values, target_user), axis=1)

# Sort users by closest distance
df_sorted = df.sort_values(by="Distance").reset_index(drop=True)
# Get the closest user (excluding the target user itself)
closest_user = df_sorted.iloc[1][topic_columns]  # Second row (first is itself)

weak_topics = (closest_user - target_user) > 10  # Threshold: Difference of 5 problems

suggested_topics = topic_columns[weak_topics]
df2=pd.read_csv(file_path)
print("for user :",df2.iloc[use_id,0])
print("Recommended Topics for Improvement:", list(suggested_topics))

for user : _c_k_r_
Recommended Topics for Improvement: ['solved_1200_1400', 'greedy', 'binary search', 'strings']
