In [1]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [4]:

# Load the CSV file into a DataFrame
df = pd.read_csv('pre_processed_data.csv')

# Drop rows where 'admit' column is 0
df = df[df['admit'] != 0]

# Reset index after dropping rows
df.reset_index(drop=True, inplace=True)

In [7]:
# Features (X)
X_user = df[['researchExp', 'industryExp', 'internExp', 'journalPubs', 'confPubs', 'cgpa', 'gre_score', 'univName']]

# Target (y)
y_user = df['userName']

In [8]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'univName' column
X_user['univName'] = label_encoder.fit_transform(X_user['univName'])

# Print the transformed DataFrame
print(X_user)

       researchExp  industryExp  internExp  journalPubs  confPubs    cgpa  \
0                0           18        5.0            0         0  0.8500   
1                0           66        0.0            0         0  0.7828   
2                0            0        0.0            0         0  0.5700   
3                0            0        0.0            0         0  0.6220   
4                0            0        0.0            0         0  0.5200   
...            ...          ...        ...          ...       ...     ...   
25827            0            0        0.0            0         0  0.7400   
25828            0            0        0.0            0         0  0.8200   
25829            0            0        0.0            0         0  0.8400   
25830            0            0        0.0            0         0  0.7200   
25831            0            0        0.0            0         0  0.9160   

       gre_score  univName  
0            276        53  
1            276 

In [9]:
from scipy.stats import pearsonr


# Compute the correlation matrix
correlation_matrix = X_user.corr()

# Replace NaN values with 0 (if any)
correlation_matrix = correlation_matrix.fillna(0)

# Print the correlation matrix
print("Correlation Matrix:")
print(correlation_matrix)


Correlation Matrix:
             researchExp  industryExp  internExp  journalPubs  confPubs  \
researchExp     1.000000     0.154760   0.185785     0.227899  0.393235   
industryExp     0.154760     1.000000   0.230582     0.114637  0.129604   
internExp       0.185785     0.230582   1.000000     0.175792  0.144875   
journalPubs     0.227899     0.114637   0.175792     1.000000  0.298214   
confPubs        0.393235     0.129604   0.144875     0.298214  1.000000   
cgpa            0.011375    -0.007821   0.006393     0.004468  0.019463   
gre_score       0.024903    -0.043140   0.016955     0.018804  0.034626   
univName       -0.004072     0.003344   0.003629    -0.002442 -0.005425   

                 cgpa  gre_score  univName  
researchExp  0.011375   0.024903 -0.004072  
industryExp -0.007821  -0.043140  0.003344  
internExp    0.006393   0.016955  0.003629  
journalPubs  0.004468   0.018804 -0.002442  
confPubs     0.019463   0.034626 -0.005425  
cgpa         1.000000   0.069650  

In [11]:
###KNN
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
import joblib

# Fill NaN and infinity values with 0
X_user.fillna(0, inplace=True)
X_user.replace([np.inf, -np.inf], 0, inplace=True)
y_user.fillna(0, inplace=True)
y_user.replace([np.inf, -np.inf], 0, inplace=True)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_user, y_user, test_size=0.2, random_state=42)

y_train.to_csv('./djangoApp/gradvisor/y_train.csv', index=False)

# Define KNN model
knn = KNeighborsRegressor(n_neighbors=3)

# Fit KNN model on training data
knn.fit(X_train, y_train)

# Save the trained model
joblib.dump(knn, './djangoApp/gradvisor/knn_model.pkl')

# Compute similarity between test user and each user in the training data
top_similar_users = []
for i in range(len(X_test)):
    distances, indices = knn.kneighbors([X_test.iloc[i]], n_neighbors=3)
    top_similar_users.append([(idx, y_train.iloc[idx], 1 - distance) for distance, idx in zip(distances[0], indices[0])])

# Print the top 3 similar users for each test user
for test_user, similar_users in zip(y_test, top_similar_users):
    print(f"Test User: {test_user}, Top 3 Similar Users:")
    for idx, (user_idx, similarity, distance) in enumerate(similar_users):
        print(f"   {idx+1}. User Index: {user_idx}, Similarity: {similarity}, Distance: {distance}")


Test User: sai163, Top 3 Similar Users:
   1. User Index: 14756, Similarity: Bontrey, Distance: 0.9955
   2. User Index: 15548, Similarity: vickybogs, Distance: 0.992
   3. User Index: 8742, Similarity: tachyon11, Distance: 0.991
Test User: anand91, Top 3 Similar Users:
   1. User Index: 11654, Similarity: enthusiastic, Distance: 0.9996999999999999
   2. User Index: 2096, Similarity: vijay4454, Distance: 0.9996999999999999
   3. User Index: 9713, Similarity: nehapawar, Distance: 0.9996999999999999
Test User: Taruna_1, Top 3 Similar Users:
   1. User Index: 2589, Similarity: siddhudb, Distance: 0.998
   2. User Index: 10335, Similarity: kiranchitturi, Distance: 0.997
   3. User Index: 6999, Similarity: ajay cj, Distance: 0.997
Test User: raswashere, Top 3 Similar Users:
   1. User Index: 12224, Similarity: niran3090, Distance: -0.4144907210724289
   2. User Index: 48, Similarity: eeevk, Distance: -0.41477913470619154
   3. User Index: 16044, Similarity: ashwinipatil9a, Distance: -0.4198