In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

In [30]:

# # Load the CSV file into a DataFrame
# df = pd.read_csv('pre_processed_data.csv')

# # Drop specified columns from DataFrame
# X = df.drop(columns=['userName', 'major', 'univName', 'admit', 'greV', 'greQ'])

# # Assign 'univName' column to y
# y = df["univName"]

# # Print the number of unique universities
# print("Number of unique universities:", df['univName'].nunique())

# Load data from the first CSV file into a DataFrame
df1 = pd.read_csv('pre_processed_data.csv')

# Load data from the second CSV file into another DataFrame
df2 = pd.read_csv('generated_samples_extended_univ.csv')

# Concatenate or merge the two DataFrames
combined_df = pd.concat([df1, df2])

# Proceed with the remaining code using the combined DataFrame
# Drop specified columns from DataFrame
X = combined_df.drop(columns=['userName', 'major', 'univName', 'admit', 'greV', 'greQ'])

# Assign 'univName' column to y
y = combined_df["univName"]

# Print the number of unique universities
print("Number of unique universities:", combined_df['univName'].nunique())


Number of unique universities: 54


In [31]:
X.head()

Unnamed: 0,researchExp,industryExp,internExp,journalPubs,confPubs,cgpa,gre_score
0,0,18,5.0,0,0,0.85,276
1,0,66,0.0,0,0,0.7828,276
2,0,0,0.0,0,0,0.57,276
3,0,0,0.0,0,0,0.622,276
4,0,0,0.0,0,0,0.52,276


In [32]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np
import pandas as pd


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train[np.isinf(X_train)] = np.nan
X_train.fillna(X_train.mean(), inplace=True)  # Replace NaNs with mean

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],  # Different values for the number of estimators
    'learning_rate': [0.01, 0.1, 1.0]  # Different values for the learning rate
}

# Perform GridSearchCV to find the best hyperparameters
ada_model = AdaBoostClassifier(random_state=42)
grid_search = GridSearchCV(ada_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best hyperparameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Model evaluation
# Get the predicted probabilities for each class
y_pred_proba = best_model.predict_proba(X_test)

# Get the classes
classes = best_model.classes_

# Initialize a list to store top five precision predictions and usernames for each row
top_five_predictions_with_username = []

# Iterate over each row in y_pred_proba
for i, row in enumerate(y_pred_proba):
    # Get the indices of the top five classes with the highest probabilities
    top_five_indices = np.argsort(row)[::-1][:5]
    
    # Get the corresponding classes and probabilities
    top_five_classes = classes[top_five_indices]
    top_five_probs = row[top_five_indices]
    
    # Find the row index in the original DataFrame df that matches the current row in X_test
    original_index = X_test.index[i]
    
    # Find the username corresponding to the original index
    username = df.loc[original_index, 'userName']
    
    # Store the top five precision predictions and username for this row
    top_five_predictions_with_username.append((username, list(zip(top_five_classes, top_five_probs))))

# Print the top five precision predictions and username for each row
for username, prediction in top_five_predictions_with_username:
    print('\n',f"Username: {username}, Top five precision predictions: {prediction}")



 Username: talktorohit54, Top five precision predictions: [('University of Southern California', 0.07162242763644533), ('Carnegie Mellon University', 0.0586971748321303), ('Georgia Institute of Technology', 0.05010224760468379), ('North Carolina State University', 0.049079486301259995), ('Ohio State University Columbus', 0.0437969056749326)]

 Username: deedhero, Top five precision predictions: [('University of Southern California', 0.07162242763644533), ('Carnegie Mellon University', 0.0586971748321303), ('Georgia Institute of Technology', 0.05010224760468379), ('North Carolina State University', 0.049079486301259995), ('Ohio State University Columbus', 0.0437969056749326)]

 Username: shreyansh219, Top five precision predictions: [('University of Florida', 0.06491892380317116), ('University of Southern California', 0.06294610936470238), ('Carnegie Mellon University', 0.05192535894416434), ('North Carolina State University', 0.049734227985248784), ('Georgia Institute of Technology', 

In [33]:
# Initialize a variable to count the correct predictions within top 5
correct_top_5_count = 0

# Iterate over each row in y_pred_proba
for i, row in enumerate(y_pred_proba):
    # Get the true label of the current instance
    true_label = y_test.iloc[i]
    
    # Get the indices of the top five classes with the highest probabilities
    top_five_indices = np.argsort(row)[::-1][:5]
    
    # Get the corresponding classes
    top_five_classes = classes[top_five_indices]
    
    # Check if the true label is within the top five predicted labels
    if true_label in top_five_classes:
        correct_top_5_count += 1

# Calculate the top-5 accuracy
top_5_accuracy = correct_top_5_count / len(y_test)

# Print the top-5 accuracy
print(f"Top-5 Accuracy: {top_5_accuracy:.2%}")


Top-5 Accuracy: 78.00%


In [11]:
import joblib

# Assuming you have trained your model and named it best_model

# Save the trained model to a file
joblib.dump(best_model, './djangoApp/gradvisor/adaboost_model.pkl')

In [35]:
from sklearn.metrics import ndcg_score
from itertools import takewhile

# Define a function to calculate Recall@k
def recall_at_k(y_true, y_pred, k):
    # Iterate over each sample's true label and predicted labels
    recalls = []
    for true_label, pred_labels in zip(y_true, y_pred):
        # Check if the true label is in the top-k predictions
        recall = np.sum(true_label in pred_labels[:k]) / np.sum(true_label == true_label)
        recalls.append(recall)
    # Average recall over all samples
    return np.mean(recalls)

# Define a function to calculate MRR@k (Mean Reciprocal Rank)
def mrr_at_k(y_true, y_pred, k):
    mrrs = []
    for true_label, pred_labels in zip(y_true, y_pred):
        # Find the position of the first correct prediction
        position = next((i for i, label in enumerate(pred_labels[:k]) if label == true_label), None)
        if position is not None:
            # Calculate reciprocal rank
            mrr = 1 / (position + 1)
            mrrs.append(mrr)
    # Average MRR over all samples
    return np.mean(mrrs)

# Define a function to calculate HIT@k (Hit Rate)
def hit_at_k(y_true, y_pred, k):
    hits = []
    for true_label, pred_labels in zip(y_true, y_pred):
        # Check if the true label is in the top-k predictions
        hit = 1 if true_label in pred_labels[:k] else 0
        hits.append(hit)
    # Average HIT over all samples
    return np.mean(hits)

# Define a function to calculate Precision@k
def precision_at_k(y_true, y_pred, k):
    precisions = []
    for true_label, pred_labels in zip(y_true, y_pred):
        # Count the number of correct predictions in top-k
        num_correct = sum(1 for label in pred_labels[:k] if label == true_label)
        # Calculate precision
        precision = num_correct / k
        precisions.append(precision)
    # Average precision over all samples
    return np.mean(precisions)

# Get the top 5 predictions for each sample
top_five_predictions = [list(map(lambda x: x[0], pred)) for _, pred in top_five_predictions_with_username]

# Calculate and print the metrics
recall_5 = recall_at_k(y_test, top_five_predictions, 5)
#ndcg_5 = ndcg_at_k(y_test, top_five_predictions, 5)
mrr_5 = mrr_at_k(y_test, top_five_predictions, 5)
hit_5 = hit_at_k(y_test, top_five_predictions, 5)
precision_5 = precision_at_k(y_test, top_five_predictions,5)

print(f"Recall@5: {recall_5:.4f}")
#print(f"NDCG@5: {ndcg_5:.4f}")
print(f"MRR@5: {mrr_5:.4f}")
print(f"HIT@5: {hit_5:.4f}")
print(f"Precision@5: {precision_5:.4f}")


Recall@5: 0.7800
MRR@5: 0.7300
HIT@5: 0.7800
Precision@5: 0.1560
