In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Read the pre-processed dataset into memory.
df = pd.read_csv('pre_processed_data.csv')

# Columns kept out of the feature matrix (this includes the target itself).
EXCLUDED_COLUMNS = ['userName', 'major', 'univName', 'admit', 'greV', 'greQ']

# Feature matrix: every remaining column of the frame.
X = df.drop(columns=EXCLUDED_COLUMNS)

# Target vector: the 'univName' column.
y = df["univName"]

# Report how many distinct target labels are present.
print("Number of unique universities:", y.nunique())


Number of unique universities: 54


In [None]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")


In [4]:
# Number of highest-probability classes reported per test row.
TOP_K = 5

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# The split is not stratified on y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Clean BOTH splits the same way, using statistics learned from the training
# split only: +/-inf becomes NaN, then NaN becomes the training-column mean.
# New frames are built instead of mutating the slices returned by
# train_test_split in place, so re-running the cell gives the same result and
# predict_proba receives data prepared exactly like the data used for fitting.
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

# Parameter grid for GridSearchCV. Each list holds a single value, so the
# "search" evaluates exactly one candidate with 5-fold cross-validation.
param_grid = {
    'n_estimators': [100],
    'max_depth': [10],
    'min_samples_split': [10]
}

# Fit the random forest through GridSearchCV, scored on accuracy.
rf_model = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best hyperparameters and the estimator refitted with them.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Per-class predicted probabilities for every test row; column j belongs to
# classes[j].
y_pred_proba = best_model.predict_proba(X_test)
classes = best_model.classes_

# Usernames of the test rows, fetched in one lookup. X_test keeps the index
# labels of df, so this lines up row-for-row with y_pred_proba.
test_usernames = df.loc[X_test.index, 'userName']

# For each test row, collect (username, [(class, probability), ...]) holding
# the TOP_K most probable classes in descending probability order.
top_five_predictions_with_username = []
for username, row in zip(test_usernames, y_pred_proba):
    # argsort is ascending; reverse it and keep the first TOP_K positions.
    top_five_indices = np.argsort(row)[::-1][:TOP_K]
    top_five_predictions_with_username.append(
        (username, list(zip(classes[top_five_indices], row[top_five_indices])))
    )

# Print the predictions for each test row. The values labelled "precision" in
# the message are the model's predicted class probabilities.
for username, prediction in top_five_predictions_with_username:
    print('\n',f"Username: {username}, Top five precision predictions: {prediction}")




 Username: sai163, Top five precision predictions: [('Cornell University', 0.3221498640309508), ('University of Southern California', 0.16236148691043278), ('University of Illinois Urbana-Champaign', 0.1117374682675841), ('Carnegie Mellon University', 0.08511802272824266), ('Columbia University', 0.07514239554528684)]

 Username: anand91, Top five precision predictions: [('North Carolina State University', 0.08925016037757805), ('University of Minnesota Twin Cities', 0.08162701557060881), ('University of Florida', 0.07743097606344854), ('Ohio State University Columbus', 0.05764507689812985), ('Purdue University', 0.05419916417785232)]

 Username: Taruna_1, Top five precision predictions: [('University of Utah', 0.3725909501425643), ('University of Texas Arlington', 0.32843351954810635), ('University of Texas Dallas', 0.09209834390046329), ('Northeastern University', 0.02506344470767651), ('Arizona State University', 0.022832931802299606)]

 Username: raswashere, Top five precision pre

In [5]:
# Number of highest-probability classes that count as a "hit".
TOP_K = 5


def count_top_k_hits(proba, class_labels, y_true, k=TOP_K):
    """Count rows whose true label is among the k most probable classes.

    Parameters
    ----------
    proba : 2-D array, one row of class probabilities per sample.
    class_labels : 1-D array; class_labels[j] is the label for column j.
    y_true : iterable of true labels, in the same row order as proba.
    k : how many of the highest-probability classes to consider per row.

    Returns
    -------
    The number of rows for which the true label is in the top k.
    """
    return sum(
        # argsort is ascending; reverse it and keep the first k positions.
        actual in class_labels[np.argsort(row)[::-1][:k]]
        for actual, row in zip(y_true, proba)
    )


# Reuse the estimator fitted in the previous cell. Refitting the identical
# single-candidate grid on the same X_train / y_train with the same seed
# only repeats that work and yields the same model.
y_pred_proba = best_model.predict_proba(X_test)
classes = best_model.classes_

# Fail with a clear message instead of a bare ZeroDivisionError.
if len(y_test) == 0:
    raise ValueError("y_test is empty; cannot compute top-5 accuracy")

# Evaluate top-5 accuracy on the test set.
correct_top_5_count = count_top_k_hits(y_pred_proba, classes, y_test)
top_5_accuracy = correct_top_5_count / len(y_test)

print(f"Top-5 Accuracy: {top_5_accuracy:.2%}")




Top-5 Accuracy: 86.08%
