In [1]:
import pandas as pd

# Location of the MBA admissions CSV inside the Kaggle input directory.
MBA = '/kaggle/input/mba-admission-dataset/MBA.csv'

# Load the applications keyed by application_id, fill the gaps in the two
# columns that have a meaningful default (missing admission -> 'Reject',
# missing race -> 'Unknown'), then one-hot encode the categorical columns.
df = (
    pd.read_csv(filepath_or_buffer=MBA, index_col=['application_id'])
    .fillna(value={'admission': 'Reject', 'race': 'Unknown'})
    .pipe(
        pd.get_dummies,
        columns=['gender', 'international', 'major', 'race', 'work_industry'],
    )
)
df.head()

Unnamed: 0_level_0,gpa,gmat,work_exp,admission,gender_Female,gender_Male,international_False,international_True,major_Business,major_Humanities,...,work_industry_Health Care,work_industry_Investment Banking,work_industry_Investment Management,work_industry_Media/Entertainment,work_industry_Nonprofit/Gov,work_industry_Other,work_industry_PE/VC,work_industry_Real Estate,work_industry_Retail,work_industry_Technology
application_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.3,620.0,3.0,Admit,True,False,True,False,True,False,...,False,False,False,False,False,False,False,False,False,False
2,3.28,680.0,5.0,Reject,False,True,True,False,False,True,...,False,False,True,False,False,False,False,False,False,False
3,3.3,710.0,5.0,Admit,True,False,False,True,True,False,...,False,False,False,False,False,False,False,False,False,True
4,3.47,690.0,6.0,Reject,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,True
5,3.35,590.0,5.0,Reject,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False


We expect the classes in our target variable to be unbalanced; how unbalanced are they?

In [2]:
# Fraction of rows falling in each admission class, shown as a plain dict.
admission_shares = df['admission'].value_counts(normalize=True)
admission_shares.to_dict()

{'Reject': 0.8385534388117533,
 'Admit': 0.14530190506942203,
 'Waitlist': 0.01614465611882467}

Very unbalanced; we should probably have modest expectations for building a highly accurate classifier. A classifier that always predicts Reject will be correct almost 84% of the time.

Let's make a couple of quick visualizations to see what we're up against. We might expect people with high GPAs and high GMAT scores to be more likely to be admitted to an MBA program. Let's use a strip plot to see if that is true.

In [3]:
from plotly import express

# GPA on the horizontal axis, GMAT on the vertical axis, coloured by the
# admission outcome so the classes can be compared visually.
express.strip(
    data_frame=df,
    x='gpa',
    y='gmat',
    color='admission',
    height=800,
)

In [4]:
import arrow
from umap import UMAP

time_start = arrow.now()
umap = UMAP(random_state=2024, verbose=True, n_jobs=1, low_memory=False, n_epochs=201)
# Embed the feature columns only. Besides the target, also exclude any 'x'/'y'
# columns left behind by a previous run of this cell; otherwise re-running the
# cell would feed the old embedding back in as input and change the result.
# errors='ignore' keeps the first run (where 'x'/'y' do not exist yet) working.
umap_features = df.drop(columns=['admission', 'x', 'y'], errors='ignore')
# Store the two embedding coordinates on df as 'x' and 'y' for plotting.
df[['x', 'y']] = umap.fit_transform(X=umap_features)
print('done with UMAP in {}'.format(arrow.now() - time_start))

UMAP(low_memory=False, n_epochs=201, n_jobs=1, random_state=2024, verbose=True)
Mon Sep 23 18:26:23 2024 Construct fuzzy simplicial set
Mon Sep 23 18:26:23 2024 Finding Nearest Neighbors
Mon Sep 23 18:26:23 2024 Building RP forest with 9 trees
Mon Sep 23 18:26:29 2024 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
	Stopping threshold met -- exiting after 4 iterations
Mon Sep 23 18:26:47 2024 Finished Nearest Neighbor Search
Mon Sep 23 18:26:50 2024 Construct embedding


Epochs completed:   0%|            0/201 [00:00]

	completed  0  /  201 epochs
	completed  20  /  201 epochs
	completed  40  /  201 epochs
	completed  60  /  201 epochs
	completed  80  /  201 epochs
	completed  100  /  201 epochs
	completed  120  /  201 epochs
	completed  140  /  201 epochs
	completed  160  /  201 epochs
	completed  180  /  201 epochs
	completed  200  /  201 epochs
Mon Sep 23 18:26:55 2024 Finished embedding
done with UMAP in 0:00:32.381293


In [5]:
from plotly import express

# Plot the 'x'/'y' embedding columns stored on df, coloured by admission outcome.
express.scatter(
    data_frame=df,
    x='x',
    y='y',
    color='admission',
)

This is an odd-looking scatter plot; the data is definitely clustering, but the way it is clustering has nothing to do with the target variable. Let's build a model.

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the target so the class
# proportions match in both splits. The UMAP coordinates ('x', 'y') and the
# target itself are excluded from the feature matrix.
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['x', 'y', 'admission']), df['admission'], test_size=0.2, random_state=2024, stratify=df['admission'])

logreg = LogisticRegression(max_iter=1000, tol=1e-12).fit(X_train, y_train)
print('model fit in {} iterations'.format(logreg.n_iter_[0]))

# Predict once and reuse the result for every metric instead of calling
# predict() separately for each report.
logreg_pred = logreg.predict(X=X_test)
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=logreg_pred)))
# zero_division=0 reports 0 instead of warning for a class that is never predicted.
print('f1: {:5.4f}'.format(f1_score(average='weighted', y_true=y_test, y_pred=logreg_pred, zero_division=0)))
print(classification_report(y_true=y_test, y_pred=logreg_pred, zero_division=0))

model fit in 750 iterations
accuracy: 0.8337
f1: 0.7940
              precision    recall  f1-score   support

       Admit       0.43      0.15      0.22       180
      Reject       0.86      0.97      0.91      1039
    Waitlist       0.00      0.00      0.00        20

    accuracy                           0.83      1239
   macro avg       0.43      0.37      0.38      1239
weighted avg       0.78      0.83      0.79      1239



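As expected, the model is almost always predicting Reject, and it is not really performing better than a dummy model: its accuracy of 0.8337 is slightly below the roughly 0.8386 we would get on this test set by always predicting Reject.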

In [7]:
# One bar per feature showing its logistic-regression coefficient. coef_ has one
# row per class in logreg.classes_ order, so row 0 belongs to logreg.classes_[0];
# the title names that class so the chart can be read on its own. Feature names
# come from X_train, the frame the model was actually fitted on.
express.bar(
    x=X_train.columns,
    y=logreg.coef_[0],
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Logistic regression coefficients for class {}'.format(logreg.classes_[0]),
)

In [8]:
from sklearn.ensemble import AdaBoostClassifier

# Fit AdaBoost on the same train split used for the logistic regression.
boost = AdaBoostClassifier(algorithm='SAMME', random_state=2024).fit(X_train, y_train)

# Predict once and reuse the result for every metric instead of calling
# predict() separately for each report.
boost_pred = boost.predict(X=X_test)
print('accuracy: {:5.4f}'.format(accuracy_score(y_true=y_test, y_pred=boost_pred)))
# zero_division=0 reports 0 instead of warning for a class that is never predicted.
print('f1: {:5.4f}'.format(f1_score(average='weighted', y_true=y_test, y_pred=boost_pred, zero_division=0)))
print(classification_report(y_true=y_test, y_pred=boost_pred, zero_division=0))

accuracy: 0.8224
f1: 0.8094
              precision    recall  f1-score   support

       Admit       0.42      0.36      0.39       180
      Reject       0.88      0.92      0.90      1039
    Waitlist       0.00      0.00      0.00        20

    accuracy                           0.82      1239
   macro avg       0.43      0.42      0.43      1239
weighted avg       0.80      0.82      0.81      1239



We can do ever so slightly better on weighted F1 and on recall for the Admit class with AdaBoost, but the dummy model would still be more accurate.