In [65]:
# Initial imports
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import numpy as np

In [45]:
# Read the CSV file into a Pandas DataFrame
# Set the index using the Ticker column
df = pd.read_csv(
    Path("Resources/MBA.csv"),                
    index_col="application_id"
)

# Review the DataFrame
df.head()

Unnamed: 0_level_0,gender,international,gpa,major,race,gmat,work_exp,work_industry,admission
application_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,Female,False,3.3,Business,Asian,620,3,Financial Services,Admit
2,Male,False,3.28,Humanities,Black,680,5,Investment Management,
3,Female,True,3.3,Business,,710,5,Technology,Admit
4,Male,False,3.47,STEM,Black,690,6,Technology,
5,Male,False,3.35,STEM,Hispanic,590,5,Consulting,


In [46]:
df["admit_cluster"] = np.where(df["admission"]=="Admit", 2,
                    np.where(df["admission"]=="Waitlist", 1, 0))
df.head()

Unnamed: 0_level_0,gender,international,gpa,major,race,gmat,work_exp,work_industry,admission,admit_cluster
application_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Female,False,3.3,Business,Asian,620,3,Financial Services,Admit,2
2,Male,False,3.28,Humanities,Black,680,5,Investment Management,,0
3,Female,True,3.3,Business,,710,5,Technology,Admit,2
4,Male,False,3.47,STEM,Black,690,6,Technology,,0
5,Male,False,3.35,STEM,Hispanic,590,5,Consulting,,0


In [47]:
# Generate our categorical variable lists and check the number of unique values in each column

data_cat = df.dtypes[df.dtypes == 'object'].index.tolist()
df[data_cat].nunique()

gender            2
major             3
race              5
work_industry    14
admission         2
dtype: int64

In [48]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse_output=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[data_cat]))

# Add the encoded variable names to the dataframe
encode_df.columns = enc.get_feature_names_out(data_cat)

encode_df.head()

Unnamed: 0,gender_Female,gender_Male,major_Business,major_Humanities,major_STEM,race_Asian,race_Black,race_Hispanic,race_Other,race_White,...,work_industry_Media/Entertainment,work_industry_Nonprofit/Gov,work_industry_Other,work_industry_PE/VC,work_industry_Real Estate,work_industry_Retail,work_industry_Technology,admission_Admit,admission_Waitlist,admission_nan
0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [49]:
# Merge one-hot encoded features and drop the originals
df = df.merge(encode_df,left_index=True, right_index=True)
df = df.drop(data_cat, axis=1)
df.head()

Unnamed: 0,international,gpa,gmat,work_exp,admit_cluster,gender_Female,gender_Male,major_Business,major_Humanities,major_STEM,...,work_industry_Media/Entertainment,work_industry_Nonprofit/Gov,work_industry_Other,work_industry_PE/VC,work_industry_Real Estate,work_industry_Retail,work_industry_Technology,admission_Admit,admission_Waitlist,admission_nan
1,False,3.3,620,3,2,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,False,3.28,680,5,0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,True,3.3,710,5,2,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,False,3.47,690,6,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,False,3.35,590,5,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [50]:
#scale gpa, gmat, work_exp
scaled_data = StandardScaler().fit_transform(
    df[["gpa", "gmat", "work_exp"]]
)

In [51]:
#add scaled data to df
df_scaled = pd.DataFrame(
    scaled_data,
    columns=["gpa_scaled", "gmat_scaled", "work_exp_scaled"]
)
df_scaled.head()

Unnamed: 0,gpa_scaled,gmat_scaled,work_exp_scaled
0,0.325213,-0.630759,-1.953905
1,0.193235,0.586406,-0.01658
2,0.325213,1.194989,-0.01658
3,1.447022,0.789267,0.952083
4,0.655157,-1.239341,-0.01658


In [54]:
scaled_admissions_df = df_scaled.merge(df, left_index=True, right_index=True)
scaled_admissions_df = scaled_admissions_df.drop(columns=['gpa', 'gmat', 'work_exp'])
scaled_admissions_df.head()

Unnamed: 0,gpa_scaled,gmat_scaled,work_exp_scaled,international,admit_cluster,gender_Female,gender_Male,major_Business,major_Humanities,major_STEM,...,work_industry_Media/Entertainment,work_industry_Nonprofit/Gov,work_industry_Other,work_industry_PE/VC,work_industry_Real Estate,work_industry_Retail,work_industry_Technology,admission_Admit,admission_Waitlist,admission_nan
1,0.193235,0.586406,-0.01658,False,2,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.325213,1.194989,-0.01658,False,0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,1.447022,0.789267,0.952083,True,2,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.655157,-1.239341,-0.01658,False,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,-0.466653,-0.833619,0.952083,False,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [58]:
import hvplot.pandas

In [59]:

scaled_admissions_df.hvplot.scatter(
    x="gpa_scaled",
    y="gmat_scaled",
    by="admit_cluster"
)

PCA - reduce features

In [61]:
#Reduce number of features to two principal components
#create PCA model instance where n_components = 2
pca = PCA(n_components=2)

In [62]:
#fit scaled_admissions_df data to the PCA
admissions_pca = pca.fit_transform(scaled_admissions_df)

admissions_pca[:5]

array([[ 0.52224263, -0.04715818],
       [ 1.23523242, -0.01486924],
       [ 1.53434008,  0.92920451],
       [-0.44611008, -0.01305631],
       [-0.94318034,  0.95319181]])

In [63]:
#calculate explained variance
pca.explained_variance_ratio_

array([0.23915698, 0.14789208])

In [64]:
#creating a DF with PCA data
df_admits_PCA = pd.DataFrame(admissions_pca, columns=["PCA1", "PCA2"])

df_admits_PCA.head()

Unnamed: 0,PCA1,PCA2
0,0.522243,-0.047158
1,1.235232,-0.014869
2,1.53434,0.929205
3,-0.44611,-0.013056
4,-0.94318,0.953192


In [66]:
 # Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=3)

# Fit the model for the df_admitss_pca DataFrame
model.fit(df_admits_PCA)

# Predict the model segments (clusters)
admit_clusters = model.predict(df_admits_PCA)

# Print the admit segments
print(admit_clusters)

[0 0 0 ... 1 0 1]


In [75]:
# Create a copy of the df_admits_pca DataFrame and name it as df_admitss_pca_predictions
df_admits_pca_predictions = df_admits_PCA.copy()

# Create a new column in the DataFrame with the predicted clusters
df_admits_pca_predictions["AdmitCluster"] = admit_clusters
df_admits_pca_predictions["AdmitCluster_target"] = df["admit_cluster"]

# Review the DataFrame
df_admits_pca_predictions.head()

Unnamed: 0,PCA1,PCA2,AdmitCluster,AdmitCluster_target
0,0.522243,-0.047158,0,
1,1.235232,-0.014869,0,2.0
2,1.53434,0.929205,0,0.0
3,-0.44611,-0.013056,1,2.0
4,-0.94318,0.953192,2,0.0


In [91]:
 # Create the scatter plot with x="PC1" and y="PC2"
plot1= df_admits_pca_predictions.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="AdmitCluster",
    title = "Scatter Plot by Admit Segment - PCA=2")
hvplot.save(plot1, "PCA_2Segments.html")

In [92]:
 # Create the scatter plot with x="PC1" and y="PC2"
plot2 = df_admits_pca_predictions.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="AdmitCluster_target",
    title = "Scatter Plot by Admit Segment - PCA=2")
hvplot.save(plot2, "PCA_2segments_actuals.html")

PCA - more features

In [77]:
#Reduce number of features to four principal components
#create PCA model instance where n_components = 4
pca = PCA(n_components=4)

In [78]:
#fit scaled_admissions_df data to the PCA
admissions_pca = pca.fit_transform(scaled_admissions_df)

admissions_pca[:5]

array([[ 0.52224263, -0.04715818,  1.81084189, -0.12544708],
       [ 1.23523242, -0.01486924, -0.46814126,  1.06103383],
       [ 1.53434008,  0.92920451,  1.6950363 , -0.32995726],
       [-0.44611008, -0.01305631, -0.32292826, -0.79727863],
       [-0.94318034,  0.95319181, -0.26434281, -0.60235422]])

In [79]:
#calculate explained variance
pca.explained_variance_ratio_

array([0.23915698, 0.14789208, 0.07516895, 0.06946059])

In [81]:
#creating a DF with PCA data
df_admits_PCA = pd.DataFrame(admissions_pca, columns=["PCA1", "PCA2","PCA3","PCA4"])

df_admits_PCA.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4
0,0.522243,-0.047158,1.810842,-0.125447
1,1.235232,-0.014869,-0.468141,1.061034
2,1.53434,0.929205,1.695036,-0.329957
3,-0.44611,-0.013056,-0.322928,-0.797279
4,-0.94318,0.953192,-0.264343,-0.602354


In [82]:
 # Initialize the K-Means model with n_clusters=3
model = KMeans(n_clusters=3)

# Fit the model for the df_admitss_pca DataFrame
model.fit(df_admits_PCA)

# Predict the model segments (clusters)
admit_clusters = model.predict(df_admits_PCA)

# Print the admit segments
print(admit_clusters)

[0 0 0 ... 1 0 2]


In [83]:
# Create a copy of the df_admits_pca DataFrame and name it as df_admitss_pca_predictions
df_admits_pca_predictions = df_admits_PCA.copy()

# Create a new column in the DataFrame with the predicted clusters
df_admits_pca_predictions["AdmitCluster"] = admit_clusters
df_admits_pca_predictions["AdmitCluster_target"] = df["admit_cluster"]

# Review the DataFrame
df_admits_pca_predictions.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,AdmitCluster,AdmitCluster_target
0,0.522243,-0.047158,1.810842,-0.125447,0,
1,1.235232,-0.014869,-0.468141,1.061034,0,2.0
2,1.53434,0.929205,1.695036,-0.329957,0,0.0
3,-0.44611,-0.013056,-0.322928,-0.797279,2,2.0
4,-0.94318,0.953192,-0.264343,-0.602354,2,0.0


In [93]:
 # Create the scatter plot with x="PC1" and y="PC3"
plot3= df_admits_pca_predictions.hvplot.scatter(
    x="PCA1",
    y="PCA3",
    by="AdmitCluster",
    title = "Scatter Plot by Admit Segment - PCA=4")
hvplot.save(plot3, "PCA_4seg.html")

In [94]:
 # Create the scatter plot with x="PC1" and y="PC3"
plot4 = df_admits_pca_predictions.hvplot.scatter(
    x="PCA1",
    y="PCA3",
    by="AdmitCluster_target",
    title = "Scatter Plot by Admit Segment - PCA=2")
hvplot.save(plot4, "PCA_4seg_actuals.html")