Import libraries and Read data

In [None]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Import data from Google Drive.
# The project directory is anchored at the absolute mount point so that loading
# does not depend on the notebook's current working directory.
from google.colab import drive
drive.mount('/content/drive')
project_dir = '/content/drive/MyDrive/Capstone/'

# A list of sheet names makes read_excel return a dict keyed by sheet name.
sheet_names = ['2022', '2021', '2020', '2019', '3_years', 'feature_engineer']
dataset = pd.read_excel(project_dir + 'FT100.xlsx', sheet_name=sheet_names)

# df is the 'feature_engineer' sheet; the other sheets stay reachable through `dataset`.
df = dataset['feature_engineer']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Select the target (y) and the features (X)

In [None]:
# Target (y): the '#' column, kept under the name `rank`.
# The model-training cells assign it to `y`.
# NOTE(review): '#' is presumably the ranking position of each school; confirm against the sheet.
rank = df['#']

# Candidate features (X): column names that the model-training cells select with df[metrics].
# Lines that are commented out inside the list are columns deliberately left out.
metrics = [
    'Overall satisfaction',
    'Career service rank',
    'Faculty with doctorates (%)',
    'Women on board (%)',
    'International course experience rank',
    'International faculty (%)',
    'International mobility rank',
    'Employed in three months',
    'Career progress rank',
    'Female students (%)',
    'Female faculty (%)',
    'Value for money rank',
    'International students (%)',
    'Internships (%)',
    'International board (%)',
    'Weighted salary (US$)',
    'Aims achieved (%)',
    'Company internships (%)',
    'Salary percentage increase',
    'Salary today (US$)',
    'Average course length (months)',
    # excluded: 'Overall satisfaction Growth Rate'
    'Overall satisfaction Initial-Final Difference',
    'Weighted Salary Binary',
    # excluded: 'Weighted Salary Category'
    'Gender Diversity Score']
# also excluded: 'Overall satisfaction Z-score'

# A 15-column subset of `metrics`. It lists the same columns, in the same order, as the
# "Selected features (Filter Method)" output of the Feature Selection cell, so it was
# presumably copied from there. None of the cells visible here reads `selection_1`.
selection_1 = ['Faculty with doctorates (%)', 'Women on board (%)',
       'International course experience rank', 'International faculty (%)',
       'International mobility rank', 'Career progress rank',
       'Female students (%)', 'Female faculty (%)', 'Value for money rank',
       'International students (%)', 'Weighted salary (US$)',
       'Aims achieved (%)', 'Salary today (US$)',
       'Average course length (months)', 'Gender Diversity Score']

Train Regression Models

a) Gradient Boosting

In [None]:
# Import necessary libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Target is the rank column; predictors are the candidate metric columns.
y = rank
X = df[metrics]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# A StandardScaler is fitted on the training features, but its transform is NOT
# applied below: the regressor is trained and evaluated on the raw values.
# Tree-based boosting splits on per-feature thresholds, so rescaling is not needed
# for this model. `scaler` is kept only so that any other cell that references it
# keeps working.
scaler = StandardScaler()
scaler.fit(X_train)

# Initialize and train the GradientBoostingRegressor on the unscaled training data.
gb_regressor = GradientBoostingRegressor(n_estimators=468, learning_rate=0.1, max_depth=4, random_state=42)
gb_regressor.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = gb_regressor.predict(X_test)

# Evaluation metrics on the test split.
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived from MSE instead of passing `squared=False`: that keyword was
# deprecated in scikit-learn 1.4 and is scheduled for removal, and the XGBoost cell
# computes RMSE the same way.
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R^2 Score: {r2}")

Mean Squared Error (MSE): 390.95258115402044
Root Mean Squared Error (RMSE): 19.77252085987066
Mean Absolute Error (MAE): 15.77270922925991
R^2 Score: 0.5453148627482003


b) XGBoost (Extreme Gradient Boosting)

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Features are the candidate metric columns; the target is the rank column.
X = df[metrics]
y = rank

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Configure the XGBoost regressor, then train it on the training split.
xgb_regressor = xgb.XGBRegressor(
    n_estimators=486,
    learning_rate=0.1,
    max_depth=4,
    random_state=42,
)
xgb_regressor.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = xgb_regressor.predict(X_test)

# Error metrics on the test split; RMSE is the square root of MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics, one per line.
report_lines = [
    f"Mean Squared Error (MSE): {mse}",
    f"Root Mean Squared Error (RMSE): {rmse}",
    f"Mean Absolute Error (MAE): {mae}",
    f"R^2 Score: {r2}",
]
print(*report_lines, sep="\n")

Mean Squared Error (MSE): 388.7069206679063
Root Mean Squared Error (RMSE): 19.715651667340502
Mean Absolute Error (MAE): 16.5591449244269
R^2 Score: 0.5479266077412523


Feature Selection

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# The target `y` is the rank that the model cells treat as a regression target, so
# feature selection uses regression scorers and estimators. Classification tools would
# treat every distinct rank as its own class. The regression counterparts are imported
# here so the cell is self-contained.
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Split dataset into training and testing sets (same seed and ratio as the model cells).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Number of features to keep, capped at the number of available columns.
k = min(15, X.shape[1])

# Feature selection with Filter Method
# SelectKBest keeps the k features with the highest univariate F-statistic against y.
select_k_best = SelectKBest(f_regression, k=k)
X_train_selected = select_k_best.fit_transform(X_train, y_train)
X_test_selected = select_k_best.transform(X_test)

# Get selected feature names
selected_features = X.columns[select_k_best.get_support()]
print("Selected features (Filter Method):", selected_features)

# Feature selection with Wrapper Method
# Recursive Feature Elimination (RFE) with a linear regression estimator.
# RFE ranks features by coefficient magnitude, so the features are standardised first
# (scaler fitted on the training split only) to make the coefficients comparable.
rfe_scaler = StandardScaler().fit(X_train)
X_train_std = pd.DataFrame(
    rfe_scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
rfe_selector = RFE(estimator=LinearRegression(), n_features_to_select=k, step=1)
rfe_selector = rfe_selector.fit(X_train_std, y_train)
# transform() only selects columns, so these hold the original (unscaled) values.
X_train_rfe = rfe_selector.transform(X_train)
X_test_rfe = rfe_selector.transform(X_test)

# Get selected feature names
selected_rfe_features = X.columns[rfe_selector.get_support()]
print("Selected features (Wrapper Method):", selected_rfe_features)

# Feature selection with Embedded Method
# Impurity-based feature importances from a random forest regressor.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Get feature importances, most important first
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Select top k features
selected_features_embedded = X.columns[indices[:k]]
print("Selected features (Embedded Method):", selected_features_embedded)

# You can now use X_train_selected, X_train_rfe, or the top k features from the random forest for training your models

Selected features (Filter Method): Index(['Faculty with doctorates (%)', 'Women on board (%)',
       'International course experience rank', 'International faculty (%)',
       'International mobility rank', 'Career progress rank',
       'Female students (%)', 'Female faculty (%)', 'Value for money rank',
       'International students (%)', 'Weighted salary (US$)',
       'Aims achieved (%)', 'Salary today (US$)',
       'Average course length (months)', 'Gender Diversity Score'],
      dtype='object')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Selected features (Wrapper Method): Index(['Career service rank', 'Faculty with doctorates (%)',
       'Women on board (%)', 'International course experience rank',
       'International faculty (%)', 'International mobility rank',
       'Employed in three months', 'Career progress rank',
       'Female students (%)', 'Female faculty (%)', 'Value for money rank',
       'International students (%)', 'International board (%)',
       'Salary percentage increase', 'Gender Diversity Score'],
      dtype='object')
Selected features (Embedded Method): Index(['International faculty (%)', 'International course experience rank',
       'Value for money rank', 'International board (%)',
       'Weighted salary (US$)', 'Average course length (months)',
       'Career progress rank', 'Female faculty (%)',
       'Salary percentage increase', 'Women on board (%)',
       'International mobility rank', 'Gender Diversity Score',
       'Aims achieved (%)', 'Career service rank', 'Employed in three