In [None]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Import data from Google Drive
from google.colab import drive

# Keep the mount point in one place so the data path below cannot drift from it.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Absolute path built from the mount point: the previous relative
# 'drive/MyDrive/Capstone/' only resolved while the working directory was
# /content and would break after any chdir / %cd.
project_dir = DRIVE_MOUNT_POINT + '/MyDrive/Capstone/'

# Passing a list to sheet_name makes read_excel return a dict keyed by sheet name.
dataset = pd.read_excel(
    project_dir + 'FT100.xlsx',
    sheet_name=['2022', '2021', '2020', '2019', '3_years', 'feature_engineer'],
)

# Sheet that the PCA and modelling cells below work from.
df = dataset['feature_engineer']

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# PCA

In [None]:
# Columns fed into PCA. The name suggests they come from a wrapper-method
# feature-selection step -- presumably done elsewhere; confirm.
wrapper_method = ['Overall satisfaction', 'Career service rank', 'Women on board (%)',
                  'International course experience rank', 'International mobility rank',
                  'Employed in three months', 'Career progress rank',
                  'Female students (%)', 'Female faculty (%)',
                  'International students (%)', 'Weighted salary (US$)',
                  'Salary percentage increase', 'Salary today (US$)',
                  'Average course length (months)',
                  'Overall satisfaction Initial-Final Difference']

df_feature_engineer_numeric = df[wrapper_method]


# Standardise every feature (zero mean, unit variance) so that columns on a
# large scale, such as the US$ salaries, do not dominate the components.
scaler = StandardScaler()
df_feature_engineer_scaled = scaler.fit_transform(df_feature_engineer_numeric)

# Fraction of total variance the retained components must explain.
# A float in (0, 1) tells PCA to keep the smallest number of components whose
# cumulative explained variance exceeds this value. The previous comment said
# 95%, but the code has always used 97%.
PCA_VARIANCE_TO_KEEP = 0.97

# NOTE(review): the scaler and PCA are fit on every row of `df`. If these
# components are later evaluated on a held-out split, fit both on the training
# rows only to avoid leaking test-set statistics into the features.
pca = PCA(n_components=PCA_VARIANCE_TO_KEEP)
principalComponents = pca.fit_transform(df_feature_engineer_scaled)

# The number of components PCA needed to reach the variance threshold
num_components_chosen = pca.n_components_

# One row per sample, one (integer-labelled) column per principal component
principalDf = pd.DataFrame(data=principalComponents)

# Share of total variance explained by each retained component
explained_variance = pca.explained_variance_ratio_

# Output the results
num_components_chosen, explained_variance, principalDf.head()

(13,
 array([0.2268944 , 0.1627794 , 0.11978492, 0.09239857, 0.06931207,
        0.0609077 , 0.05713166, 0.04751026, 0.04112893, 0.03162673,
        0.02715361, 0.0196186 , 0.01726961]),
          0         1         2         3         4         5         6   \
 0 -0.771993  1.446668  0.660649 -0.433984 -0.539997 -0.385631 -0.195525   
 1 -0.713209  1.880793  0.549272 -0.560607 -0.476799 -0.519667 -0.135181   
 2 -0.577019  1.871547  0.548184 -1.068927 -0.314952 -1.359147  0.409005   
 3 -2.011504 -2.463478  1.433556 -1.883963  2.364383 -1.576133 -1.550156   
 4 -1.376468 -2.245742  0.976337 -0.643182  1.991077 -1.330490 -0.159350   
 
          7         8         9         10        11        12  
 0  0.357804 -0.104173 -0.238716 -0.376212 -0.379406 -0.440215  
 1  0.168490 -0.172250  0.190686 -0.268305 -0.672696 -0.413935  
 2  0.589203 -0.762907  0.092990 -0.322770 -0.539297 -0.574793  
 3  1.216869 -1.114916 -0.937483  0.388513  0.173719  0.199903  
 4  0.942947  0.256735 -0.3768

In [None]:
# Sanity check on the standardised array: pooled over all entries, the mean
# should be ~0 and the standard deviation ~1.
df_feature_engineer_scaled.mean(), df_feature_engineer_scaled.std()

(-3.266863152920001e-17, 1.0)

In [None]:
# Generic positional labels: feature0 .. feature{k-1}, one per scaled column.
feat_cols = [f'feature{idx}' for idx in range(df_feature_engineer_scaled.shape[1])]

# Wrap the scaled array in a DataFrame so it can be inspected as a table.
normalized_df = pd.DataFrame(data=df_feature_engineer_scaled, columns=feat_cols)
normalized_df

Unnamed: 0,feature0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14
0,-0.329938,0.571289,0.435547,-0.100204,0.713704,-0.537356,0.571289,-0.214539,-0.285249,-1.183416,-0.569563,-0.366649,-0.508139,0.532359,0.440756
1,-0.237581,0.820423,0.435547,-0.172043,1.033987,-0.545730,1.247509,-0.318233,-0.174953,-1.081740,-0.418785,-0.347124,-0.324913,0.532359,0.440756
2,0.062580,0.535698,0.435547,-0.315722,0.891639,-1.748414,1.389871,-0.110845,-0.505842,-1.217308,-0.253510,-0.611137,-0.193790,0.532359,0.440756
3,1.748096,-1.065874,0.435547,1.803544,1.496618,-2.619287,-1.208236,0.926094,0.597121,1.663514,-1.799742,-0.155551,-2.019343,-1.131068,-0.013831
4,0.432008,-1.208236,0.358996,1.300667,0.856052,-1.504528,-0.745560,1.859340,-0.285249,1.222918,-1.081532,0.878432,-1.139214,-0.853830,-0.441678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285,0.455097,-0.638788,0.052794,1.588025,0.251073,0.440805,-1.706503,1.340870,0.597121,1.494054,-0.305240,2.553689,-0.187909,-0.853830,-0.414937
286,-0.468473,-0.140521,-0.483061,1.444346,0.606943,0.394226,-1.670913,1.133482,0.707418,1.561838,-0.325263,2.279149,-0.193790,-0.853830,-0.414937
287,-1.138062,-1.065874,-0.942365,0.977389,0.784878,0.766859,-0.104931,0.303931,-0.947027,0.714537,-0.040675,-2.007892,0.136303,-0.853830,0.440756
288,0.709079,-0.745560,-0.942365,1.516186,-0.780951,-0.863410,-0.247293,-0.733009,-1.057323,0.138373,0.083100,-1.182743,0.287982,-0.784521,0.440756


Save PCA features in a DataFrame

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error, median_absolute_error


# `df` is the feature-engineered sheet and '#' is the column used as the target
# (presumably the ranking position -- confirm).
# `principalDf` is the DataFrame of principal components produced by the PCA cell.
#
# NOTE(review): the StandardScaler and PCA behind `principalDf` were fit on all
# rows, including the ones that land in the test split below, so the reported
# test metrics are optimistic. For an unbiased estimate, split first and fit
# the scaler/PCA on the training rows only (e.g. via sklearn.pipeline.Pipeline).
y = df['#']  # Target variable
X = principalDf  # Feature set: principal components

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBRegressor model
# Note: You might want to tune the hyperparameters for optimal performance.
xgb_model = XGBRegressor(
    objective='reg:squarederror',
    n_estimators=486,
    learning_rate=0.21,
    max_depth=3,
)

# Train the model with the training data
xgb_model.fit(X_train, y_train)

# Predict the target variable for the testing data
y_pred = xgb_model.predict(X_test)

# Evaluation metrics on the held-out test split
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived from the MSE instead of mean_squared_error(..., squared=False):
# the `squared` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
msle = mean_squared_log_error(y_test, y_pred)
medae = median_absolute_error(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")
print(f"Mean Squared Logarithmic Error (MSLE): {msle}")
print(f"Median Absolute Error: {medae}")


Mean Absolute Error (MAE): 9.328770789606818
Mean Squared Error (MSE): 135.98488966019744
Root Mean Squared Error (RMSE): 11.661255921220382
R-squared (R²): 0.8268186494647848
Mean Squared Logarithmic Error (MSLE): 0.1488183461818904
Median Absolute Error: 7.6631999015808105
