### Dependencies

In [1]:
# Install Modules
!pip install keras_tuner -q

# Load Dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import tensorflow as tf
import keras_tuner as kt
import warnings
import numpy as np

# Disable all warnings
warnings.filterwarnings("ignore")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m174.1/176.1 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## Preprocessing

### Load Data

In [2]:
def load_join(select, loaded):
    selected_df = pd.read_csv("https://www.dropbox.com/s/" + select + "?dl=1", index_col="review_index")
    if loaded is not None:
        selected_df = pd.merge(loaded, selected_df, on='review_index')
    return selected_df

In [3]:
load_metadata = True
load_ratings = True
load_opinions = True
load_descriptions = False
load_world_happiness_report = False

working_df = None
if load_metadata == True:
    load_index = "goqyfyeu2qvpsmo/metadata_df"
    working_df = load_join(load_index, working_df)
if load_ratings == True:
    load_index = "9j2j86xwqrmrljx/ratings_df.csv"
    working_df = load_join(load_index, working_df)
if load_opinions == True:
    load_index = "q8v9f6rbb4z12df/opinion_df.csv"
    working_df = load_join(load_index, working_df)
if load_descriptions == True:
    load_index = "hnp8sebleh6dzgt/descriptions_df.csv"
    working_df = load_join(load_index, working_df)
    
if load_world_happiness_report == True:
    WHR_df = pd.read_csv("https://www.dropbox.com/s/jyr4e7fleevrb1s/WHR2023.csv?dl=1")

In [4]:
unedited_working_df = working_df.copy()
working_df = working_df.drop(['firm', 'date_review', 'job_title', 'location'], axis=1)
working_df.head(3)

Unnamed: 0_level_0,current,overall_rating,work_life_balance,culture_values,diversity_inclusion,career_opp,comp_benefits,senior_mgmt,recommend,ceo_approv,outlook
review_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Current Employee,2,4.0,3.0,,2.0,3.0,3.0,x,o,r
1,"Current Employee, more than 1 year",2,3.0,1.0,,2.0,1.0,4.0,x,o,r
2,"Current Employee, less than 1 year",1,1.0,1.0,,1.0,1.0,1.0,x,o,x


### Data Cleaning

#### Cleaning Data Attempt - Drop Less Columns

In [162]:
preprocessing_df = unedited_working_df.copy()
preprocessing_df = preprocessing_df.drop(['date_review', 'job_title', 'location'], axis=1)

In [163]:
preprocessing_df.nunique()

firm                   428
current                 29
overall_rating           5
work_life_balance        5
culture_values           5
diversity_inclusion      5
career_opp               5
comp_benefits            5
senior_mgmt              5
recommend                3
ceo_approv               4
outlook                  4
dtype: int64

In [164]:
cutoff_value = 2000
firms_to_replace = preprocessing_df['firm'].value_counts()[preprocessing_df['firm'].value_counts() < cutoff_value].index.tolist()

# Replace in dataframe
for firm in firms_to_replace:
    preprocessing_df['firm'] = preprocessing_df['firm'].replace(firm,"Other")
    
# Check to make sure binning was successful
print(preprocessing_df['firm'].value_counts())
print("These ar how many samples that got 'firm' changed to Other: ", len(preprocessing_df[preprocessing_df['firm'] == "Other"]))
print("Number of firms removed: ", len(firms_to_replace))

Other                 132211
IBM                    60436
McDonald-s             49450
Deloitte               46995
EY                     34050
                       ...  
Bayer                   2441
BDO                     2422
The-Salvation-Army      2356
Accenture               2156
Co-op                   2065
Name: firm, Length: 77, dtype: int64
These ar how many samples that got 'firm' changed to Other:  132211
Number of firms removed:  352


In [165]:
# Cutoff value to make a list of values in the 'current' column to put into "other" catagory
cutoff_value = 10000
current_to_replace = preprocessing_df['current'].value_counts()[preprocessing_df['current'].value_counts() < cutoff_value].index.tolist()

# Replace in dataframe
for review in current_to_replace:
    preprocessing_df['current'] = preprocessing_df['current'].replace(review,"Other")

# Check to make sure binning was successful
preprocessing_df['current'].value_counts()

Current Employee                        209599
Former Employee                         146133
Current Employee, more than 1 year       82749
Current Employee, more than 3 years      66471
Former Employee, more than 1 year        65687
Current Employee, less than 1 year       49603
Former Employee, more than 3 years       43614
Former Employee, less than 1 year        41874
Current Employee, more than 5 years      40155
Current Employee, more than 10 years     25029
Former Employee, more than 5 years       23017
Current Employee, more than 8 years      18506
Former Employee, more than 10 years      15411
Former Employee, more than 8 years       10686
Other                                       32
Name: current, dtype: int64

In [166]:
print("Columns before processing: ", len(preprocessing_df))
columns_to_convert = ['overall_rating', 'work_life_balance', 'culture_values',
                      'diversity_inclusion', 'career_opp', 'comp_benefits',
                      'senior_mgmt']

preprocessing_df[columns_to_convert] = preprocessing_df[columns_to_convert].astype(float)
preprocessing_df.dropna(subset=columns_to_convert, inplace=True)
print("Column after dropping NaN values: ", len(preprocessing_df))

Columns before processing:  838566
Column after dropping NaN values:  133863


In [167]:
X = pd.get_dummies(preprocessing_df.drop('overall_rating', axis = 1)).drop(['recommend_o', 'ceo_approv_o', 'outlook_o'], axis = 1)
y = pd.get_dummies(preprocessing_df['overall_rating'].astype(int))

In [168]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled.shape

(100397, 106)

#### Cleaning Data Attempt - Half Hot-Ended Remove NaN

In [5]:
working_half_hotend_df = unedited_working_df.copy()
working_half_hotend_df = working_half_hotend_df.drop(['date_review', 'job_title', 'location'], axis=1)

columns_to_convert = ['overall_rating', 'work_life_balance', 'culture_values',
                      'diversity_inclusion', 'career_opp', 'comp_benefits',
                      'senior_mgmt']

working_half_hotend_df[columns_to_convert] = working_half_hotend_df[columns_to_convert].astype(float)

In [6]:
cutoff_value = 3000
firms_to_replace = working_half_hotend_df['firm'].value_counts()[working_half_hotend_df['firm'].value_counts() < cutoff_value].index.tolist()

# Replace in dataframe
for firm in firms_to_replace:
    working_half_hotend_df['firm'] = working_half_hotend_df['firm'].replace(firm,"Other")
    
# Check to make sure binning was successful
print(working_half_hotend_df['firm'].value_counts())
print("These ar how many samples that got 'firm' changed to Other: ", len(working_half_hotend_df[working_half_hotend_df['firm'] == "Other"]))
print("Number of firms removed: ", len(firms_to_replace))
print("Remaining number of firms: ", len(working_half_hotend_df['firm'].value_counts()))

Other                      178782
IBM                         60436
McDonald-s                  49450
Deloitte                    46995
EY                          34050
PwC                         33227
Oracle                      31941
Microsoft                   26675
J-P-Morgan                  25814
KPMG                        24815
Apple                       20797
Citi                        18726
Google                      15995
SAP                         14344
HSBC-Holdings               13893
Tesco                       12149
Marriott-International      10409
Barclays                     9710
Thomson-Reuters              9553
American-Express             9349
Morgan-Stanley               9093
Goldman-Sachs                8808
Vodafone                     8321
Salesforce                   8234
Pizza-Hut                    7592
BNY-Mellon                   6630
Deutsche-Bank                6388
Hilton                       6155
J-Sainsbury                  5925
GlaxoSmithKlin

In [7]:
# Cutoff value to make a list of values in the 'current' column to put into "other" catagory
cutoff_value = 10
current_to_replace = working_half_hotend_df['current'].value_counts()[working_half_hotend_df['current'].value_counts() < cutoff_value].index.tolist()

# Replace in dataframe
for review in current_to_replace:
    working_half_hotend_df['current'] = working_half_hotend_df['current'].replace(review,"Other")

# Check to make sure binning was successful
working_half_hotend_df['current'].value_counts()

Current Employee                        209599
Former Employee                         146133
Current Employee, more than 1 year       82749
Current Employee, more than 3 years      66471
Former Employee, more than 1 year        65687
Current Employee, less than 1 year       49603
Former Employee, more than 3 years       43614
Former Employee, less than 1 year        41874
Current Employee, more than 5 years      40155
Current Employee, more than 10 years     25029
Former Employee, more than 5 years       23017
Current Employee, more than 8 years      18506
Former Employee, more than 10 years      15411
Former Employee, more than 8 years       10686
Other                                       32
Name: current, dtype: int64

In [8]:
print("Columns before processing: ", len(working_half_hotend_df))
working_half_hotend_df.dropna(subset=columns_to_convert, inplace=True)
print("Column after dropping NaN values: ", len(working_half_hotend_df))
working_half_hotend_df.nunique()

Columns before processing:  838566
Column after dropping NaN values:  133863


firm                   59
current                15
overall_rating          5
work_life_balance       5
culture_values          5
diversity_inclusion     5
career_opp              5
comp_benefits           5
senior_mgmt             5
recommend               3
ceo_approv              4
outlook                 4
dtype: int64

In [9]:
X = pd.get_dummies(working_half_hotend_df.drop('overall_rating', axis = 1)).drop(['recommend_o', 'ceo_approv_o', 'outlook_o'], axis = 1)
y = pd.get_dummies(working_half_hotend_df['overall_rating'].astype(int))

X.columns

Index(['work_life_balance', 'culture_values', 'diversity_inclusion',
       'career_opp', 'comp_benefits', 'senior_mgmt', 'firm_ASDA',
       'firm_American-Express', 'firm_Aon', 'firm_Apple', 'firm_AstraZeneca',
       'firm_BNP-Paribas', 'firm_BNY-Mellon', 'firm_BT',
       'firm_Bain-and-Company', 'firm_Barclays', 'firm_Bloomberg-L-P',
       'firm_Booking-com', 'firm_Boots', 'firm_Boston-Consulting-Group',
       'firm_CBRE', 'firm_Capita', 'firm_Citi', 'firm_Deloitte',
       'firm_Deutsche-Bank', 'firm_EY', 'firm_GlaxoSmithKline',
       'firm_Goldman-Sachs', 'firm_Google', 'firm_Grant-Thornton',
       'firm_HSBC-Holdings', 'firm_Hays', 'firm_Hilton', 'firm_Hyatt',
       'firm_IBM', 'firm_J-P-Morgan', 'firm_J-Sainsbury', 'firm_KPMG',
       'firm_LinkedIn', 'firm_Lloyds-Banking-Group', 'firm_Marks-and-Spencer',
       'firm_Marriott-International', 'firm_McDonald-s',
       'firm_McKinsey-and-Company', 'firm_Mercer', 'firm_Microsoft',
       'firm_Morgan-Stanley', 'firm_NHS', '

In [10]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled.shape

(100397, 88)

#### Cleaning Data Attempt - PCA

In [255]:
# Create PCA Model
pca_model = PCA(n_components=25)

In [256]:
pca_model = pca_model.fit(X_train_scaled)

X_train_scaled = pca_model.transform(X_train_scaled)
X_test_scaled = pca_model.transform(X_test_scaled)

# Training DataFrame with the PCA components
pca_df = pd.DataFrame(data=X_train_scaled, columns=['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 
                                                'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20',
                                                'PC21', 'PC22', 'PC23', 'PC24', 'PC25'])#, 'PC26', 'PC27', 'PC28', 'PC29'])#, 'PC30'])
                                              #  'PC31', 'PC32', 'PC33', 'PC34', 'PC35'])

# Retrieve the explained variance from the PCA model
explained_variance = pca_model.explained_variance_ratio_
Scope = 0

# Print the explained variance for each principal component
for i, variance in enumerate(explained_variance):
    Scope = Scope + variance
    print(f"Explained Variance PC{i+1}: {variance:.4f}")

print(f"{Scope * 100:.2f}%")

Explained Variance PC1: 0.1108
Explained Variance PC2: 0.0630
Explained Variance PC3: 0.0426
Explained Variance PC4: 0.0403
Explained Variance PC5: 0.0301
Explained Variance PC6: 0.0261
Explained Variance PC7: 0.0238
Explained Variance PC8: 0.0224
Explained Variance PC9: 0.0223
Explained Variance PC10: 0.0215
Explained Variance PC11: 0.0208
Explained Variance PC12: 0.0205
Explained Variance PC13: 0.0204
Explained Variance PC14: 0.0203
Explained Variance PC15: 0.0202
Explained Variance PC16: 0.0200
Explained Variance PC17: 0.0198
Explained Variance PC18: 0.0198
Explained Variance PC19: 0.0195
Explained Variance PC20: 0.0193
Explained Variance PC21: 0.0192
Explained Variance PC22: 0.0191
Explained Variance PC23: 0.0191
Explained Variance PC24: 0.0189
Explained Variance PC25: 0.0187
69.86%


## NN Attempt - Compile, Train, & Evaluate Default Model

In [11]:
# Define the model
nn_default = tf.keras.models.Sequential()

# Add the masking layer
#nn_default.add(tf.keras.layers.Masking(mask_value=10, input_shape=(X_train_scaled.shape[1],)))

nn_default.add(tf.keras.layers.Dense(units=100, activation='relu', input_dim=X_train_scaled.shape[1]))

nn_default.add(tf.keras.layers.Dense(units=80, activation='relu'))

# Add the output layer
nn_default.add(tf.keras.layers.Dense(units=5, activation='sigmoid'))

# Check the structure of the model
nn_default.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               8900      
                                                                 
 dense_1 (Dense)             (None, 80)                8080      
                                                                 
 dense_2 (Dense)             (None, 5)                 405       
                                                                 
Total params: 17,385
Trainable params: 17,385
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Compile the model
nn_default.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

# Train the model
history = nn_default.fit(X_train_scaled, y_train, epochs=20, verbose=1, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [13]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_default.evaluate(X_test_scaled,y_test,verbose=1, batch_size=64)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Loss: 0.0911693423986435, Accuracy: 0.6613278985023499


In [14]:
predictions = pd.DataFrame(nn_default.predict(X_test_scaled))



In [15]:
predictions = predictions.rename(columns={0: 1, 1: 2, 2: 3, 3: 4, 4: 5})
predictions['y_predict'] = predictions.idxmax(axis=1)
predictions['y_actual'] = (y_test.idxmax(axis=1)).reset_index(drop=True)
predictions[1:10]

Unnamed: 0,1,2,3,4,5,y_predict,y_actual
1,9.6e-05,7.2e-05,0.030257,0.676589,0.272983,4,4
2,0.00019,0.001124,0.097513,0.572497,0.337251,4,5
3,0.000503,0.010779,0.11124,0.66936,0.147367,4,1
4,0.943163,0.073102,0.010667,0.00224,0.010728,1,1
5,0.000293,0.00042,0.01649,0.752442,0.235446,4,4
6,0.001074,0.022121,0.46064,0.378533,0.019041,3,3
7,0.89061,0.12006,0.017462,0.009019,0.023141,1,1
8,0.000236,0.000867,0.008928,0.592259,0.381511,4,5
9,0.000884,0.000271,0.068241,0.719579,0.187216,4,4


In [16]:
difference = 0

for index, row in predictions.iterrows():
    # Get the absolute difference between 'y_predict' and 'y_actual' columns
    diff = abs(row['y_predict'] - row['y_actual'])
    
    # Add the difference to the total
    difference += diff

difference = difference / len(predictions)
print("Mean points delta actual value: ", difference)

Mean points delta actual value:  0.3722285304488137


Hot Ended - Mean points delta actual value:  0.43129239369973577

Half Hot Ended Remove NaN - Mean points delta actual value:  0.36371242455028985

Half Hot Ended Mask NaN - Mean points delta actual value:  0.4334150599593593