### Dependencies

In [1]:
# Install Modules
!pip install keras_tuner -q

# Load Dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import tensorflow as tf
import keras_tuner as kt
import warnings
import numpy as np

# Disable all warnings
warnings.filterwarnings("ignore")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25h

## Preprocessing

### Load Data

In [16]:
def load_join(select, loaded):
    selected_df = pd.read_csv("https://www.dropbox.com/s/" + select + "?dl=1", index_col="review_index")
    if loaded is not None:
        selected_df = pd.merge(loaded, selected_df, on='review_index')
    return selected_df

In [17]:
load_metadata = True
load_ratings = True
load_opinions = True
load_descriptions = False
load_world_happiness_report = False

working_df = None
if load_metadata == True:
    load_index = "goqyfyeu2qvpsmo/metadata_df"
    working_df = load_join(load_index, working_df)
if load_ratings == True:
    load_index = "9j2j86xwqrmrljx/ratings_df.csv"
    working_df = load_join(load_index, working_df)
if load_opinions == True:
    load_index = "q8v9f6rbb4z12df/opinion_df.csv"
    working_df = load_join(load_index, working_df)
if load_descriptions == True:
    load_index = "hnp8sebleh6dzgt/descriptions_df.csv"
    working_df = load_join(load_index, working_df)
    
if load_world_happiness_report == True:
    WHR_df = pd.read_csv("https://www.dropbox.com/s/jyr4e7fleevrb1s/WHR2023.csv?dl=1")

In [18]:
unedited_working_df = working_df.copy()
working_df = working_df.drop(['firm', 'date_review', 'job_title', 'location'], axis=1)
working_df.head(3)

Unnamed: 0_level_0,current,overall_rating,work_life_balance,culture_values,diversity_inclusion,career_opp,comp_benefits,senior_mgmt,recommend,ceo_approv,outlook
review_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Current Employee,2,4.0,3.0,,2.0,3.0,3.0,x,o,r
1,"Current Employee, more than 1 year",2,3.0,1.0,,2.0,1.0,4.0,x,o,r
2,"Current Employee, less than 1 year",1,1.0,1.0,,1.0,1.0,1.0,x,o,x


### Data Cleaning

#### Cleaning Data Attempt - Drop Less Columns

In [112]:
preprocessing_df = unedited_working_df.copy()
preprocessing_df = preprocessing_df.drop(['date_review', 'location', 'job_title'], axis=1)

In [113]:
cutoff_value = 2000
firms_to_replace = preprocessing_df['firm'].value_counts()[preprocessing_df['firm'].value_counts() < cutoff_value].index.tolist()

# Replace in dataframe
for firm in firms_to_replace:
    preprocessing_df['firm'] = preprocessing_df['firm'].replace(firm,"Other")
    
# Check to make sure binning was successful
print(preprocessing_df['firm'].value_counts())
print("These ar how many samples that got 'firm' changed to Other: ", len(preprocessing_df[preprocessing_df['firm'] == "Other"]))
print("Number of firms removed: ", len(firms_to_replace))

Other                 132211
IBM                    60436
McDonald-s             49450
Deloitte               46995
EY                     34050
                       ...  
Bayer                   2441
BDO                     2422
The-Salvation-Army      2356
Accenture               2156
Co-op                   2065
Name: firm, Length: 77, dtype: int64
These ar how many samples that got 'firm' changed to Other:  132211
Number of firms removed:  352


In [114]:
# Cutoff value to make a list of values in the 'current' column to put into "other" catagory
cutoff_value = 10000
current_to_replace = preprocessing_df['current'].value_counts()[preprocessing_df['current'].value_counts() < cutoff_value].index.tolist()

# Replace in dataframe
for review in current_to_replace:
    preprocessing_df['current'] = preprocessing_df['current'].replace(review,"Other")

# Check to make sure binning was successful
preprocessing_df['current'].value_counts()

Current Employee                        209599
Former Employee                         146133
Current Employee, more than 1 year       82749
Current Employee, more than 3 years      66471
Former Employee, more than 1 year        65687
Current Employee, less than 1 year       49603
Former Employee, more than 3 years       43614
Former Employee, less than 1 year        41874
Current Employee, more than 5 years      40155
Current Employee, more than 10 years     25029
Former Employee, more than 5 years       23017
Current Employee, more than 8 years      18506
Former Employee, more than 10 years      15411
Former Employee, more than 8 years       10686
Other                                       32
Name: current, dtype: int64

In [115]:
print("Columns before processing: ", len(preprocessing_df))
columns_to_convert = ['overall_rating', 'work_life_balance', 'culture_values',
                      'diversity_inclusion', 'career_opp', 'comp_benefits',
                      'senior_mgmt']

preprocessing_df[columns_to_convert] = preprocessing_df[columns_to_convert].astype(float)
preprocessing_df.dropna(subset=columns_to_convert, inplace=True)
print("Column after dropping NaN values: ", len(preprocessing_df))

Columns before processing:  838566
Column after dropping NaN values:  133863


In [116]:
X = pd.get_dummies(preprocessing_df.drop('overall_rating', axis = 1)).drop(['recommend_o', 'ceo_approv_o', 'outlook_o'], axis = 1)
y = pd.get_dummies(preprocessing_df['overall_rating'].astype(int))

In [117]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled.shape

(100397, 106)

#### Cleaning Data Attempt - Hot-Ended Method

In [88]:
working_hotend_df = working_df.copy().astype(str)
working_hotend_df = working_hotend_df.reset_index(drop=True)

In [89]:
# Cutoff value to make a list of values in the 'current' column to put into "other" catagory
cutoff_value = 10000
current_to_replace = working_hotend_df['current'].value_counts()[working_hotend_df['current'].value_counts() < cutoff_value].index.tolist()

# Replace in dataframe
for review in current_to_replace:
    working_hotend_df['current'] = working_hotend_df['current'].replace(review,"Other")

# Check to make sure binning was successful
working_hotend_df['current'].value_counts()

Current Employee                        209599
Former Employee                         146133
Current Employee, more than 1 year       82749
Current Employee, more than 3 years      66471
Former Employee, more than 1 year        65687
Current Employee, less than 1 year       49603
Former Employee, more than 3 years       43614
Former Employee, less than 1 year        41874
Current Employee, more than 5 years      40155
Current Employee, more than 10 years     25029
Former Employee, more than 5 years       23017
Current Employee, more than 8 years      18506
Former Employee, more than 10 years      15411
Former Employee, more than 8 years       10686
Other                                       32
Name: current, dtype: int64

In [90]:
working_hotend_df.nunique()

current                15
overall_rating          5
work_life_balance       6
culture_values          6
diversity_inclusion     6
career_opp              6
comp_benefits           6
senior_mgmt             6
recommend               3
ceo_approv              4
outlook                 4
dtype: int64

In [91]:
X = pd.get_dummies(working_hotend_df.drop('overall_rating', axis = 1))
y = pd.get_dummies(working_hotend_df['overall_rating'].astype(int))

X = X.drop(['work_life_balance_nan', 'culture_values_nan', 'diversity_inclusion_nan', 'career_opp_nan', 
           'comp_benefits_nan', 'senior_mgmt_nan', 'recommend_o', 'ceo_approv_o', 'outlook_o'], axis = 1)
X.columns

Index(['current_Current Employee',
       'current_Current Employee, less than 1 year',
       'current_Current Employee, more than 1 year',
       'current_Current Employee, more than 10 years',
       'current_Current Employee, more than 3 years',
       'current_Current Employee, more than 5 years',
       'current_Current Employee, more than 8 years',
       'current_Former Employee', 'current_Former Employee, less than 1 year',
       'current_Former Employee, more than 1 year',
       'current_Former Employee, more than 10 years',
       'current_Former Employee, more than 3 years',
       'current_Former Employee, more than 5 years',
       'current_Former Employee, more than 8 years', 'current_Other',
       'work_life_balance_1.0', 'work_life_balance_2.0',
       'work_life_balance_3.0', 'work_life_balance_4.0',
       'work_life_balance_5.0', 'culture_values_1.0', 'culture_values_2.0',
       'culture_values_3.0', 'culture_values_4.0', 'culture_values_5.0',
       'diversity_

In [92]:
y.columns

Int64Index([1, 2, 3, 4, 5], dtype='int64')

In [93]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled.shape

(628924, 53)

#### Cleaning Data Attempt - Half Hot-Ended Remove NaN

In [5]:
working_half_hotend_df = working_df.copy()
print("Columns before processing: ", len(working_half_hotend_df))
columns_to_convert = ['overall_rating', 'work_life_balance', 'culture_values',
                      'diversity_inclusion', 'career_opp', 'comp_benefits',
                      'senior_mgmt']

working_half_hotend_df[columns_to_convert] = working_half_hotend_df[columns_to_convert].astype(float)
working_half_hotend_df.dropna(subset=columns_to_convert, inplace=True)
print("Column after dropping NaN values: ", len(working_half_hotend_df))

Columns before processing:  838566
Column after dropping NaN values:  133863


In [6]:
# Cutoff value to make a list of values in the 'current' column to put into "other" catagory
cutoff_value = 10
current_to_replace = working_half_hotend_df['current'].value_counts()[working_half_hotend_df['current'].value_counts() < cutoff_value].index.tolist()

# Replace in dataframe
for review in current_to_replace:
    working_half_hotend_df['current'] = working_half_hotend_df['current'].replace(review,"Other")

# Check to make sure binning was successful
working_half_hotend_df['current'].value_counts()

Current Employee                        27897
Current Employee, more than 1 year      16569
Former Employee                         14089
Former Employee, more than 1 year       13959
Current Employee, more than 3 years     13332
Former Employee, more than 3 years       8950
Former Employee, less than 1 year        8902
Current Employee, less than 1 year       7949
Current Employee, more than 5 years      7381
Former Employee, more than 5 years       4229
Current Employee, more than 8 years      3324
Current Employee, more than 10 years     3234
Former Employee, more than 8 years       2020
Former Employee, more than 10 years      2019
Other                                       9
Name: current, dtype: int64

In [7]:
working_half_hotend_df.nunique()

current                15
overall_rating          5
work_life_balance       5
culture_values          5
diversity_inclusion     5
career_opp              5
comp_benefits           5
senior_mgmt             5
recommend               3
ceo_approv              4
outlook                 4
dtype: int64

In [8]:
X = pd.get_dummies(working_half_hotend_df.drop('overall_rating', axis = 1)).drop(['recommend_o', 'ceo_approv_o', 'outlook_o'], axis = 1)
y = pd.get_dummies(working_half_hotend_df['overall_rating'].astype(int))

X.columns

Index(['work_life_balance', 'culture_values', 'diversity_inclusion',
       'career_opp', 'comp_benefits', 'senior_mgmt',
       'current_Current Employee',
       'current_Current Employee, less than 1 year',
       'current_Current Employee, more than 1 year',
       'current_Current Employee, more than 10 years',
       'current_Current Employee, more than 3 years',
       'current_Current Employee, more than 5 years',
       'current_Current Employee, more than 8 years',
       'current_Former Employee', 'current_Former Employee, less than 1 year',
       'current_Former Employee, more than 1 year',
       'current_Former Employee, more than 10 years',
       'current_Former Employee, more than 3 years',
       'current_Former Employee, more than 5 years',
       'current_Former Employee, more than 8 years', 'current_Other',
       'recommend_v', 'recommend_x', 'ceo_approv_r', 'ceo_approv_v',
       'ceo_approv_x', 'outlook_r', 'outlook_v', 'outlook_x'],
      dtype='object')

In [9]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled.shape

(100397, 29)

#### Cleaning Data Attempt - Half Hot-Ended Masking NaN

In [None]:
mask_working_df = working_df.copy()
print("Columns before processing: ", len(mask_working_df))
columns_to_convert = ['overall_rating', 'work_life_balance', 'culture_values',
                      'diversity_inclusion', 'career_opp', 'comp_benefits',
                      'senior_mgmt']
                      
mask_working_df[columns_to_convert] = mask_working_df[columns_to_convert].astype(float)

In [None]:
# Cutoff value to make a list of values in the 'current' column to put into "other" catagory
cutoff_value = 1000
current_to_replace = mask_working_df['current'].value_counts()[mask_working_df['current'].value_counts() < cutoff_value].index.tolist()

# Replace in dataframe
for review in current_to_replace:
    mask_working_df['current'] = mask_working_df['current'].replace(review,"Other")

# Check to make sure binning was successful
mask_working_df['current'].value_counts()

In [None]:
hot_end_columns = ['current', 'recommend', 'ceo_approv', 'outlook']
mask_working_df = (pd.get_dummies(mask_working_df, columns=hot_end_columns)).drop(['recommend_o', 'ceo_approv_o', 'outlook_o'], axis = 1)

In [None]:
X = mask_working_df.drop('overall_rating', axis = 1)
y = pd.get_dummies(mask_working_df['overall_rating'].astype(int))
X.head()

In [None]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled.shape

In [None]:
X_train_scaled = np.nan_to_num(X_train_scaled, nan=10)
X_test_scaled = np.nan_to_num(X_test_scaled, nan=10)

## NN Attempt - Compile, Train, & Evaluate Default Model

In [125]:
# Define the model
nn_default = tf.keras.models.Sequential()

# Add the masking layer
nn_default.add(tf.keras.layers.Masking(mask_value=10, input_shape=(X_train_scaled.shape[1],)))

# Add the first hidden layer
nn_default.add(tf.keras.layers.Dense(units=80, activation='relu'))

# Add the second hidden layer
nn_default.add(tf.keras.layers.Dense(units=30, activation='relu'))

# Add the output layer
nn_default.add(tf.keras.layers.Dense(units=5, activation='sigmoid'))

# Check the structure of the model
nn_default.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 masking_4 (Masking)         (None, 106)               0         
                                                                 
 dense_12 (Dense)            (None, 80)                8560      
                                                                 
 dense_13 (Dense)            (None, 30)                2430      
                                                                 
 dense_14 (Dense)            (None, 5)                 155       
                                                                 
Total params: 11,145
Trainable params: 11,145
Non-trainable params: 0
_________________________________________________________________


In [126]:
# Compile the model
nn_default.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

# Train the model
history = nn_default.fit(X_train_scaled, y_train, epochs=20, verbose=1, batch_size=64)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [127]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn_default.evaluate(X_test_scaled,y_test,verbose=1, batch_size=64)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Loss: 0.09097267687320709, Accuracy: 0.6620450615882874


In [128]:
predictions = pd.DataFrame(nn_default.predict(X_test_scaled))



In [129]:
predictions = predictions.rename(columns={0: 1, 1: 2, 2: 3, 3: 4, 4: 5})
predictions['y_predict'] = predictions.idxmax(axis=1)
predictions['y_actual'] = (y_test.idxmax(axis=1)).reset_index(drop=True)
predictions[1:10]

Unnamed: 0,1,2,3,4,5,y_predict,y_actual
1,2.4e-05,0.000458,0.031864,0.716882,0.253105,4,4
2,0.000817,0.001293,0.113885,0.605885,0.234679,4,5
3,0.006188,0.026284,0.329893,0.483585,0.116851,4,1
4,0.949283,0.057061,0.014787,0.00315,0.008702,1,1
5,3.9e-05,0.000104,0.011411,0.667435,0.298531,4,4
6,0.002738,0.211386,0.67188,0.136382,0.019643,3,3
7,0.907078,0.093326,0.028072,0.004121,0.009068,1,1
8,0.000246,0.000215,0.022103,0.508211,0.432774,4,5
9,0.000368,0.010831,0.047661,0.786832,0.158492,4,4


In [130]:
difference = 0

for index, row in predictions.iterrows():
    # Get the absolute difference between 'y_predict' and 'y_actual' columns
    diff = abs(row['y_predict'] - row['y_actual'])
    
    # Add the difference to the total
    difference += diff

difference = difference / len(predictions)
print("Mean points delta actual value: ", difference)

Mean points delta actual value:  0.37142174146895357


Hot Ended - Mean points delta actual value:  0.43129239369973577

Half Hot Ended Remove NaN - Mean points delta actual value:  0.36371242455028985

Half Hot Ended Mask NaN - Mean points delta actual value:  0.4334150599593593