In [4]:
# Install Modules
!pip install keras_tuner -q

# Load Dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import tensorflow as tf
import keras_tuner as kt
import warnings

# Disable all warnings
warnings.filterwarnings("ignore")

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.1/176.1 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25h

## Preprocessing - Load Data

In [84]:
def load_join(select, loaded):
    selected_df = pd.read_csv("https://www.dropbox.com/s/" + select + "?dl=1", index_col="review_index")
    if loaded is not None:
        selected_df = pd.merge(loaded, selected_df, on='review_index')
    return selected_df

In [85]:
load_metadata = True
load_ratings = True
load_opinions = True
load_descriptions = False
load_world_happiness_report = False

working_df = None
if load_metadata == True:
    load_index = "goqyfyeu2qvpsmo/metadata_df"
    working_df = load_join(load_index, working_df)
if load_ratings == True:
    load_index = "9j2j86xwqrmrljx/ratings_df.csv"
    working_df = load_join(load_index, working_df)
if load_opinions == True:
    load_index = "q8v9f6rbb4z12df/opinion_df.csv"
    working_df = load_join(load_index, working_df)
if load_descriptions == True:
    load_index = "hnp8sebleh6dzgt/descriptions_df.csv"
    working_df = load_join(load_index, working_df)
    
if load_world_happiness_report == True:
    WHR_df = pd.read_csv("https://www.dropbox.com/s/jyr4e7fleevrb1s/WHR2023.csv?dl=1")

In [86]:
working_df = working_df.drop(['firm', 'date_review', 'job_title', 'location'], axis=1)
working_df.head(3)

Unnamed: 0_level_0,current,overall_rating,work_life_balance,culture_values,diversity_inclusion,career_opp,comp_benefits,senior_mgmt,recommend,ceo_approv,outlook
review_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Current Employee,2,4.0,3.0,,2.0,3.0,3.0,x,o,r
1,"Current Employee, more than 1 year",2,3.0,1.0,,2.0,1.0,4.0,x,o,r
2,"Current Employee, less than 1 year",1,1.0,1.0,,1.0,1.0,1.0,x,o,x


## Preprocessing - Cleaning Data

In [87]:
working_df = working_df.astype(str)

In [88]:
# Cutoff value to make a list of values in the 'current' column to put into "other" catagory
cutoff_value = 10000
current_to_replace = working_df['current'].value_counts()[working_df['current'].value_counts() < cutoff_value].index.tolist()

# Replace in dataframe
for review in current_to_replace:
    working_df['current'] = working_df['current'].replace(review,"Other")

# Check to make sure binning was successful
working_df['current'].value_counts()

Current Employee                        209599
Former Employee                         146133
Current Employee, more than 1 year       82749
Current Employee, more than 3 years      66471
Former Employee, more than 1 year        65687
Current Employee, less than 1 year       49603
Former Employee, more than 3 years       43614
Former Employee, less than 1 year        41874
Current Employee, more than 5 years      40155
Current Employee, more than 10 years     25029
Former Employee, more than 5 years       23017
Current Employee, more than 8 years      18506
Former Employee, more than 10 years      15411
Former Employee, more than 8 years       10686
Other                                       32
Name: current, dtype: int64

In [113]:
working_df.nunique()

current                15
overall_rating          5
work_life_balance       6
culture_values          6
diversity_inclusion     6
career_opp              6
comp_benefits           6
senior_mgmt             6
recommend               3
ceo_approv              4
outlook                 4
dtype: int64

In [115]:
X = pd.get_dummies(working_df.drop('overall_rating', axis = 1))
y = pd.get_dummies(working_df['overall_rating'].astype(int))

X = X.drop(['work_life_balance_nan', 'culture_values_nan', 'diversity_inclusion_nan', 'career_opp_nan', 
           'comp_benefits_nan', 'senior_mgmt_nan', 'recommend_o', 'ceo_approv_o', 'outlook_o'], axis = 1)
X.columns

Index(['current_Current Employee',
       'current_Current Employee, less than 1 year',
       'current_Current Employee, more than 1 year',
       'current_Current Employee, more than 10 years',
       'current_Current Employee, more than 3 years',
       'current_Current Employee, more than 5 years',
       'current_Current Employee, more than 8 years',
       'current_Former Employee', 'current_Former Employee, less than 1 year',
       'current_Former Employee, more than 1 year',
       'current_Former Employee, more than 10 years',
       'current_Former Employee, more than 3 years',
       'current_Former Employee, more than 5 years',
       'current_Former Employee, more than 8 years', 'current_Other',
       'work_life_balance_1.0', 'work_life_balance_2.0',
       'work_life_balance_3.0', 'work_life_balance_4.0',
       'work_life_balance_5.0', 'culture_values_1.0', 'culture_values_2.0',
       'culture_values_3.0', 'culture_values_4.0', 'culture_values_5.0',
       'diversity_

In [116]:
# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

X_train_scaled.shape

(628924, 53)

## NN Attempt - Compile, Train, & Evaluate Default Model

In [120]:
# Define the model
nn_default = tf.keras.models.Sequential()

# First hidden layer
nn_default.add(tf.keras.layers.Dense(units=80, activation='relu', input_shape=(X_train_scaled.shape[1],)))

# Second hidden layer
nn_default.add(tf.keras.layers.Dense(units=30, activation='relu'))

# Output layer
nn_default.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

# Check the structure of the model
nn_default.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 80)                4320      
                                                                 
 dense_13 (Dense)            (None, 30)                2430      
                                                                 
 dense_14 (Dense)            (None, 1)                 31        
                                                                 
Total params: 6,781
Trainable params: 6,781
Non-trainable params: 0
_________________________________________________________________


In [122]:
# Compile the model
nn_default.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

# Train the model
history = nn_default.fit(X_train_scaled, y_train, epochs=10, verbose=1, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
 180/9827 [..............................] - ETA: 11s - loss: 8.3984 - accuracy: 0.0708

KeyboardInterrupt: ignored