## Data Loading

In [13]:
import gdown
import pandas as pd
import warnings

# Disable all warnings
warnings.filterwarnings("ignore")

WHR_url = "https://drive.google.com/uc?export=download&id=1-9I398x5bb26U7D7VUm6Xwa_qgY-IWsN"
WHR_output = "WHR2023.csv"
gdown.download(WHR_url, WHR_output, quiet=True)

reviews_url = "https://drive.google.com/uc?export=download&id=19GZEU9YdkxvOd7rtgp7bklWi7HWtE-nT"
reviews_output = "glassdoor_reviews.csv"
gdown.download(reviews_url, reviews_output, quiet=True)

WHR_df = pd.read_csv("WHR2023.csv")
reviews_df = pd.read_csv("glassdoor_reviews.csv")

In [14]:
WHR_df.head(3)

Unnamed: 0,Country name,Ladder score,Standard error of ladder score,upperwhisker,lowerwhisker,Logged GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,Ladder score in Dystopia,Explained by: Log GDP per capita,Explained by: Social support,Explained by: Healthy life expectancy,Explained by: Freedom to make life choices,Explained by: Generosity,Explained by: Perceptions of corruption,Dystopia + residual
0,Finland,7.804,0.036,7.875,7.733,10.792,0.969,71.15,0.961,-0.019,0.182,1.778,1.888,1.585,0.535,0.772,0.126,0.535,2.363
1,Denmark,7.586,0.041,7.667,7.506,10.962,0.954,71.25,0.934,0.134,0.196,1.778,1.949,1.548,0.537,0.734,0.208,0.525,2.084
2,Iceland,7.53,0.049,7.625,7.434,10.896,0.983,72.05,0.936,0.211,0.668,1.778,1.926,1.62,0.559,0.738,0.25,0.187,2.25


In [15]:
reviews_df = reviews_df.reset_index().rename(columns={'index': 'review_index'})
print(len(reviews_df))
reviews_df.head(3)

838566


Unnamed: 0,review_index,firm,date_review,job_title,current,location,overall_rating,work_life_balance,culture_values,diversity_inclusion,career_opp,comp_benefits,senior_mgmt,recommend,ceo_approv,outlook,headline,pros,cons
0,0,AFH-Wealth-Management,2015-04-05,,Current Employee,,2,4.0,3.0,,2.0,3.0,3.0,x,o,r,"Young colleagues, poor micro management",Very friendly and welcoming to new staff. Easy...,"Poor salaries, poor training and communication."
1,1,AFH-Wealth-Management,2015-12-11,Office Administrator,"Current Employee, more than 1 year","Bromsgrove, England, England",2,3.0,1.0,,2.0,1.0,4.0,x,o,r,"Excellent staff, poor salary","Friendly, helpful and hard-working colleagues",Poor salary which doesn't improve much with pr...
2,2,AFH-Wealth-Management,2016-01-28,Office Administrator,"Current Employee, less than 1 year","Bromsgrove, England, England",1,1.0,1.0,,1.0,1.0,1.0,x,o,x,"Low salary, bad micromanagement",Easy to get the job even without experience in...,"Very low salary, poor working conditions, very..."


In [16]:
# Create descriptions_df
descriptions_df = reviews_df[['review_index', 'headline', 'pros', 'cons']].copy()

# Create opinion_df
opinion_df = reviews_df[['review_index', 'recommend', 'ceo_approv', 'outlook']].copy()

# Create ratings_df
ratings_df = reviews_df[['review_index', 'overall_rating', 'work_life_balance', 'culture_values',
                         'diversity_inclusion', 'career_opp', 'comp_benefits', 'senior_mgmt']].copy()

# Create metadata_df
metadata_df = reviews_df[['review_index', 'firm', 'date_review', 'job_title', 'current', 'location']].copy()

descriptions_df.to_csv("descriptions_df.csv", index=False)
opinion_df.to_csv("opinion_df.csv", index=False)
ratings_df.to_csv("ratings_df.csv", index=False)
metadata_df.to_csv("metadata_df.csv", index=False)

## Data Cleaning

In [17]:
metadata_df.nunique()

review_index    838566
firm               428
date_review       4813
job_title        62275
current             29
location         14487
dtype: int64

In [51]:
metadata_df['location'] = metadata_df['location'].fillna('')  # Replace missing values with an empty string
#metadata_df['location'] = metadata_df['location'].str.replace(r'(?i)England', 'United Kingdom')
#metadata_df['location'] = metadata_df['location'].str.replace(r'(?i)United Kingdom', 'United Kingdom')
United_Kingdom_instance = metadata_df[metadata_df['location'].str.contains('United Kingdom', case=False)]
metadata_df.loc[United_Kingdom_instance.index, 'location'] = 'United Kingdom'
US_instance = metadata_df[metadata_df['location'].str.contains('NY|AL', case=True)]

#capital_instances = metadata_df[metadata_df['location'].str.contains(r'[A-Z]{2}', na=False)]
#capital_instances['location'].value_counts()
US_instance['location'].value_counts()

New York, NY               31172
Armonk, NY                   767
Brooklyn, NY                 569
Buffalo, NY                  395
Purchase, NY                 308
                           ...  
Helena, AL                     1
Holbrook, NY                   1
South Valley Stream, NY        1
Albertson, NY                  1
Market, NY                     1
Name: location, Length: 610, dtype: int64

In [40]:
metadata_df['location'].value_counts()

                         297338
United Kingdom           136912
New York, NY              31172
Bangalore                 28102
Hyderābād                 11458
                          ...  
East Alton, IL                1
Duquesne, PA                  1
Sheridan, CO                  1
Peachtree Corners, GA         1
Wijnegem, Antwerp             1
Name: location, Length: 12435, dtype: int64

In [41]:
metadata_df['location'].nunique()

12435

## Data Preprocessing