# Prepare the Glassdoor Dataset for Hugging Face Analysis in Colab

In [2]:
import pandas as pd
import spacy
from spacy.matcher import Matcher, PhraseMatcher
import os
import matplotlib.pyplot as plt

In [16]:
# Remove the row limit so that displayed frames/series are never truncated
pd.options.display.max_rows = None

In [3]:
# Load Glassdoor dataset
# The file location can be overridden through the GLASSDOOR_REVIEWS_PATH
# environment variable; the default keeps the original local path.
reviewsPath = os.environ.get(
    "GLASSDOOR_REVIEWS_PATH", "/Users/marco/reviews_external.parquet.gzip"
)
df = pd.read_parquet(reviewsPath)
# Plain column assignment replaces the column, so it really becomes datetime64.
# (`df.loc[:, col] = ...` writes into the existing column and, on recent pandas,
# can leave it with its previous object dtype.)
df["reviewDateTime"] = pd.to_datetime(df["reviewDateTime"])
# Vectorised year extraction instead of a per-row Python lambda.
df["year"] = df["reviewDateTime"].dt.year

In [33]:
# Number of reviews in the loaded dataset
len(df.index)

7673953

In [7]:
# Read the tab-separated file listing the companies we will work with
companyNames = pd.read_csv("./data/glassdoor_company.csv", delimiter="\t")

In [27]:
# Reduce the company table to a plain list of full company names.
# The name `companyNames` is rebound from a DataFrame to a list here, so the
# guard keeps the cell safe to re-run (a second run is then a no-op instead of
# raising a TypeError when indexing the list with a string).
if isinstance(companyNames, pd.DataFrame):
    companyNames = companyNames["company_fullname"].tolist()

In [23]:
# Peek at the first few employer names to see how they are written
df.employerName.head()

0                    Apple Inc.
1              Amazon.com, Inc.
2    Adobe Systems Incorporated
3         Microsoft Corporation
4            Oracle Corporation
Name: employerName, dtype: object

In [34]:
# Keep only the reviews whose employer appears in the company list
df = df.loc[df.employerName.isin(companyNames)]

In [35]:
# Number of reviews left after restricting to the company list
len(df.index)

1095267

In [40]:
# Non-null review IDs per employer, largest first
(
    df.groupby("employerName")
    .reviewID.count()
    .sort_values(ascending=False)
)

employerName
Target Corporation                            34744
Amazon.com, Inc.                              28225
Wells Fargo & Company                         22743
AT&T Inc.                                     22051
Starbucks Corporation                         22038
McDonald's Corporation                        20754
Bank of America Corporation                   20268
Macy's, Inc.                                  19199
Verizon Communications Inc.                   19057
Best Buy Co., Inc.                            19049
Microsoft Corporation                         15678
JPMorgan Chase & Co.                          14644
United Parcel Service, Inc.                   13715
Apple Inc.                                    13456
Accenture Ltd                                 11706
Nordstrom, Inc.                               11698
Oracle Corporation                            11330
Cisco Systems, Inc.                           11238
Kohl's Corporation                            10459

In [45]:
# Review count per employer (non-null review IDs), sorted from most to fewest
numberOfReviewsPerCompany = (
    df.groupby("employerName")["reviewID"]
    .count()
    .sort_values(ascending=False)
)
# How many distinct employers remain
numberOfReviewsPerCompany.size

1805

In [59]:
# Number of employers with more than 300 reviews
int((numberOfReviewsPerCompany > 300).sum())

529

In [54]:
# Names of the employers that have more than 300 reviews
companyNameMoreThan300Reviews = (
    numberOfReviewsPerCompany.loc[numberOfReviewsPerCompany > 300].index.tolist()
)

In [56]:
# Keep only reviews of companies that have more than 300 reviews
hasEnoughReviews = df["employerName"].isin(companyNameMoreThan300Reviews)
df = df[hasEnoughReviews]

In [57]:
# Number of reviews left after the review-count filter
len(df.index)

1015079

In [61]:
# Draw a 10% random sample of the filtered reviews.
# A fixed seed makes the sample identical on every run, so anything derived
# from it is reproducible after a kernel restart.
sample = df.sample(frac=0.1, random_state=42)

In [62]:
# Number of reviews in the sample
len(sample.index)

101508

In [60]:
# Save the filtered reviews as CSV; the row index is written as the first
# column (index=True is the pandas default, spelled out here).
df.to_csv("./data/ReviewsFiltered.csv", index=True)