Importing data from Google Sheets

Referencing snippet from here: https://colab.research.google.com/notebooks/snippets/sheets.ipynb#scrollTo=JiJVCmu3dhFa 

In [None]:
# Authenticate the current Colab user before any Google API is touched.
from google.colab import auth
auth.authenticate_user()

import gspread
from google.auth import default
# `default()` returns a pair; only the credentials (first item) are used here.
creds, _ = default()

# gspread client authorized with those credentials; used below to open the sheet.
gc = gspread.authorize(creds)

In [None]:
# Open the spreadsheet by its title and take its first worksheet (`sheet1`).
worksheet = gc.open('allActivisionBlizzardGlassdoor').sheet1

In [None]:
# Fetch every cell of the worksheet as a list of rows (each row is a list).
rows = worksheet.get_all_values()
# Preview only the first few rows; printing the whole sheet floods the output.
print(rows[:5])



Converting the spreadsheet to a Pandas Dataframe

In [None]:
import pandas as pd

In [None]:
# Build a DataFrame straight from the raw rows. At this point the sheet's
# header row is still ordinary data (row 0) and the columns are just numbered.
activision_df = pd.DataFrame.from_records(rows)
display(activision_df)

Unnamed: 0,0,1,2,3,4,5
0,title,author_info,rating,pros,cons,helpful
1,"Overall, great place","Sep 21, 2021 - Senior Web Developer in Boston, MA",5,good people great benefits everyone games any ...,hard work rushed deadlines bosses don't work a...,Be the first to find this review helpful
2,"Fun, but overworked.","May 18, 2022 - Functional Tester",3,Met some pretty incredible people. Can actuall...,Too many hours. Disorganized. Corprate doesn't...,Be the first to find this review helpful
3,Terrible Managers,"Apr 11, 2022 - Anonymous Employee",1,Will say the position is wfh but then when you...,They send you free stuff sometimes.,1 person found this review helpful
4,Great People - Weak Pay,"Mar 3, 2022 - Specialist in Santa Monica, CA",3,Awesome time to be in gaming. Working on big t...,Bad news headlines and lack of clear upward mo...,1 person found this review helpful
...,...,...,...,...,...,...
448,Amazing company,"Jan 22, 2022 - Anonymous Employee",5,"Great pay, Great hours, Great people","High speed, Time consuming, Long hours",Be the first to find this review helpful
449,Great Colleagues,"Jan 18, 2022 - Art Manager in Stockholm, Stock...",4,Wonderful People Work Life Balance Experienced...,Negativity because of scandals Poor communicat...,Be the first to find this review helpful
450,Tester,"Jan 25, 2022 - Anonymous Employee",3,Friendly colleges Interesting job tasks,Shifts no work life balance,Be the first to find this review helpful
451,Localization QA Tester,"Jan 24, 2022 - Localization QA Tester in Dubli...",2,Good work place and nice people,low salary and no long contract,Be the first to find this review helpful


In [None]:
# Designating the first row of the spreadsheet as the header.
# The frame is rebuilt directly from `rows` (rather than by slicing
# `activision_df` in place) so that re-running this cell gives the same result
# instead of discarding one more review each time. The index starts at 1 so the
# row labels stay the same as when the header occupied row 0.
activision_df = pd.DataFrame(rows[1:], columns=rows[0], index=range(1, len(rows)))
activision_df.head()

Unnamed: 0,title,author_info,rating,pros,cons,helpful
1,"Overall, great place","Sep 21, 2021 - Senior Web Developer in Boston, MA",5,good people great benefits everyone games any ...,hard work rushed deadlines bosses don't work a...,Be the first to find this review helpful
2,"Fun, but overworked.","May 18, 2022 - Functional Tester",3,Met some pretty incredible people. Can actuall...,Too many hours. Disorganized. Corprate doesn't...,Be the first to find this review helpful
3,Terrible Managers,"Apr 11, 2022 - Anonymous Employee",1,Will say the position is wfh but then when you...,They send you free stuff sometimes.,1 person found this review helpful
4,Great People - Weak Pay,"Mar 3, 2022 - Specialist in Santa Monica, CA",3,Awesome time to be in gaming. Working on big t...,Bad news headlines and lack of clear upward mo...,1 person found this review helpful
5,Poor Culture,"Feb 3, 2022 - Marketing Manager in Santa Monic...",2,Gaming benefits if that's your thing,Lack of any company culture,Be the first to find this review helpful


Cleaning up the dataframe by...

- Removing the `helpful` (last) column, which indicates how many Glassdoor users rated a review as "helpful." This information is not relevant to us.
- Parsing the date from the `author_info` (second) column. Review authors are not obligated by Glassdoor to state a job title, so one is not always present; whenever it is, we strip it out. This leaves only the information we need: the date the review was posted.

In [None]:
# Remove the `helpful` column by its title.
# `columns=` replaces the positional `axis` argument, which newer pandas
# versions no longer accept; `errors='ignore'` keeps the cell safe to re-run
# once the column is already gone.
activision_df = activision_df.drop(columns='helpful', errors='ignore')

  


In [None]:
import datetime

In [None]:
# Helper function for date formatting (MM/DD/YYYY).
# e.g. takes "Jan 1, 2000" as input and returns "01/01/2000" as output.
# Both the input and the output are strings.
def format_date(original_date):
  """Convert a date such as "Jan 1, 2000" into "01/01/2000".

  The input is split on single spaces into month abbreviation, day (with its
  trailing comma) and year; any further pieces are ignored.
  """
  parts = original_date.split(' ')

  # Abbreviated month name -> month number.
  month_number = datetime.datetime.strptime(parts[0], "%b").month

  # The day arrives as e.g. "1," -- drop its final character (the comma).
  day_digits = parts[1][:-1]

  # Month and day are zero-padded to two characters; the year is used as-is.
  return '/'.join([str(month_number).zfill(2), day_digits.zfill(2), parts[2]])

In [None]:
# Extract the date from the `author_info` column.
# Each value looks like "<date> - <job title>" or, when the reviewer gave no
# job title, just "<date> -".
for index, row in activision_df.iterrows():

  # Everything before the first " -" is the date, whether or not a job title
  # follows. Splitting this way (instead of trimming a fixed two characters
  # when no title is found) also leaves a value with no trailing " -" intact.
  delimiter = ' -'
  date_published = row['author_info'].split(delimiter)[0].strip()

  # Format the date with the `format_date` helper.
  date_formatted = format_date(date_published)

  # Update the dataframe.
  activision_df.loc[index, 'author_info'] = date_formatted

In [None]:
activision_df.head()

Unnamed: 0,title,author_info,rating,pros,cons
1,"Overall, great place",09/21/2021,5,good people great benefits everyone games any ...,hard work rushed deadlines bosses don't work a...
2,"Fun, but overworked.",05/18/2022,3,Met some pretty incredible people. Can actuall...,Too many hours. Disorganized. Corprate doesn't...
3,Terrible Managers,04/11/2022,1,Will say the position is wfh but then when you...,They send you free stuff sometimes.
4,Great People - Weak Pay,03/03/2022,3,Awesome time to be in gaming. Working on big t...,Bad news headlines and lack of clear upward mo...
5,Poor Culture,02/03/2022,2,Gaming benefits if that's your thing,Lack of any company culture


# Zero-shot classification

Now, we're going to use zero-shot classification to classify our reviews
according to these axes: Culture and Values, Diversity and Inclusion, Work/Life Balance, Senior Management, Compensation and Benefits, and Career Opportunities. 

To do this, we'll first make a long list of all the sentences from our reviews. 

Then, we'll use BART from Hugging Face (https://huggingface.co/facebook/bart-large-mnli) to classify those sentences, putting them into appropriate lists! We will also keep them separated by negative and positive by assuming that whatever is under "pros" is positive, and whatever is under "cons" can be expected to be negative--this will be useful later on when we begin to use BERT for sentiment analysis.

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 10.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 40.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 3.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 29.0 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalli

In [None]:
from transformers import pipeline
# Zero-shot classification pipeline backed by the facebook/bart-large-mnli
# model. `classifier` is called later with a sentence and the candidate labels.
classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

Downloading:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [None]:
# Pull out the two free-text columns; "pros" text is treated as positive and
# "cons" text as negative in the steps that follow.
reviewsPro = activision_df['pros']      # pros column
reviewsCon = activision_df['cons']      # cons column

Now that we have our data, we need to parse things into sentences. This is a bit tricky--there's no unified format for user reviews. Some people use periods to separate sentences. Some use hyphens. Some use both hyphens and periods, with additional hyphens. When the data is scraped from the Internet, newlines are obliterated, so we can't use those as clues.

Instead, we'll assume that if we encounter a period or an exclamation mark, we're ending a sentence. We'll also assume that if we encounter a hyphen with a space after it, we're ending a sentence. 

In [None]:
# Output lists that parseSentencesFromReviewsColumn appends to: one for the
# sentences found in the "pros" column and one for those in the "cons" column.
proSentences = []
conSentences = []

def parseSentencesFromReviewsColumn(reviewsColumn, reviewsList):
  """Split every review in a column into sentences and collect them.

  A sentence is assumed to end at a period, or at a hyphen that is immediately
  followed by a space. Exclamation marks are not treated as boundaries.

  Args:
    reviewsColumn: object with an `.items()` method yielding (label, text)
      pairs, e.g. a pandas Series of review strings.
    reviewsList: list that the extracted sentences are appended to in place.
      Leading/trailing periods and hyphens are removed from each sentence;
      surrounding spaces are kept.

  Returns:
    None. Results are delivered through `reviewsList`.
  """

  def appendIfMeaningful(fragment):
    # Keep a fragment only if something other than hyphens, periods and spaces
    # remains, and it is not pure whitespace once commas, periods and spaces
    # have been trimmed from its ends.
    if fragment.strip("-. ") and not fragment.strip(",. ").isspace():
      reviewsList.append(fragment.strip("-."))

  # `.items()` is used because `.iteritems()` was removed in pandas 2.0.
  for (rowLabel, review) in reviewsColumn.items():
    currSent = ""
    # Start with no previous character so an empty review is simply skipped
    # instead of raising IndexError on `review[0]`.
    prevChar = ""
    # review = one full review section. Loop through it char by char.
    for character in review:
      endsAtPeriod = character == '.'
      endsAtHyphen = character == ' ' and prevChar == '-'
      if endsAtPeriod or endsAtHyphen:
        appendIfMeaningful(currSent)
        currSent = ""

      # The boundary character itself begins the next fragment; a leading
      # period is stripped again when that fragment is appended.
      currSent += character
      prevChar = character

    # Append whatever was left if it wasn't already caught.
    appendIfMeaningful(currSent)

# Split the "pros" reviews into proSentences and the "cons" reviews into
# conSentences (both lists are filled in place).
parseSentencesFromReviewsColumn(reviewsPro, proSentences)
parseSentencesFromReviewsColumn(reviewsCon, conSentences)

# Sanity check: show the first 15 sentences of each list.
print(proSentences[:15])
print(conSentences[:15])

['good people great benefits everyone games any type of game you play', 'Met some pretty incredible people', ' Can actually see your effects on the game', 'Will say the position is wfh but then when you are hired will change their minds', 'Awesome time to be in gaming', ' Working on big titles is fun and carries some prestige', ' Really great people to work with and a lot of fun', "Gaming benefits if that's your thing", 'Great IP, passionate people, Bagel Mondays and Donut Fridays', 'good coworkers, and nice group events', 'You get free games snack and all the coffee you can drink', ' Sometimes they order pizza for the office', ' Woohoo -_', 'Fun atmosphere at times depending on the people during your shift', '+ Work from home opportunities']
["hard work rushed deadlines bosses don't work as much as developers", 'Too many hours', ' Disorganized', " Corprate doesn't care about those under them", 'They send you free stuff sometimes', 'Bad news headlines and lack of clear upward mobility 

In [None]:
import nltk
# 'punkt' is downloaded ahead of importing/using word_tokenize below.
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
# 'wordnet' is downloaded for the WordNetLemmatizer imported above.
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from pprint import pprint

# Libraries needed to import/export files from/to drive
from google.colab import drive
# Mount Google Drive at /content/drive/ so later cells can write result files there.
drive.mount('/content/drive/')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Mounted at /content/drive/


In [None]:
lemmatizer = WordNetLemmatizer()

def tokenizeLemmetize(reviews):
  """Tokenize each sentence and rebuild it from its lemmatized tokens.

  Args:
    reviews: iterable of sentence strings.

  Returns:
    A new list with one string per input sentence. Every lemmatized token is
    followed by a single space, so each non-empty result ends with a space.
  """
  return [
      "".join(lemmatizer.lemmatize(word) + " " for word in word_tokenize(text))
      for text in reviews
  ]

In [44]:
# Tokenize and lemmatize both sentence lists; the originals are left untouched.
proSentencesCleaned = tokenizeLemmetize(proSentences)
conSentencesCleaned = tokenizeLemmetize(conSentences)

# Sanity check: show the first 15 cleaned sentences of each list.
print(proSentencesCleaned[:15])
print(conSentencesCleaned[:15])

['good people great benefit everyone game any type of game you play ', 'Met some pretty incredible people ', 'Can actually see your effect on the game ', 'Will say the position is wfh but then when you are hired will change their mind ', 'Awesome time to be in gaming ', 'Working on big title is fun and carry some prestige ', 'Really great people to work with and a lot of fun ', "Gaming benefit if that 's your thing ", 'Great IP , passionate people , Bagel Mondays and Donut Fridays ', 'good coworkers , and nice group event ', 'You get free game snack and all the coffee you can drink ', 'Sometimes they order pizza for the office ', 'Woohoo -_ ', 'Fun atmosphere at time depending on the people during your shift ', '+ Work from home opportunity ']
["hard work rushed deadline boss do n't work a much a developer ", 'Too many hour ', 'Disorganized ', "Corprate doe n't care about those under them ", 'They send you free stuff sometimes ', 'Bad news headline and lack of clear upward mobility is 

Now we're all set up to classify our sentences. We'll sort them into lists according to their valence and category (the six categories are given under "Zero-shot classification")--12 lists in total.

In [45]:
# The categories of relevance we have defined. Their order in the list fixes
# the number used in the container names below:
#   1 = Diversity and inclusion
#   2 = Culture and values
#   3 = Work life balance
#   4 = Senior management
#   5 = Career opportunities
#   6 = Compensation and benefits
candidate_labels = [
    'diversity and inclusion',
    'culture and values',
    'work life balance',
    'senior management',
    'career opportunities',
    'compensation and benefits',
]

# One empty container per category for each valence, in label order.
pros = [[] for _ in candidate_labels]
cons = [[] for _ in candidate_labels]

# Numbered names for the individual containers (the same list objects that
# sit inside `pros` / `cons`).
pro1, pro2, pro3, pro4, pro5, pro6 = pros
con1, con2, con3, con4, con5, con6 = cons

# Let's be picky and assume that if the top score is not above the threshold
# (0.4 by default), the sentence is not relevant.

def sortReviewSentencesUsingZeroShot(sentenceList, labeledContainers, threshold=0.4):
  """Classify each sentence and file it under its best-matching category.

  Every sentence is passed to `classifier` together with `candidate_labels`.
  The first entry of the result's 'labels' / 'scores' lists is taken as the
  best match.

  Args:
    sentenceList: iterable of sentence strings to classify.
    labeledContainers: list of lists, one per entry of `candidate_labels` and
      in the same order; matching sentences are appended in place.
    threshold: the top score must be strictly greater than this value for the
      sentence to be kept. Defaults to 0.4.

  Returns:
    None. Results are delivered through `labeledContainers`.
  """
  for sentence in sentenceList:
    result = classifier(sentence, candidate_labels)
    topLabel = result['labels'][0]
    topScore = float(result['scores'][0])
    # The container is looked up by the label's position in candidate_labels,
    # which replaces a six-way if/elif chain. A label that is not one of the
    # candidates is ignored, exactly as the chain would have done.
    if topScore > threshold and topLabel in candidate_labels:
      labeledContainers[candidate_labels.index(topLabel)].append(sentence)

In [46]:
# Each list item is written on a separate line; every inner list is followed
# by a line holding the token "[LISTSEP]". For the filepath, you need to
# input a directory that already exists in your drive. (e.g.,
# /content/drive/MyDrive/folderYouCreated/fileNameYouWant)

def writeListOfListsToFile(listThingy, filePath):
  """Write a list of string lists to a text file, one element per line.

  Args:
    listThingy: iterable of lists of strings.
    filePath: path of the file to create or overwrite; its directory must
      already exist.

  Returns:
    None.
  """
  # UTF-8 is set explicitly so non-ASCII review text is written the same way
  # regardless of the platform's default encoding.
  with open(filePath, 'w', encoding='utf-8') as writefile:
    for oneList in listThingy:
      for element in oneList:
        writefile.write(element)
        writefile.write('\n')
      # Terminates every inner list, including the last one.
      writefile.write("[LISTSEP]\n")

In [None]:
# Classify the cleaned "pros" sentences into the six `pros` containers, then
# save those containers to a text file on the mounted Drive.
# NOTE(review): `pros` is appended to, not reset, by this cell -- re-running it
# without first re-creating `pros` files every sentence a second time.
sortReviewSentencesUsingZeroShot(proSentencesCleaned, pros)
writeListOfListsToFile(pros, '/content/drive/MyDrive/compling_final/activisionPosClassified.txt')

In [None]:
# Classify the cleaned "cons" sentences into the six `cons` containers, then
# save those containers to a text file on the mounted Drive.
# NOTE(review): `cons` is appended to, not reset, by this cell -- re-running it
# without first re-creating `cons` files every sentence a second time.
sortReviewSentencesUsingZeroShot(conSentencesCleaned, cons)
writeListOfListsToFile(cons, '/content/drive/MyDrive/compling_final/activisionNegClassified.txt')

Now we'll write the raw classifier output for the first few sentences to a file, so we can use it to build our confusion matrix.

In [None]:
def printClassifiersForConfusion(sentenceList, howMany, filePath):
  """Write the raw classifier result for the first sentences to a file.

  Each of the first `howMany` sentences is passed to `classifier` together
  with `candidate_labels`; the string form of every result is written on its
  own line.

  Args:
    sentenceList: list of sentence strings.
    howMany: maximum number of sentences (from the start of the list) to
      classify. If the list is shorter, all of it is used.
    filePath: path of the file to create or overwrite.

  Returns:
    None.
  """
  # UTF-8 is set explicitly because the results echo the review text.
  with open(filePath, 'w', encoding='utf-8') as writefile:
    # Slicing avoids an IndexError when fewer than `howMany` sentences exist;
    # max() keeps a negative count meaning "nothing".
    for sentence in sentenceList[:max(howMany, 0)]:
      writefile.write(str(classifier(sentence, candidate_labels)))
      writefile.write("\n")

In [None]:
# Save the classifier output for the first 25 cleaned "pros" sentences.
printClassifiersForConfusion(proSentencesCleaned, 25, '/content/drive/MyDrive/compling_final/activisionPosConfusion.txt')

In [None]:
# Save the classifier output for the first 25 cleaned "cons" sentences.
printClassifiersForConfusion(conSentencesCleaned, 25, '/content/drive/MyDrive/compling_final/activisionNegConfusion.txt')