# Metrics and Keywords

In [68]:
metrics = [
    'insurance', 
    'safety', 
    'balance', 
    'retirement', 
    'culture', 
    'racism', 
    'sexism', 
    'ageism', 
    'benefits', 
    'opportunities', 
    'privacy', 
    'resources']

keywords = {
    'insurance': ['insur', 'health', 'coverage', 'sick', 'medical'],
    'safety': ['safe', 'drug', 'alcohol', 'violence', 'violent', 'hazard', 'working conditions'],
    'balance': ['work life balance', 'worklife balance', 'work-life balance', 'work/life balance', 'work and life balance', 'burnout', 'burn out', 'stress', 'time management'],
    'retirement': ['retire', '10-99R', 'saving', 'long-term', 'long term'],
    'culture': ['culture', 'people', 'colleague', 'value', 'trust', 'atmosphere', 'collaborat'],
    'racism': ['racis', 'my race', 'his race', 'her race', 'their race', 'prejudice', 'racial', 'black', 'white', 'indian', 'Indian', 'asian', 'Asian', 'minorit'],
    'sexism': ['gender', 'male', 'female'],
    'ageism': ['age', 'retire'],
    'benefits': ['benefit', 'cash', 'pay', 'compensat', 'salar', 'time off', 'day off', 'days off', 'bonus'],
    'opportunities': ['opportunit', 'project', 'collaborat', 'grow', 'skill', 'advanc', 'dream'],
    'privacy': ['priva', 'personal', 'bag', 'clothe'],
    'resources': ['resource', 'train', 'benefit', 'skill', 'learn', 'mentor', 'coach']
}

# Helper Methods

In [69]:
import numpy as np
import pandas as pd

In [190]:
def metric_search(text, column, index, words):
    """ Classifies text based on keywords for a metric.
    
    Args:
        text::[pd.Series]
            The text column to analyze.
        column::[pd.Series]
            A pointer to the formal classification column.
        index::[int]
            The index to perform the search at, within the columns.
        words::[list]
            The list of keywords affiliated with the metric to analyze for.
            
    Return:
        0 if none of the keywords were found.
        1 otherwise.
    """
    
    value = column[index]
    if column[index] == 1:
        return int(True)
    else:
        return int(any([text[index].lower().find(word) >= 0 for word in words]))

def classify_with_keywords(reviews_df, metric):
    """ Classifies the metrics for a given dataset into their own columns, using keywords.
    
    Args:
        reviews_df::[pd.DataFrame]
            The reviews table to analyze.
        metric::[str]
            The specific metric to base the classification on.
    """
    
    current_keywords = keywords[metric]
    
    pros_text = reviews_df['pros']
    current_pros_column = reviews_df[f'{metric}_pros']
    
    cons_text = reviews_df['cons']
    current_cons_column = reviews_df[f'{metric}_cons']
    
    for i in range(len(reviews_df)):
        current_pros_column[i] = metric_search(pros_text, current_pros_column, i, current_keywords)
        current_cons_column[i] = metric_search(cons_text, current_cons_column, i, current_keywords)
    
    print(f'Completed {metric}.')

# Applying the Classifier

In [26]:
reviews = pd.read_csv("all_reviews_classified_BERT_separated.csv", header = 0, sep = ";")
reviews.head()

Unnamed: 0,Company,review_title,pros,cons,text,score_pros,score_cons,score_combined,insurance_pros,safety_pros,...,balance_cons,retirement_cons,culture_cons,racism_cons,sexism_cons,ageism_cons,benefits_cons,opportunities_cons,privacy_cons,resources_cons
0,ExxonMobil,Great Company Overall,Great work environment Great benefits Pretty g...,I have not experienced anything negative so far,{ Pros. Great work environment Great benefits ...,0.95,0.87,0.97,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ExxonMobil,working on energy R&D,"Outstanding colleagues, working on high impact...",Difficult industry business environment curren...,"{ Pros. Outstanding colleagues, working on hig...",0.32,0.65,0.77,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ExxonMobil,Flexibility,The flexibility and the nature of working ther...,No downside. PERIOD. Such a great place to join.,{ Pros. The flexibility and the nature of work...,0.86,0.93,0.96,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ExxonMobil,I can only be thankful,I am achieving my dreams in partnership with t...,"It is hard times right now. But for me, it's w...",{ Pros. I am achieving my dreams in partnershi...,0.77,0.91,0.93,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ExxonMobil,Decent company to work for,"Competitive pay, structured benefits, and job ...",Even if you worked your tail off the whole yea...,"{ Pros. Competitive pay, structured benefits, ...",0.7,0.72,0.9,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
for metric in metrics:
    classify_with_keywords(reviews, metric)
    
reviews.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_pros_column[i] = metric_search(pros_text, current_pros_column, i, current_keywords)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  current_cons_column[i] = metric_search(cons_text, current_cons_column, i, current_keywords)


Completed insurance.
Completed safety.
Completed balance.
Completed retirement.
Completed culture.
Completed racism.
Completed sexism.
Completed ageism.
Completed benefits.
Completed opportunities.
Completed privacy.
Completed resources.


Unnamed: 0,Company,review_title,pros,cons,text,score_pros,score_cons,score_combined,insurance_pros,safety_pros,...,balance_cons,retirement_cons,culture_cons,racism_cons,sexism_cons,ageism_cons,benefits_cons,opportunities_cons,privacy_cons,resources_cons
0,ExxonMobil,Great Company Overall,Great work environment Great benefits Pretty g...,I have not experienced anything negative so far,{ Pros. Great work environment Great benefits ...,0.95,0.87,0.97,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ExxonMobil,working on energy R&D,"Outstanding colleagues, working on high impact...",Difficult industry business environment curren...,"{ Pros. Outstanding colleagues, working on hig...",0.32,0.65,0.77,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ExxonMobil,Flexibility,The flexibility and the nature of working ther...,No downside. PERIOD. Such a great place to join.,{ Pros. The flexibility and the nature of work...,0.86,0.93,0.96,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ExxonMobil,I can only be thankful,I am achieving my dreams in partnership with t...,"It is hard times right now. But for me, it's w...",{ Pros. I am achieving my dreams in partnershi...,0.77,0.91,0.93,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ExxonMobil,Decent company to work for,"Competitive pay, structured benefits, and job ...",Even if you worked your tail off the whole yea...,"{ Pros. Competitive pay, structured benefits, ...",0.7,0.72,0.9,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
for metric in metrics:
#     shape = reviews[reviews[f'{metric}'] == 1].shape
    shape_p = reviews[reviews[f'{metric}_pros'] == 1].shape
    shape_c = reviews[reviews[f'{metric}_cons'] == 1].shape
    
#     print(f'Shape of reviews predicted with {metric}_pros metric: {shape}')
    print(f'Shape of reviews predicted with {metric}_pros metric: {shape_p}')
    print(f'Shape of reviews predicted with {metric}_cons metric: {shape_c}\n')

Shape of reviews predicted with insurance_pros metric: (336, 32)
Shape of reviews predicted with insurance_cons metric: (133, 32)

Shape of reviews predicted with safety_pros metric: (345, 32)
Shape of reviews predicted with safety_cons metric: (189, 32)

Shape of reviews predicted with balance_pros metric: (777, 32)
Shape of reviews predicted with balance_cons metric: (502, 32)

Shape of reviews predicted with retirement_pros metric: (141, 32)
Shape of reviews predicted with retirement_cons metric: (138, 32)

Shape of reviews predicted with culture_pros metric: (2260, 32)
Shape of reviews predicted with culture_cons metric: (1346, 32)

Shape of reviews predicted with racism_pros metric: (19, 32)
Shape of reviews predicted with racism_cons metric: (93, 32)

Shape of reviews predicted with sexism_pros metric: (11, 32)
Shape of reviews predicted with sexism_cons metric: (56, 32)

Shape of reviews predicted with ageism_pros metric: (1009, 32)
Shape of reviews predicted with ageism_cons me