# **0) Imports**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pathlib
import glob
import os

!git clone https://github.com/loier13/IEOR235.git

# Show the full text of every cell when a DataFrame is displayed, instead of
# truncating long reviews. None is the documented value for "no limit".
pd.set_option('display.max_colwidth', None)

Cloning into 'IEOR235'...
remote: Enumerating objects: 40, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 40 (delta 10), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (40/40), done.


# **1) Open reviews**

In [None]:
# Raw reviews come from the cloned IEOR235 repository; the file is ';'-separated.
all_reviews = pd.read_csv(
    'IEOR235/all_reviews/all_reviews.csv',
    sep=';',
)

def clean_reviews(data):
  """
  Flatten multi-line reviews and return the de-duplicated 'pros' texts.

  The input frame is left untouched: all work happens on a copy, so the cell
  can be re-run and the raw reviews stay available to the caller.

  Parameters
  ----------
  data : pandas.DataFrame
      Raw reviews; must contain the columns 'pros' and 'cons'.

  Returns
  -------
  pandas.DataFrame
      Two columns: 'Id' (the row's index label in `data`, kept so a text can
      be traced back to its source row) and 'text' (the cleaned 'pros' text).
  """
  cleaned = data.copy()
  for column in ('pros', 'cons'):
    # Cast to str first so non-string cells do not break the replacement,
    # then turn every line break into a sentence break.
    cleaned[column] = cleaned[column].astype(str).str.replace('\n', '. ', regex=False)
  # Duplicates are judged on all columns (including the cleaned 'cons'),
  # not on 'pros' alone.
  cleaned = cleaned.drop_duplicates()
  result = cleaned[['pros']].reset_index()
  result.columns = ['Id', 'text']
  return result

all_reviews = clean_reviews(all_reviews)
display(all_reviews.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# display() only adds a stray "None" to the cell output.
all_reviews.info()

Unnamed: 0,Id,text
0,0,Great work environment Great benefits Pretty good work/life balance.
1,1,"Outstanding colleagues, working on high impact problems."
2,2,The flexibility and the nature of working there is more like a family environment. I love it how we all look out for each other..
3,3,"I am achieving my dreams in partnership with the company. Very thankful. It is hard work, but when wasn't it supposed to be that way if you pursue your dreams.."
4,4,"Competitive pay, structured benefits, and job satisfaction."


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9421 entries, 0 to 9420
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Id      9421 non-null   int64 
 1   text    9421 non-null   object
dtypes: int64(1), object(1)
memory usage: 147.3+ KB


None

# **2) Naive topic detection**

Topic detection by keywords.

In [None]:
# Keyword stems per ESG pillar. naive_topic_detection matches them as lowercase
# substrings of the review text, so a short stem also fires inside longer words
# (e.g. 'air' is contained in 'fair', 'bio' in 'biomass').

# Environment
E_keywords = ['renewable', 'recycl', 'reuse', 'compost', 'recovery', 'ecocide', 'bio', 'carbon', 'forest',
 'sustainable', 'pollut', 'emissions', 'green', 'co2', 'ch4', 'n2o',
 'hfcs', 'pfcs', 'sf6', 'nf3', 'cfc-11', 'nox', 'sox', 'warming', 'climate',
 'waste', 'garbage', 'trash', 'disposal', 'landfill', 'chemicals', 'acidification', 'fossil',
 'eutrophication', 'environmental', 'consumption', 'water', 'resource', 'ecosystem', 'ecology', 'incineration',
 'ozone', 'natural', 'solar', 'biomass', 'air', 'soil', 'dioxide', 'footprint', 'geoengineering']

# Social ('resource' is shared with the Environment list)
S_keywords = ['labor', 'health', 'safe', 'human', 'standards', 'quality', 'life', 'privacy', 'private', 'responsib', 'insur', 'risk', 'care', 'opportunit', 'resource']

# Governance
G_keywords = ['corrupt', 'management', 'board', 'pay', 'fair', 'owner', 'account', 'ethics', 'competit', 'practice', 'stable', 'stabilit', 'system', 'transparen']

def naive_topic_detection(data, topic, keywords):
  """
  Flag reviews that mention a topic, using case-insensitive keyword matching.

  This is a deliberately naive prototype: a review is flagged as soon as any
  keyword occurs as a substring of its text, so precision and recall may both
  be suboptimal.

  Parameters
  ----------
  data : pandas.DataFrame
      Must contain a 'text' column. It is not modified.
  topic : str
      Topic name; the result column is called f'{topic}_naive'.
  keywords : iterable of str
      Keywords or stems to look for. They are lowercased before matching.

  Returns
  -------
  pandas.DataFrame
      Copy of `data` with an extra integer column f'{topic}_naive' holding
      1 when at least one keyword is found in the text and 0 otherwise.
      Non-string texts (e.g. missing values) are flagged 0.
  """
  # Lowercase the keywords once so matching is case-insensitive on both sides.
  lowered_keywords = [str(word).lower() for word in keywords]

  def mentions_topic(text):
    # A missing or non-text review cannot mention any keyword.
    if not isinstance(text, str):
      return False
    lowered_text = text.lower()  # lowercased once per review, not once per keyword
    return any(word in lowered_text for word in lowered_keywords)

  output = data.copy()
  output[f'{topic}_naive'] = data['text'].apply(mentions_topic).astype(int)
  return output

# Run the keyword detector once per ESG pillar; each call yields its own frame.
all_reviews_E, all_reviews_S, all_reviews_G = (
    naive_topic_detection(all_reviews, pillar, pillar_keywords)
    for pillar, pillar_keywords in (
        ('E', E_keywords),
        ('S', S_keywords),
        ('G', G_keywords),
    )
)
display(all_reviews_E.head())

Unnamed: 0,Id,text,E_naive
0,0,Great work environment Great benefits Pretty good work/life balance.,0
1,1,"Outstanding colleagues, working on high impact problems.",0
2,2,The flexibility and the nature of working there is more like a family environment. I love it how we all look out for each other..,0
3,3,"I am achieving my dreams in partnership with the company. Very thankful. It is hard work, but when wasn't it supposed to be that way if you pursue your dreams..",0
4,4,"Competitive pay, structured benefits, and job satisfaction.",0


# **3) Manual annotation**

We deliberately label E, S and G separately rather than as a single multiclass problem: the three classes overlap, so separate labels let us train a dedicated, better-suited classifier for each of them. The following sections therefore go through the E, S and G labelling in turn.

In [None]:
%%capture --no-display
!pip install superintendent

from superintendent.distributed import ClassLabeller

### **a. Environment**

In [None]:
# Placeholder label column: NaN marks a review that has not been annotated yet.
all_reviews_E['E'] = np.nan

# Only the keyword-flagged reviews are labelled by hand. Take an explicit copy
# so that later writes to active_E do not target a slice of all_reviews_E
# (which raises pandas' SettingWithCopyWarning).
active_E = all_reviews_E[all_reviews_E.E_naive == 1].copy()

# Labelling widget over the texts that still have no label.
widget_E = ClassLabeller(
    features=active_E[active_E['E'].isnull()]['text'].tolist(),
    options=[
        "E", "Non-E"
    ]
)

widget_E

ClassLabeller(children=(HBox(children=(FloatProgress(value=0.0, description='Progress:', max=1.0),)), Box(chil…

In [None]:
# The widget offers the options "E" and "Non-E", so the mapping must contain
# "Non-E" with a capital N; with only "non-E" every negative annotation turns
# into NaN. The lowercase spelling is kept for labels stored that way.
# NOTE(review): nothing visible in this notebook copies the widget's answers
# into active_E['E'] before this cell (presumably via widget_E.new_labels) - confirm.
label_to_int_E = {"E": 1, "Non-E": 0, "non-E": 0}
# assign() builds a new frame instead of writing into a possible slice.
active_E = active_E.assign(E=active_E['E'].map(label_to_int_E))
active_E.head()

Unnamed: 0,Id,text,E_naive,E
0,24,"Huge resources, meritocracy, very intriguin intelligent and capable people.",1,0
1,27,Paid well Competitive environment New opportunities (job rotation every 2-3 years) Values training and providing resources for personal/job growth.,1,0
2,54,"Competitive pay, resources, benefits, and coworkers..",1,0
3,72,"Top-caliber co-workers and very interesting opportunities. Compensation is fair and (until recently) my job felt extremely secure, as long as I performed satisfactorily..",1,0
4,81,"- Some peers make this place bearable through trauma bonding - The campus is beautiful and you can take froyo breaks - If you are lucky enough to be at one of the Houston sites, you do not have state income tax - You get to learn if you like oil, gas, & chemicals or not.",1,0


In [None]:
# Reviews without any E keyword are labelled 0 without manual review.
# assign() returns a new frame, which avoids the SettingWithCopyWarning raised
# by writing a column into a filtered slice of all_reviews_E.
non_E = all_reviews_E[all_reviews_E.E_naive == 0].assign(E=0)
final_E = pd.concat([non_E, active_E])
# The index is written as the first CSV column (to_csv default).
final_E.to_csv('reviews_E_labeled.csv', sep = ';')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


The final dataset `final_E` is saved to `reviews_E_labeled.csv` and will be used in the next notebook for classification.

### **b. Social**

In [None]:
# Placeholder label column: NaN marks a review that has not been annotated yet.
all_reviews_S['S'] = np.nan

# Only the keyword-flagged reviews are labelled by hand. Take an explicit copy
# so that later writes to active_S do not target a slice of all_reviews_S
# (which raises pandas' SettingWithCopyWarning).
active_S = all_reviews_S[all_reviews_S.S_naive == 1].copy()

# Labelling widget over the texts that still have no label.
widget_S = ClassLabeller(
    features=active_S[active_S['S'].isnull()]['text'].tolist(),
    options=[
        "S", "Non-S"
    ]
)

widget_S

ClassLabeller(children=(HBox(children=(FloatProgress(value=0.0, description='Progress:', max=1.0),)), Box(chil…

In [None]:
# The widget offers the options "S" and "Non-S", so the mapping must contain
# "Non-S" with a capital N; with only "non-S" every negative annotation turns
# into NaN. The lowercase spelling is kept for labels stored that way.
# NOTE(review): nothing visible in this notebook copies the widget's answers
# into active_S['S'] before this cell (presumably via widget_S.new_labels) - confirm.
label_to_int_S = {"S": 1, "Non-S": 0, "non-S": 0}
# assign() builds a new frame instead of writing into a possible slice.
active_S = active_S.assign(S=active_S['S'].map(label_to_int_S))
active_S.head()

Unnamed: 0,Id,text,S_naive,S
0,0,Great work environment Great benefits Pretty good work/life balance.,1,1.0
1,5,"Pension, health insurance benefits, rotating 4 on 4 off schedule.",1,1.0
2,8,"People, Assignment opportunities, Benefits, Salary.",1,1.0
3,9,Great place to work depending on career path and group placed in the company; Competitive benefits.,1,1.0
4,15,"Had some stability, consistency and fun in the past. I hope we're on the verge of throwing out the current Sr. management. Exciting to be part of the resistance within a collapsing empire..",1,0.0


In [None]:
# Reviews without any S keyword are labelled 0 without manual review.
# assign() returns a new frame, which avoids the SettingWithCopyWarning raised
# by writing a column into a filtered slice of all_reviews_S.
non_S = all_reviews_S[all_reviews_S.S_naive == 0].assign(S=0)
final_S = pd.concat([non_S, active_S])
# The index is written as the first CSV column (to_csv default).
final_S.to_csv('reviews_S_labeled.csv', sep = ';')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### **c. Governance**

In [None]:
# Placeholder label column: NaN marks a review that has not been annotated yet.
all_reviews_G['G'] = np.nan

# Only the keyword-flagged reviews are labelled by hand. Take an explicit copy
# so that later writes to active_G do not target a slice of all_reviews_G
# (which raises pandas' SettingWithCopyWarning).
active_G = all_reviews_G[all_reviews_G.G_naive == 1].copy()

# Labelling widget over the texts that still have no label.
widget_G = ClassLabeller(
    features=active_G[active_G['G'].isnull()]['text'].tolist(),
    options=[
        "G", "Non-G"
    ]
)

widget_G

ClassLabeller(children=(HBox(children=(FloatProgress(value=0.0, description='Progress:', max=1.0),)), Box(chil…

In [None]:
# The widget offers the options "G" and "Non-G", so the mapping must contain
# "Non-G" with a capital N; with only "non-G" every negative annotation turns
# into NaN. The lowercase spelling is kept for labels stored that way.
# NOTE(review): nothing visible in this notebook copies the widget's answers
# into active_G['G'] before this cell (presumably via widget_G.new_labels) - confirm.
label_to_int_G = {"G": 1, "Non-G": 0, "non-G": 0}
# assign() builds a new frame instead of writing into a possible slice.
active_G = active_G.assign(G=active_G['G'].map(label_to_int_G))
active_G.head()

Unnamed: 0.1,Unnamed: 0,Id,text,G_naive,G
0,4,4,"Competitive pay, structured benefits, and job satisfaction.",1,0.0
1,9,9,Great place to work depending on career path and group placed in the company; Competitive benefits.,1,0.0
2,20,20,Great benefits and pay. Great parental time off and consideration for sick leave.,1,0.0
3,21,21,"A lot of the people you work with are delightful, willing to teach, and genuinely want to be productive helpful employees. A lot of employees are willing to mentor and the career opportunities at the corporation are fantastic. Pay was great, benefits too. Best I've seen in the energy industry..",1,0.0
4,27,27,Paid well Competitive environment New opportunities (job rotation every 2-3 years) Values training and providing resources for personal/job growth.,1,0.0


In [None]:
# Reviews without any G keyword are labelled 0 without manual review.
# assign() returns a new frame, which avoids the SettingWithCopyWarning raised
# by writing a column into a filtered slice of all_reviews_G.
non_G = all_reviews_G[all_reviews_G.G_naive == 0].assign(G=0)
final_G = pd.concat([non_G, active_G])
# The index is written as the first CSV column (to_csv default).
final_G.to_csv('reviews_G_labeled.csv', sep = ';')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
