In [443]:
import pandas as pd
import string
from textblob import TextBlob
import string
import re
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /Users/niha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/niha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/niha/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [444]:
# COMPANIES = ['airbnb', 'booking', 'tripadvisor']
company='tripadvisor'

## I. Data Loading

### Glassdoor

In [445]:
data_glassdoor = pd.read_csv(f"data/glassdoor/{company}/glassdoor_{company}.csv")

# Glassdoor api gives pros and cons separately. Joining them to form the text column for normalized analysis
data_glassdoor['text'] = data_glassdoor.pros + data_glassdoor.cons

### Indeed

In [446]:
data_indeed = pd.read_csv(f"data/indeed/{company}/indeed_{company}.csv")
data_indeed = data_indeed[data_indeed['language'] == "en"]

## II. Preprocessing the text data

### Helper functions

In [447]:
def preprocess(text):
    # text = ' '.join(text.astype(str))
    stopwords = nltk.corpus.stopwords.words('english')
    wnl = WordNetLemmatizer()
    text = re.sub('[%s]' % re.escape(string.punctuation), '', str(text).lower())
    tokens = re.sub(r'\d+', '', text).split()
    text = ' '.join([wnl.lemmatize(str(word)) for word in tokens if word not in stopwords and len(word) > 2])
    return text

In [448]:
# Review title is positive only if if the polarity of the title is non-negative
def get_title_sentiment(title):
    analysis = TextBlob(title).sentiment.polarity
    return 1 if analysis >= 0 else 0

def get_text_polarity(text):    
    return TextBlob(text).sentiment.polarity

# Review text is positive only if the polarity of the text is 0.3 or above in the positive scale
def get_text_sentiment(text):
    analysis = get_text_polarity(text)
    return 1 if analysis > 0.3 else 0

### Glassdoor

In [449]:
data_glassdoor['cleaned_title'] = data_glassdoor.title.apply(preprocess)
# data_glassdoor['cleaned_pros'] = data_glassdoor.pros.apply(preprocess)
# data_glassdoor['cleaned_cons'] = data_glassdoor.cons.apply(preprocess)
data_glassdoor['cleaned_text'] = data_glassdoor.text.apply(preprocess)

### Indeed

In [450]:
data_indeed['cleaned_title'] = data_indeed.title.apply(preprocess)
data_indeed['cleaned_text'] = data_indeed.text.apply(preprocess)

## III. Calculating the sentiment value for independent text columns

### Glassdoor

In [451]:
data_glassdoor['title_sentiment'] = data_glassdoor.cleaned_title.apply(get_title_sentiment)
data_glassdoor['text_sentiment'] = data_glassdoor.cleaned_text.apply(get_text_sentiment)

In [452]:
result_glassdoor = data_glassdoor[['id', 'title', 'text', 'title_sentiment', 'text_sentiment']]

In [453]:
result_glassdoor['title_sentiment'].value_counts()

title_sentiment
1    189
0     11
Name: count, dtype: int64

In [454]:
result_glassdoor['text_sentiment'].value_counts()

text_sentiment
1    115
0     85
Name: count, dtype: int64

### Indeed

In [455]:
data_indeed['title_sentiment'] = data_indeed.cleaned_title.apply(get_title_sentiment)
data_indeed['text_sentiment'] = data_indeed.cleaned_text.apply(get_text_sentiment)

In [456]:
result_indeed = data_indeed[['id', 'title', 'text', 'title_sentiment', 'text_sentiment']]

In [457]:
result_indeed['title_sentiment'].value_counts()

title_sentiment
1    118
0      6
Name: count, dtype: int64

In [458]:
result_indeed['text_sentiment'].value_counts()

text_sentiment
0    64
1    60
Name: count, dtype: int64

## IV. Combining the individual sentiment scores to create a 'Blend' score

### Glassdoor

In [459]:
result_glassdoor['final_sentiment'] = result_glassdoor.apply(lambda row: 'pos' if (row.title_sentiment * row.text_sentiment) == 1 else 'neg', axis=1)

In [460]:
result_glassdoor[(result_glassdoor.title_sentiment==0) & (result_glassdoor.text_sentiment==1)]

Unnamed: 0,id,title,text,title_sentiment,text_sentiment,final_sentiment
136,78047076,Hard worker,To help people out okNothing at all its good,0,1,neg
146,77506134,Average,Health and dental free coverageMicro managemen...,0,1,neg
188,73712656,Mean girls club,Great benefits! Awesome people that love’s adv...,0,1,neg


### Indeed

In [461]:
result_indeed['final_sentiment'] = result_indeed.apply(lambda row: 'pos' if (row.title_sentiment * row.text_sentiment) == 1 else 'neg', axis=1)

In [462]:
result_indeed[(result_indeed.title_sentiment==0) & (result_indeed.text_sentiment==1)]

Unnamed: 0,id,title,text,title_sentiment,text_sentiment,final_sentiment
10,1h3tarft9jjhs802,Poor management,They hire managers who haven’t sold and don’t ...,0,1,neg
47,1deksggsvh3st801,Incompetent management,Manager didn’t have a clue on how to manage pe...,0,1,neg


## V. Aggregating the sentiment scores from all data sources

In [463]:
result_glassdoor = result_glassdoor[['id', 'title', 'text', 'final_sentiment']]
result_glassdoor.shape

(200, 4)

In [464]:
result_indeed = result_indeed[['id', 'title', 'text', 'final_sentiment']]
result_indeed.shape

(124, 4)

In [465]:
company_sentiment = pd.concat([result_glassdoor,result_indeed],ignore_index=True)
company_sentiment.shape

(324, 4)

## VI. Exporting the aggregated data

In [466]:
company_sentiment.to_csv(f'data/aggregated/aggregated_sentiment_{company}.csv')