In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import string

# Show which files are present in the ./dataset/ directory.
print(os.listdir("./dataset/"))

['clean_test.csv', 'clean_train.csv', 'survey_questions.csv', 'test.csv', 'train.csv']


In [2]:
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, WhitespaceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from wordcloud import WordCloud

# add sentiment analysis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
# Read the train and test splits from the dataset directory.
train_csv = './dataset/clean_train.csv'
test_csv = './dataset/clean_test.csv'

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [4]:
# Columns removed from both splits before sentiment scoring.
drop_col = [
    'job_title', 'summary', 'positives', 'negatives', 'advice_to_mgmt',
    'score_1', 'score_2', 'score_3', 'score_4', 'score_5', 'score_6',
    'len_pos', 'len_neg', 'num_words_pos', 'num_words_neg',
]
for frame in (df_train, df_test):
    frame.drop(columns=drop_col, inplace=True)

# 'overall' is dropped from the training frame only.
df_train.drop(columns='overall', inplace=True)

In [5]:
# Preview the first three rows of the training frame after the column drop.
df_train.head(3)

Unnamed: 0,Place,status,clean_positives,clean_negatives,clean_reviews
0,startup_1,Current Employee,people smart friendly,bureaucracy slow thing,people smart friendly bureaucracy slow thing
1,startup_1,Former Employee,food food food cafe main campus mtv alone mini...,work/life balance balance perk benefit illusio...,food food food cafe main campus mtv alone mini...
2,startup_1,Current Employee,software engineer among king hill google engin...,become large come grow pain bureaucracy slow r...,software engineer among king hill google engin...


In [6]:
def SIAscores(df, analyzer=None):
    """Add sentiment dict, compound score and label columns to ``df`` in place.

    For each of the text columns ``clean_positives``, ``clean_negatives`` and
    ``clean_reviews``, three columns are written, prefixed ``positive``,
    ``negative`` and ``review`` respectively, in this order:

    * ``<prefix>_sentiments`` -- the dict returned by ``polarity_scores``
    * ``<prefix>_score``      -- the ``'compound'`` entry of that dict
    * ``<prefix>_label``      -- the score bucketed into a text label

    Every cell is passed through ``str()`` before scoring, so non-string
    values are scored via their string form rather than raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three ``clean_*`` columns. Modified in place.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a dict
        with a ``'compound'`` key. When omitted, a new
        ``SentimentIntensityAnalyzer`` is created. Passing one in lets the
        caller reuse a single analyzer across several frames.

    Returns
    -------
    None
        The new columns are added to ``df`` itself.
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    def sentiment_score(score):
        """Bucket a compound score into a label (cut points at +/-0.33, +/-0.67)."""
        # Branches are tested from the top down, so each one only needs its
        # lower bound; the upper bound is implied by the branches above it.
        if score >= 0.67:
            return "Very Happy"
        if score >= 0.33:
            return "Happy"
        if score >= -0.33:
            return "Neutral"
        if score >= -0.67:
            return "Quite Unhappy"
        # Anything below -0.67, and NaN (every comparison above is False).
        return "Unhappy"

    # (output prefix, source text column) -- order fixes the column order.
    targets = (
        ("positive", "clean_positives"),
        ("negative", "clean_negatives"),
        ("review", "clean_reviews"),
    )
    for prefix, text_col in targets:
        sentiments_col = f"{prefix}_sentiments"
        score_col = f"{prefix}_score"
        label_col = f"{prefix}_label"

        df[sentiments_col] = df[text_col].apply(
            lambda text: analyzer.polarity_scores(str(text))
        )
        df[score_col] = [scores.get('compound') for scores in df[sentiments_col]]
        df[label_col] = df[score_col].apply(sentiment_score)

# Score the training frame first, then the test frame (both modified in place).
for frame in (df_train, df_test):
    SIAscores(frame)

In [7]:
# Drop the per-row polarity dict columns from both frames;
# the score and label columns stay.
drop_col = [
    'positive_sentiments',
    'negative_sentiments',
    'review_sentiments',
]
for frame in (df_train, df_test):
    frame.drop(columns=drop_col, inplace=True)

In [8]:
# Spot-check five random rows. The fixed seed keeps the displayed rows
# identical across Restart & Run All, so the output stays reproducible.
df_train.sample(5, random_state=42)

Unnamed: 0,Place,status,clean_positives,clean_negatives,clean_reviews,positive_score,positive_label,negative_score,negative_label,review_score,review_label
441,startup_1,Former Employee,perk think,difficult get promote,perk think difficult get promote,0.0,Neutral,0.0258,Neutral,0.0258,Neutral
1160,startup_1,Former Employee,benefit plus four word,option work remote office,benefit plus four word option work remote office,0.4588,Happy,0.0,Neutral,0.4588,Happy
12506,startup_4,Current Employee,good work life balance achieve,less growth opportunity company,good work life balance achieve less growth opp...,0.4404,Happy,0.5898,Happy,0.7736,Very Happy
11472,startup_4,Former Employee,pay weekly overtime pay good,long hour flexibility bad management,pay weekly overtime pay good long hour flexibi...,0.2732,Neutral,-0.2732,Neutral,0.0,Neutral
14220,startup_4,Former Employee,helpful people free beverage,unrealistic deadline pointless meeting project...,helpful people free beverage unrealistic deadl...,0.7269,Very Happy,0.0,Neutral,0.7269,Very Happy


In [9]:
# Spot-check five random rows. The fixed seed keeps the displayed rows
# identical across Restart & Run All, so the output stays reproducible.
df_test.sample(5, random_state=42)

Unnamed: 0,Place,status,clean_positives,clean_negatives,clean_reviews,positive_score,positive_label,negative_score,negative_label,review_score,review_label
22323,startup_6,Current Employee,solid work life balance stable company,pay market promotion velocity inconsistent ove...,solid work life balance stable company pay mar...,0.4215,Happy,-0.2732,Neutral,0.1779,Neutral
6396,startup_4,Former Employee,great company reputable organization,work life balance mandatory overtime,great company reputable organization work life...,0.6249,Happy,0.0772,Neutral,0.6597,Happy
9787,startup_4,Former Employee,decent benefit nice people good location,public holiday tough work seattle centris,decent benefit nice people good location publi...,0.8271,Very Happy,0.296,Neutral,0.872,Very Happy
564,startup_1,Former Employee,usual top,many people company,usual top many people company,0.2023,Neutral,0.0,Neutral,0.2023,Neutral
2095,startup_1,Former Employee,good pay supportive colleague nice environment...,none moment google big good company pleasure w...,good pay supportive colleague nice environment...,0.9001,Very Happy,0.765,Very Happy,0.9559,Very Happy


In [10]:
# Write both scored frames to CSV without the index column.
outputs = (
    (df_train, 'dataset/sentiment_analysis_train.csv'),
    (df_test, 'dataset/sentiment_analysis_test.csv'),
)
for frame, out_path in outputs:
    frame.to_csv(out_path, index=False)