In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import string

# Show which files are available in the dataset folder before loading anything.
dataset_files = os.listdir("./dataset/")
print(dataset_files)

['clean_test.csv', 'clean_train.csv', 'survey_questions.csv', 'test.csv', 'train.csv']


In [2]:
import warnings
warnings.filterwarnings('ignore')

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, WhitespaceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from wordcloud import WordCloud

# add sentiment analysis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
# Read the pre-cleaned train and test splits from the dataset folder.
train_csv, test_csv = './dataset/clean_train.csv', './dataset/clean_test.csv'
df_train, df_test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [4]:
# Columns removed from both splits before the sentiment features are built.
drop_col = ['job_title', 'score_1', 'score_2', 'score_3', 'score_4', 'score_5', 'score_6']

# Reassign rather than drop in place, and tolerate columns that are already
# gone: with `inplace=True` a second run of this cell raised KeyError because
# the first run had removed the columns. NOTE: errors='ignore' also hides a
# misspelled name in `drop_col`, so keep the list in sync with the CSV header.
df_train = df_train.drop(columns=drop_col, errors='ignore')
df_test = df_test.drop(columns=drop_col, errors='ignore')

In [5]:
# Peek at the first three training rows to confirm the remaining columns.
df_train.head(3)

Unnamed: 0,Place,status,summary,positives,negatives,advice_to_mgmt,overall,len_pos,len_neg,num_words_pos,num_words_neg,clean_positives,clean_negatives,clean_reviews
0,startup_1,Current Employee,Best Company to work for,People are smart and friendly,Bureaucracy is slowing things down,,5,29,34,5,5,people smart friendly,bureaucracy slow thing,people smart friendly bureaucracy slow thing
1,startup_1,Former Employee,"Moving at the speed of light, burn out is inev...","1) Food, food, food. 15+ cafes on main campus ...",1) Work/life balance. What balance? All those ...,1) Don't dismiss emotional intelligence and ad...,5,1048,2403,155,398,food food food cafe main campus mtv alone mini...,work/life balance balance perk benefit illusio...,food food food cafe main campus mtv alone mini...
2,startup_1,Current Employee,Great balance between big-company security and...,"* If you're a software engineer, you're among ...","* It *is* becoming larger, and with it comes g...",Keep the focus on the user. Everything else wi...,5,3634,1064,629,176,software engineer among king hill google engin...,become large come grow pain bureaucracy slow r...,software engineer among king hill google engin...


In [6]:
def SIAscores(df, analyzer=None):
    """Add sentiment-score columns to ``df`` in place.

    Three text columns are scored with ``analyzer.polarity_scores`` and the
    resulting dicts are stored as new columns:

    * ``clean_positives`` -> ``pos_sentiments``
    * ``clean_negatives`` -> ``neg_sentiments``
    * ``clean_reviews``   -> ``review_sentiments``

    The ``neg`` / ``neu`` / ``pos`` / ``compound`` entries of
    ``review_sentiments`` are then copied into the flat columns
    ``review_negative``, ``review_neutral``, ``review_positive`` and
    ``review_compound``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``clean_positives``, ``clean_negatives`` and
        ``clean_reviews``. Modified in place.
    analyzer : object, optional
        Anything exposing ``polarity_scores(text) -> dict`` with the four keys
        above. Defaults to a fresh ``SentimentIntensityAnalyzer``.

    Returns
    -------
    None
        Kept as ``None`` so a notebook cell ending in this call shows no output.
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    def _score(text):
        # A missing value (NaN/None, e.g. an empty CSV field) must not be
        # stringified: str(nan) is the word "nan", which would be scored as if
        # the reviewer had written it. Score it as empty text instead.
        if pd.isna(text):
            text = ""
        return analyzer.polarity_scores(str(text))

    text_to_scores = (
        ("clean_positives", "pos_sentiments"),
        ("clean_negatives", "neg_sentiments"),
        ("clean_reviews", "review_sentiments"),
    )
    for text_col, scores_col in text_to_scores:
        df[scores_col] = df[text_col].apply(_score)

    # Flatten the whole-review score dict into one numeric column per key.
    flat_columns = (
        ("neg", "review_negative"),
        ("neu", "review_neutral"),
        ("pos", "review_positive"),
        ("compound", "review_compound"),
    )
    for key, flat_col in flat_columns:
        df[flat_col] = [scores[key] for scores in df["review_sentiments"]]

In [7]:
# Add the sentiment columns to both splits; SIAscores mutates each frame in place.
for split_frame in (df_train, df_test):
    SIAscores(split_frame)

In [8]:
# Spot-check five random rows with the new sentiment columns. The seed is fixed
# so a Restart & Run All shows the same rows every time.
df_train.sample(5, random_state=42)

Unnamed: 0,Place,status,summary,positives,negatives,advice_to_mgmt,overall,len_pos,len_neg,num_words_pos,...,clean_positives,clean_negatives,clean_reviews,pos_sentiments,neg_sentiments,review_sentiments,review_negative,review_neutral,review_positive,review_compound
6187,startup_4,Current Employee,"Great Place, Great People, Great Culture","- fast moving, if you are excited about techno...","- frugal, if you like sprawling offsite campus...",keep up the startup type culture,5,629,427,112,...,fast move excited technology right place flat ...,frugal like sprawl offsite campus lush green g...,fast move excited technology right place flat ...,"{'neg': 0.089, 'neu': 0.605, 'pos': 0.306, 'co...","{'neg': 0.0, 'neu': 0.764, 'pos': 0.236, 'comp...","{'neg': 0.056, 'neu': 0.664, 'pos': 0.28, 'com...",0.056,0.664,0.28,0.9823
22917,startup_6,Current Employee,Solid place to work,- Benefits!! - Good work/life balance - Intere...,- Bureaucratic - Development processes can be ...,,4,65,93,10,...,benefit good work/life balance interesting pro...,bureaucratic development process slow internal...,benefit good work/life balance interesting pro...,"{'neg': 0.0, 'neu': 0.317, 'pos': 0.683, 'comp...","{'neg': 0.307, 'neu': 0.693, 'pos': 0.0, 'comp...","{'neg': 0.137, 'neu': 0.485, 'pos': 0.379, 'co...",0.137,0.485,0.379,0.6705
24088,startup_6,Former Employee,Great Company,"Great Company fun place to work, managers coul...","If you get a bad manager, you have on recourse.",,5,57,47,10,...,great company fun place work manager could better,get bad manager recourse,great company fun place work manager could bet...,"{'neg': 0.0, 'neu': 0.327, 'pos': 0.673, 'comp...","{'neg': 0.538, 'neu': 0.462, 'pos': 0.0, 'comp...","{'neg': 0.161, 'neu': 0.367, 'pos': 0.472, 'co...",0.161,0.367,0.472,0.7783
22501,startup_6,Current Employee,Decent Place to Work,"Little/no stress, work/life balance, decent pa...","Sometimes politics gets in the way, promotion ...",,5,83,87,9,...,little/no stress work/life balance decent pay ...,sometimes politics get way promotion mostly ba...,little/no stress work/life balance decent pay ...,"{'neg': 0.323, 'neu': 0.462, 'pos': 0.215, 'co...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.183, 'neu': 0.696, 'pos': 0.122, 'co...",0.183,0.696,0.122,-0.1027
24401,startup_6,Former Employee,research program,great program. You will learn a lot. Fun envir...,didn't find many cons at this program,,5,62,37,12,...,great program learn lot fun environment,find many con program,great program learn lot fun environment find m...,"{'neg': 0.0, 'neu': 0.351, 'pos': 0.649, 'comp...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...","{'neg': 0.0, 'neu': 0.519, 'pos': 0.481, 'comp...",0.0,0.519,0.481,0.8126


# Testing

In [9]:
# Notebook-level VADER analyzer used by sentiment_analyzer_scores() below.
analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(sentence, analyzer=None):
    """Print the polarity scores of ``sentence`` and a mood label.

    Two lines are printed: the sentence followed by its score dict, then a
    label chosen from the ``compound`` score:

    * ``compound >= 0.67``            -> "Very Happy"
    * ``0.33 <= compound < 0.67``     -> "Happy"
    * ``-0.33 <= compound < 0.33``    -> "Neutral"
    * ``-0.67 <= compound < -0.33``   -> "Quite Unhappy"
    * anything else                   -> "Unhappy"

    Parameters
    ----------
    sentence : str
        Text to score.
    analyzer : object, optional
        Anything exposing ``polarity_scores(text) -> dict`` with a
        ``'compound'`` entry. When omitted, the notebook-level ``analyser``
        instance is used, as before.

    Returns
    -------
    None
        The result is only printed.
    """
    # Only touch the global when no analyzer is injected, so the function can
    # be used (and tested) without relying on notebook state.
    scorer = analyser if analyzer is None else analyzer
    score = scorer.polarity_scores(sentence)
    print("{} {}".format(sentence, str(score)))

    compound = score['compound']
    # Bands are checked from the highest down, so each branch needs only its
    # lower bound; the upper bound is implied by the branches above it.
    if compound >= 0.67:
        label = "Very Happy"
    elif compound >= 0.33:
        label = "Happy"
    elif compound >= -0.33:
        label = "Neutral"
    elif compound >= -0.67:
        label = "Quite Unhappy"
    else:
        label = "Unhappy"
    print(label)

In [10]:
# Smoke-test the labeller on a clearly positive sentence.
sentiment_analyzer_scores("I am very HAPPY!!")

I am very HAPPY!! {'neg': 0.0, 'neu': 0.274, 'pos': 0.726, 'compound': 0.7438}
Very Happy
