In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import string

# Show which files are available in the dataset directory.
dataset_files = os.listdir("./dataset/")
print(dataset_files)

['clean_test.csv', 'clean_train.csv', 'survey_questions.csv', 'test.csv', 'train.csv']


In [2]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by
# the libraries imported below — consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, WhitespaceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from wordcloud import WordCloud

# add sentiment analysis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
# Read the clean_train / clean_test CSV files into DataFrames.
train_csv = './dataset/clean_train.csv'
test_csv = './dataset/clean_test.csv'

df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)

In [4]:
# Columns removed from both the train and the test frame in this cell.
drop_col = [
    'job_title', 'summary', 'positives', 'negatives', 'advice_to_mgmt',
    'score_1', 'score_2', 'score_3', 'score_4', 'score_5', 'score_6',
    'len_pos', 'len_neg', 'num_words_pos', 'num_words_neg',
]
for frame in (df_train, df_test):
    frame.drop(columns=drop_col, inplace=True)

# 'overall' is dropped from the training frame only.
# NOTE(review): presumably df_test has no 'overall' column — confirm.
df_train.drop(columns='overall', inplace=True)

In [5]:
# Preview the first three training rows after the column drop.
df_train.head(3)

Unnamed: 0,Place,status,clean_positives,clean_negatives,clean_reviews
0,startup_1,Current Employee,people smart friendly,bureaucracy slow thing,people smart friendly bureaucracy slow thing
1,startup_1,Former Employee,food food food cafe main campus mtv alone mini...,work/life balance balance perk benefit illusio...,food food food cafe main campus mtv alone mini...
2,startup_1,Current Employee,software engineer among king hill google engin...,become large come grow pain bureaucracy slow r...,software engineer among king hill google engin...


In [6]:
def SIAscores(df, analyzer=None):
    """Add sentiment columns for the cleaned review text columns, in place.

    For each of the text columns ``clean_positives``, ``clean_negatives`` and
    ``clean_reviews`` three columns are written to ``df`` (prefix is
    ``positive``, ``negative`` and ``review`` respectively):

    * ``<prefix>_sentiments`` - the dict returned by ``polarity_scores``
    * ``<prefix>_compound``   - the ``'compound'`` entry of that dict
    * ``<prefix>_score``      - a label derived from the compound value:
      ``>= 0.67`` "Very Happy", ``>= 0.33`` "Happy", ``>= -0.33`` "Neutral",
      ``>= -0.67`` "Quite Unhappy", anything else "Unhappy".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the three ``clean_*`` text columns. It is modified
        in place.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with a ``'compound'`` key. When omitted, a new
        ``SentimentIntensityAnalyzer`` is created, which matches the previous
        behaviour. Passing one lets several calls share a single analyzer.

    Returns
    -------
    None
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    def sentiment_score(score):
        # Thresholds are checked from highest to lowest, so each branch only
        # needs its lower bound. Values that fail every comparison
        # (including NaN) fall through to "Unhappy".
        if score >= 0.67:
            return "Very Happy"
        if score >= 0.33:
            return "Happy"
        if score >= -0.33:
            return "Neutral"
        if score >= -0.67:
            return "Quite Unhappy"
        return "Unhappy"

    # (output column prefix, source text column); this order fixes the order
    # in which the new columns are appended to the frame.
    targets = (
        ("positive", "clean_positives"),
        ("negative", "clean_negatives"),
        ("review", "clean_reviews"),
    )
    for prefix, text_col in targets:
        # str() is applied to every value, so non-string entries such as NaN
        # are scored as their string form rather than raising.
        sentiments = df[text_col].apply(lambda text: analyzer.polarity_scores(str(text)))
        df[prefix + "_sentiments"] = sentiments
        df[prefix + "_compound"] = [scores.get('compound') for scores in sentiments]
        df[prefix + "_score"] = df[prefix + "_compound"].apply(sentiment_score)

# Add the sentiment columns to both frames (SIAscores modifies each frame in place).
for frame in (df_train, df_test):
    SIAscores(frame)

In [7]:
# List the training frame's columns after the sentiment columns were added.
df_train.columns

Index(['Place', 'status', 'clean_positives', 'clean_negatives',
       'clean_reviews', 'positive_sentiments', 'positive_compound',
       'positive_score', 'negative_sentiments', 'negative_compound',
       'negative_score', 'review_sentiments', 'review_compound',
       'review_score'],
      dtype='object')

In [8]:
# Discard the raw polarity dicts and compound values from both frames,
# leaving the *_score label columns.
drop_col = [
    'positive_sentiments', 'positive_compound',
    'negative_sentiments', 'negative_compound',
    'review_sentiments', 'review_compound',
]
for frame in (df_train, df_test):
    frame.drop(columns=drop_col, inplace=True)

In [9]:
# Spot-check five random training rows.
# NOTE(review): no random_state is passed, so the rows shown change on every run.
df_train.sample(5)

Unnamed: 0,Place,status,clean_positives,clean_negatives,clean_reviews,positive_score,negative_score,review_score
27294,startup_6,Current Employee,great environment almost job/position/role/tec...,huge company many people lot competition beuro...,great environment almost job/position/role/tec...,Very Happy,Neutral,Very Happy
11112,startup_4,Former Employee,good salary interest technology problem space,middle management director senior manager typi...,good salary interest technology problem space ...,Happy,Unhappy,Quite Unhappy
15719,startup_5,Current Employee,treat like human well-fed corporate structure ...,base salary little comparable company young em...,treat like human well-fed corporate structure ...,Very Happy,Neutral,Very Happy
22701,startup_6,Current Employee,culture transparency respect diversity opportu...,come individual company,culture transparency respect diversity opportu...,Very Happy,Neutral,Very Happy
909,startup_1,Current Employee,pro good,downside work,pro good downside work,Happy,Neutral,Neutral


In [None]:
# Write both frames to CSV without the index column.
# NOTE(review): the test file name 'dataset/_test.csv' does not follow the
# train file's 'sentiment_analysis_' prefix — possibly truncated; confirm what
# downstream readers expect before renaming.
outputs = (
    (df_train, 'dataset/sentiment_analysis_train.csv'),
    (df_test, 'dataset/_test.csv'),
)
for frame, csv_path in outputs:
    frame.to_csv(csv_path, index=False)