In [1]:
import os
import string
import numpy as np
import pandas as pd

# Show which files are present in the dataset directory.
print(os.listdir("./dataset/"))

['clean_test.csv', 'clean_train.csv', 'survey_questions.csv', 'test.csv', 'train.csv']


In [2]:
import warnings
# Suppress all warnings raised after this point.
warnings.filterwarnings('ignore')

# VADER sentiment analyzer, used below to add sentiment analysis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [3]:
# Load the pre-cleaned train and test splits (train first, then test).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/clean_train.csv', './dataset/clean_test.csv')
)

In [4]:
# List the columns available in the cleaned training data.
df_train.columns

Index(['Place', 'status', 'job_title', 'summary', 'positives', 'negatives',
       'advice_to_mgmt', 'score_1', 'score_2', 'score_3', 'score_4', 'score_5',
       'score_6', 'overall', 'len_pos', 'len_neg', 'num_words_pos',
       'num_words_neg', 'clean_positives', 'clean_negatives'],
      dtype='object')

In [5]:
# Columns that are not used for the sentiment features built below.
drop_col = ['Place', 'status', 'job_title', 'summary', 'positives', 'negatives', 'advice_to_mgmt',
            'score_1', 'score_2', 'score_3', 'score_4', 'score_5','score_6', 
            'len_pos', 'len_neg', 'num_words_pos', 'num_words_neg']

# Rebind the result instead of using inplace=True: the drop returns a new
# frame, so nothing is mutated behind the reader's back.
# 'overall' is dropped from the training frame only; the test frame is not
# asked to drop it.
df_train = df_train.drop(columns=drop_col + ['overall'])
df_test = df_test.drop(columns=drop_col)

In [6]:
# Shapes of both frames after the column drops above.
df_train.shape, df_test.shape

((30336, 2), (29272, 2))

In [7]:
def SIAscores(df, analyzer=None):
    """Add sentiment scores and labels for the cleaned review text, in place.

    For each of the ``clean_positives`` and ``clean_negatives`` columns,
    three columns are added to ``df`` (``<prefix>`` is ``positive`` or
    ``negative`` respectively):

    * ``<prefix>_sentiments`` -- the dict returned by ``polarity_scores``
    * ``<prefix>_score``      -- that dict's ``'compound'`` entry
    * ``<prefix>_label``      -- the score bucketed into one of five labels

    Label bands (lower bound inclusive):

    * ``score >= 0.67``           -> "Very Happy"
    * ``0.33 <= score < 0.67``    -> "Happy"
    * ``-0.33 <= score < 0.33``   -> "Neutral"
    * ``-0.67 <= score < -0.33``  -> "Quite Unhappy"
    * anything else               -> "Unhappy"

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``clean_positives`` and ``clean_negatives``.
        It is modified in place.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a dict
        with a ``'compound'`` key. When omitted, a new
        ``SentimentIntensityAnalyzer`` is created, which is the original
        behaviour. Passing one in lets several frames share an analyzer.

    Returns
    -------
    None
    """
    SIA = SentimentIntensityAnalyzer() if analyzer is None else analyzer

    def sentiment_score(score):
        """Map a compound score to its sentiment label."""
        # Thresholds are checked from highest to lowest, so each branch only
        # needs its lower bound; the upper bound is implied by the branches
        # that already failed.
        if score >= 0.67:
            return "Very Happy"
        if score >= 0.33:
            return "Happy"
        if score >= -0.33:
            return "Neutral"
        if score >= -0.67:
            return "Quite Unhappy"
        return "Unhappy"

    # Same processing for both text columns; column creation order matches
    # positive_* first, then negative_*.
    for prefix, text_col in (("positive", "clean_positives"),
                             ("negative", "clean_negatives")):
        # str() means any non-string cell is scored as its string form.
        sentiments = df[text_col].apply(lambda text: SIA.polarity_scores(str(text)))
        df[prefix + "_sentiments"] = sentiments
        df[prefix + "_score"] = [d.get('compound') for d in sentiments]
        df[prefix + "_label"] = df[prefix + "_score"].apply(sentiment_score)

# Score both splits; SIAscores adds its columns to each frame in place.
for frame in (df_train, df_test):
    SIAscores(frame)

In [8]:
# The raw polarity dicts were only used to extract the compound score, so
# drop them. Rebinding the result avoids inplace=True mutation.
drop_col = ['positive_sentiments', 'negative_sentiments']
df_train = df_train.drop(columns=drop_col)
df_test = df_test.drop(columns=drop_col)

In [9]:
# Preview the first three training rows with the new score and label columns.
df_train.head(3)

Unnamed: 0,clean_positives,clean_negatives,positive_score,positive_label,negative_score,negative_label
0,people smart friendly,bureaucracy slow thing,0.7096,Very Happy,0.0,Neutral
1,food food food cafe main campus mtv alone mini...,work/life balance balance perk benefit illusio...,0.9806,Very Happy,0.0054,Neutral
2,software engineer among king hill google engin...,become large come grow pain bureaucracy slow r...,0.9971,Very Happy,-0.9325,Unhappy


# Combining Columns

In [10]:
def combining_columns(df, x, y, name):
    """Stack column ``x`` on top of column ``y`` as one renumbered Series.

    The values of ``df[x]`` come first, followed by the values of ``df[y]``.
    The original index is discarded in favour of a fresh 0..n-1 index, and
    the resulting Series is given the name ``name``.
    """
    stacked = pd.concat([df[col] for col in (x, y)], ignore_index=True)
    return pd.Series(stacked, name=name)

# (positive-side column, negative-side column, name of the stacked series)
column_triples = [
    ('clean_positives', 'clean_negatives', 'features'),
    ('positive_score', 'negative_score', 'scores'),
    ('positive_label', 'negative_label', 'labels'),
]

# Train split first, then test split, each in features/scores/labels order.
features_train, scores_train, labels_train = (
    combining_columns(df_train, pos, neg, name) for pos, neg, name in column_triples
)
features_test, scores_test, labels_test = (
    combining_columns(df_test, pos, neg, name) for pos, neg, name in column_triples
)

In [11]:
# Place the three stacked series side by side as the columns of one frame per split.
df_train = pd.concat((features_train, scores_train, labels_train), axis='columns')
df_test = pd.concat((features_test, scores_test, labels_test), axis='columns')

In [12]:
# Shapes after stacking the positive and negative columns on top of each other.
df_train.shape, df_test.shape

((60672, 3), (58544, 3))

In [13]:
# Random 3-row preview; no random_state is given, so the rows shown differ between runs.
df_train.sample(3)

Unnamed: 0,features,scores,labels
54636,generic problem relate huge organization need ...,-0.0772,Neutral
9544,great fun although work extremely hard however...,0.9101,Very Happy
33760,perk pretty good salary way par could earn twi...,0.8689,Very Happy


In [14]:
# Random 3-row preview; no random_state is given, so the rows shown differ between runs.
df_test.sample(3)

Unnamed: 0,features,scores,labels
56854,work/life balance exactly lack stress constant...,0.7269,Very Happy
24306,lot cool opportunity get engage truly impactfu...,0.9682,Very Happy
7697,good benefit package adequate facility physica...,0.7783,Very Happy


In [15]:
# Shuffle all rows (frac=1 samples every row) and renumber the index.
# A fixed seed keeps the shuffled order, and therefore the CSVs exported
# below, identical across runs of the notebook.
SHUFFLE_SEED = 42
df_train = df_train.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [16]:
# Write both splits to disk without the index column.
for frame, out_path in (
    (df_train, 'dataset/sentiment_analysis_train.csv'),
    (df_test, 'dataset/sentiment_analysis_test.csv'),
):
    frame.to_csv(out_path, index=False)