In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# List the entries of the dataset directory to confirm which input files are present.
print(os.listdir("./dataset/"))

['sample_submission.csv', 'survey_questions.csv', 'test.csv', 'train.csv']


In [2]:
import warnings
warnings.filterwarnings('ignore')

from nltk.tokenize import sent_tokenize, word_tokenize, WhitespaceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# add sentiment analysis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
# Read the train, test and sample-submission CSVs into three DataFrames.
# The tuple order below matches the unpacking order on the left-hand side.
df_train, df_test, df_sample = map(
    pd.read_csv,
    (
        './dataset/train.csv',
        './dataset/test.csv',
        './dataset/sample_submission.csv',
    ),
)

In [4]:
# Print the per-column dtype and non-null count of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30336 entries, 0 to 30335
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              30336 non-null  int64  
 1   Place           30336 non-null  object 
 2   location        19082 non-null  object 
 3   date            30336 non-null  object 
 4   status          30336 non-null  object 
 5   job_title       30336 non-null  object 
 6   summary         30284 non-null  object 
 7   positives       30336 non-null  object 
 8   negatives       30336 non-null  object 
 9   advice_to_mgmt  17059 non-null  object 
 10  score_1         27150 non-null  float64
 11  score_2         24286 non-null  float64
 12  score_3         27167 non-null  float64
 13  score_4         27145 non-null  float64
 14  score_5         26851 non-null  float64
 15  score_6         30336 non-null  int64  
 16  overall         30336 non-null  float64
dtypes: float64(6), int64(2), object

In [5]:
# Preview five randomly drawn training rows. No random_state is passed,
# so the rows displayed differ from run to run.
df_train.sample(5)

Unnamed: 0,ID,Place,location,date,status,job_title,summary,positives,negatives,advice_to_mgmt,score_1,score_2,score_3,score_4,score_5,score_6,overall
23716,52982,startup_6,,"Aug 9, 2016",Former Employee,Anonymous Employee,Analyst,Management was always upfront and honest about...,Felt that there was a lot of micro managing fr...,Need to better take the time to understand the...,3.0,5.0,4.0,4.0,3.0,0,4.0
16442,36938,startup_2,,"Sep 24, 2018",Current Employee,Anonymous Employee,Senior Visual Designer,"Great Benefits, cool people, Flexible hours","Politics sometimes, but thats everywhere",,4.0,4.0,4.0,4.0,3.0,0,3.0
4753,10726,startup_4,"Romeoville, IL","Apr 23, 2018",Current Employee,Transportation,Great Warehouse Job,-They care for their associates -They have gre...,-Some managers are not great leaders,-Listen more and communicate better with your ...,5.0,5.0,5.0,5.0,4.0,0,4.0
27512,61459,startup_6,"Washington, DC","Sep 4, 2008",Current Employee,Senior Consultant,Geeks are being replaced by the,"If you happen to be deemed a ""rock star"" for a...",Extremely likely that the quality of your work...,"Not much, I'd actually fire most of them.",4.0,,4.0,4.5,1.5,2,3.0
11341,25558,startup_4,Chennai (India),"Nov 15, 2018",Current Employee,BOLD Operations Manager,Challenging yet inflexible,- Competitive culture - Good leeway to work yo...,- Erratic timings - Stock option takes too lon...,,2.0,4.0,4.0,4.0,3.0,0,4.0


In [6]:
# (rows, columns) of the train and test frames, displayed as a pair.
df_train.shape, df_test.shape

((30336, 17), (29272, 16))

In [7]:
# Count the missing values in each column of the training frame.
df_train.isnull().sum()

ID                    0
Place                 0
location          11254
date                  0
status                0
job_title             0
summary              52
positives             0
negatives             0
advice_to_mgmt    13277
score_1            3186
score_2            6050
score_3            3169
score_4            3191
score_5            3485
score_6               0
overall               0
dtype: int64

In [8]:
# Impute missing values in both frames.
#
# Numeric sub-scores are filled with the TRAIN median of the same column; the
# test frame reuses the train statistic rather than computing its own.
col = ['score_1', 'score_2', 'score_3', 'score_4', 'score_5']
for c in col:
    # Series.median() skips NaN by default, so no explicit dropna() is needed.
    # It is computed once, before filling, and shared by both frames.
    train_median = df_train[c].median()
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form acts on an intermediate
    # Series and leaves the frame unchanged under pandas Copy-on-Write. On
    # pandas 2.x it only emits a warning, which this notebook suppresses.
    df_train[c] = df_train[c].fillna(train_median)
    df_test[c] = df_test[c].fillna(train_median)

# Free-text columns: a missing entry becomes the empty string.
col1 = ['negatives', 'summary', 'advice_to_mgmt']
for c in col1:
    df_train[c] = df_train[c].fillna('')
    df_test[c] = df_test[c].fillna('')

In [9]:
# Remove the ID, location and date columns from both frames.
# The drop mutates each frame in place, so re-running this cell raises a
# KeyError because the columns are already gone.
drop_col = ['ID', 'location', 'date']
for frame in (df_train, df_test):
    frame.drop(columns=drop_col, inplace=True)

In [10]:
# Re-check the (rows, columns) of both frames after the column drop in the previous cell.
df_train.shape, df_test.shape

((30336, 14), (29272, 13))

In [11]:
# Number of training rows for each distinct value of the Place column.
df_train['Place'].value_counts()

startup_4    11758
startup_6     8172
startup_2     5863
startup_1     3468
startup_5      698
startup_3      377
Name: Place, dtype: int64

In [12]:
# Number of training rows for each distinct value of the status column.
df_train['status'].value_counts()

Current Employee     19087
Former Employee      11249
Name: status, dtype: int64

In [13]:
# Number of training rows for each distinct value of the overall column.
df_train['overall'].value_counts()

4.0    10688
3.0     9510
5.0     5975
2.0     3531
1.0      632
Name: overall, dtype: int64

In [14]:
# Ordinal-encode the categorical columns.
#
# For each column the encoder is fitted on the TRAIN frame only, and the test
# frame is transformed with that same fitted mapping. Re-fitting on the test
# frame would derive the codes from the test frame's own category set, so the
# same number could stand for different categories in train and test whenever
# the two sets differ.
#
# With the encoder's default settings, a test category never seen in train
# makes transform() raise a ValueError instead of silently receiving a
# mismatched code.
OEncoder = OrdinalEncoder()
for cat_col in ['Place', 'status']:
    # fit_transform/transform return an (n_rows, 1) array; ravel() flattens it
    # to one float value per row for column assignment.
    df_train[cat_col] = OEncoder.fit_transform(df_train[[cat_col]]).ravel()
    df_test[cat_col] = OEncoder.transform(df_test[[cat_col]]).ravel()

In [15]:
# For each frame: True if any column still contains at least one missing value.
df_train.isnull().sum().any(), df_test.isnull().sum().any()

(False, False)

In [16]:
# Preview five random training rows after the cleaning and encoding cells above.
# No random_state is passed, so the rows displayed differ from run to run.
df_train.sample(5)

Unnamed: 0,Place,status,job_title,summary,positives,negatives,advice_to_mgmt,score_1,score_2,score_3,score_4,score_5,score_6,overall
21670,1.0,1.0,Anonymous Employee,"Ops quality engineering , supplier quality","Can influence the product, supplier quality","Work life balance, stress, too many meetings",Stress,1.0,1.0,3.0,2.0,1.0,1,4.0
19989,1.0,1.0,Mac Genius,"Dynamic, fast-paced, but poorly managed.",Apple prepares their employees for success wit...,The way Apple cultivates a management team is ...,Realize the history between managers on a stor...,4.0,1.0,3.0,5.0,1.0,2,3.0
25914,5.0,1.0,Senior Program Manager,Some great people. Some below avg. Very much d...,"Leadership position (in some products), shapin...",The worse internal office politics one can exp...,Buy companies like hell and don't merge then b...,5.0,1.0,4.0,5.0,2.0,0,5.0
17349,1.0,0.0,Technical Specialist,Great place,The people are so helpful and unique! The tran...,There aren't too many cons for working with ap...,,5.0,5.0,5.0,5.0,5.0,1,5.0
13128,3.0,0.0,Operations Manager,Operations Manager,Very Dynamic role and truly customer obsessed....,Long working hours No formal training in the O...,Review working hours as the culture is to work...,2.0,4.0,5.0,5.0,4.0,1,3.0


# Positive and Negative

In [17]:
def Review_len(df):
    """Add length columns for the 'positives' and 'negatives' text columns.

    Writes two new columns onto ``df`` in place: ``len_pos`` holds the
    per-row string length of ``positives`` and ``len_neg`` the per-row
    string length of ``negatives``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'positives' and 'negatives' columns.

    Returns
    -------
    None
        The frame passed in is modified; nothing is returned.
    """
    length_columns = (('positives', 'len_pos'), ('negatives', 'len_neg'))
    for text_col, length_col in length_columns:
        df[length_col] = df[text_col].str.len()

In [18]:
# Add the len_pos / len_neg columns to both frames (each is modified in place).
for frame in (df_train, df_test):
    Review_len(frame)

In [20]:
# Preview three random training rows to inspect the new len_pos / len_neg columns.
# No random_state is passed, so the rows displayed differ from run to run.
df_train.sample(3)

Unnamed: 0,Place,status,job_title,summary,positives,negatives,advice_to_mgmt,score_1,score_2,score_3,score_4,score_5,score_6,overall,len_pos,len_neg
6963,3.0,0.0,"Senior Manager, Program Management",Great for many--not good for some,"Fast moving, growing, innovative company full ...",Demands can be brutal--hard to achieve work li...,Share the profits with the employees. Take car...,2.0,3.0,4.0,3.0,3.0,0,4.0,289,273
27359,5.0,0.0,Director,Social contract btwn employers and employees i...,work with smart people good benefits,"layoffs, politics, cronyism, poor facilities, ...",make sure you work in a core area like windows...,3.0,4.0,2.0,3.0,1.5,1,2.0,36,61
1899,0.0,1.0,Anonymous Employee,Great!,Good perks and learning environment,everything that the company provides is great,great mentors and managers,3.5,4.0,4.0,4.0,3.0,0,5.0,35,45
