In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import string

# Show which files are present in the dataset directory.
dataset_files = os.listdir("./dataset/")
print(dataset_files)

['clean_test.csv', 'clean_train.csv', 'sample_submission.csv', 'survey_questions.csv', 'test.csv', 'train.csv']


In [2]:
import warnings
# Install a process-wide filter that silences every warning category.
warnings.filterwarnings(action="ignore")

from nltk.tokenize import sent_tokenize, word_tokenize, WhitespaceTokenizer
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import RegexpTokenizer
from nltk import pos_tag

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from wordcloud import WordCloud

# add sentiment analysis columns
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# create doc2vec vector columns
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
# Read the training set, the test set and the sample submission from disk.
df_train, df_test, df_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/train.csv',
        './dataset/test.csv',
        './dataset/sample_submission.csv',
    )
)

In [4]:
# Summarise df_train: column names, non-null counts and dtypes.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30336 entries, 0 to 30335
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ID              30336 non-null  int64  
 1   Place           30336 non-null  object 
 2   location        19082 non-null  object 
 3   date            30336 non-null  object 
 4   status          30336 non-null  object 
 5   job_title       30336 non-null  object 
 6   summary         30284 non-null  object 
 7   positives       30336 non-null  object 
 8   negatives       30336 non-null  object 
 9   advice_to_mgmt  17059 non-null  object 
 10  score_1         27150 non-null  float64
 11  score_2         24286 non-null  float64
 12  score_3         27167 non-null  float64
 13  score_4         27145 non-null  float64
 14  score_5         26851 non-null  float64
 15  score_6         30336 non-null  int64  
 16  overall         30336 non-null  float64
dtypes: float64(6), int64(2), object(9)

In [5]:
# Peek at five randomly chosen training rows. No random_state is passed,
# so this cell does not pin which rows are shown.
df_train.sample(n=5)

Unnamed: 0,ID,Place,location,date,status,job_title,summary,positives,negatives,advice_to_mgmt,score_1,score_2,score_3,score_4,score_5,score_6,overall
24193,54033,startup_6,"Redmond, WA","Jan 8, 2016",Current Employee,Senior Software Development Engineer,Great Place to Work,A very high caliber group of people to work wi...,Sometimes the work load can effect the work / ...,,4.0,5.0,5.0,5.0,4.0,0,5.0
2304,5199,startup_1,,"Jul 9, 2018",Former Employee,Anonymous Employee,google review,very very very very good,"actually nothing quiet bad, thats ok",,,,,,,0,4.0
2917,6574,startup_1,,"Dec 7, 2015",Former Employee,Anonymous Employee,Best place to work in the world,"good working environment, nice peers and super...",Good place to develop your self.,,5.0,5.0,5.0,5.0,5.0,0,3.0
10878,24493,startup_4,"Seattle, WA","Dec 20, 2012",Former Employee,Software Development Engineer,sad place to be,learn a lot of advanced distributed system tec...,"pager duty, heavy work load, low compensation",,1.0,2.0,1.0,1.0,1.0,7,2.0
25559,57176,startup_6,,"Jul 23, 2014",Former Employee,Anonymous Employee,"Good place to work, but beware","Good salary/benefits, hardworking, intelligent...","politics, bureaucracy, test not respected or h...",,5.0,4.0,3.0,5.0,3.0,0,5.0


In [6]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (df_train, df_test))

((30336, 17), (29272, 16))

In [7]:
# Count the missing values in each training column.
df_train.isna().sum()

ID                    0
Place                 0
location          11254
date                  0
status                0
job_title             0
summary              52
positives             0
negatives             0
advice_to_mgmt    13277
score_1            3186
score_2            6050
score_3            3169
score_4            3191
score_5            3485
score_6               0
overall               0
dtype: int64

In [8]:
# Impute missing values in both frames.
#
# Each filled column is assigned back to its frame instead of calling
# ``fillna(..., inplace=True)`` on a column selected from the frame. The latter
# is chained assignment through an in-place method: recent pandas 2.x releases
# emit a FutureWarning for it (hidden in this notebook by the blanket
# ``warnings.filterwarnings('ignore')``), and with Copy-on-Write enabled it
# only updates a temporary object and leaves the frame unchanged.
col = ['score_1', 'score_2', 'score_3', 'score_4', 'score_5']
for c in col:
    # median() skips NaN by default. Compute it once from the training data and
    # reuse the same value for the test data so both frames share one statistic.
    train_median = df_train[c].median()
    df_train[c] = df_train[c].fillna(train_median)
    df_test[c] = df_test[c].fillna(train_median)

# Text columns: a missing entry becomes an empty string.
col1 = ['negatives', 'summary', 'advice_to_mgmt']
for c in col1:
    df_train[c] = df_train[c].fillna('')
    df_test[c] = df_test[c].fillna('')

In [9]:
# Remove the columns listed in drop_col from both frames.
drop_col = ['ID', 'location', 'date']
df_train = df_train.drop(columns=drop_col)
df_test = df_test.drop(columns=drop_col)

In [10]:
# Shapes of both frames after the column drop: (train, test).
(df_train.shape, df_test.shape)

((30336, 14), (29272, 13))