## Import Data and Tools

In [1]:
import pandas as pd
Ecsv=pd.read_csv('Edata.csv')  
Icsv=pd.read_csv('Idata.csv') 
EIcsv=pd.concat([Ecsv,Icsv])
rowInd=pd.Series(range(len(EIcsv)))
EIcsv=EIcsv.set_index(rowInd)
EIcsv=EIcsv.drop(['Unnamed: 0'],axis=1) #raw comments with labels as a dataframe
print('# of introvert comments are',Icsv.shape[0])
print('# of extrovert comments are',Ecsv.shape[0])

# of introvert comments are 3602
# of extrovert comments are 2614


In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
import gensim
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk import word_tokenize
from gensim.models import word2vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils import shuffle

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
## initialize sklearn models with balanced weight(unbalanced dataset)
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression(class_weight='balanced')
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
from sklearn.ensemble import RandomForestClassifier
randfor= RandomForestClassifier(class_weight='balanced')
from sklearn.svm import SVC
svm=SVC(kernel='linear',class_weight='balanced')

In [4]:
stop = stopwords.words('english') ##initialize stopwords to remove
stopwords= r'\b(?:{})\b'.format('|'.join(stop))

def FeatVect(model, input_docs):
    sentences = input_docs
    feature_vectors = [model.infer_vector(doc.words, steps=20) for doc in sentences]
    return feature_vectors

In [5]:
##initialize parameter grids for different models for cv
from scipy.stats import uniform
logreg_grid=dict(C=uniform(loc=0, scale=4),penalty=['l2', 'l1'],solver=['liblinear','saga'])
neigh_grid = {'n_neighbors': np.arange(1, 8), 'weights': ['uniform','distance']}
svm_grid=dict(C=[.01,4],gamma=[.01,1]) #randomizedsearch samples uniformly 


## Modify Data via Doc2Vec

In [6]:
EIcsv['text']=EIcsv.loc[:,'text'].str.replace(stopwords, '') #remove stop words
EIcsv['text']=EIcsv.loc[:,'text'].str.replace(r'[^\w\s]', '') #removed punctuation
EIcsv_tuple=[tuple(x) for x in EIcsv.to_records(index=False)] #turn into tuples
EIcsv_tagged=[TaggedDocument(words=word_tokenize(text.lower()), tags=[str(label)]) #creating tagged-document
    for label,text in EIcsv_tuple]
docvec = Doc2Vec(dm=1, vector_size=100, hs=0, min_count=2, sample = 0, alpha=0.025, min_alpha=0.001) #dimensions=100
docvec.build_vocab(EIcsv_tagged) 
docvec.train(EIcsv_tagged,total_examples=len(EIcsv_tagged), epochs=15) #training docvec model
EIdata=FeatVect(docvec,EIcsv_tagged) ##getting feature vectors now
EIlabels=EIcsv.iloc[:,0] #turn back into dataframe with labels so we can train,test,split
EIdf=pd.DataFrame(EIdata)
EIdf['label']=EIlabels



## Iterate over Training Classifiers

In [7]:
trials=[1,2,3]
splits=[0.8,0.5,0.2]
models=[logreg,neigh,svm]
test_scores=[]
valid_scores=[]
train_scores=[]
for model in models:
    if model is logreg:
        param_grid=logreg_grid
    elif model is neigh:
        param_grid=neigh_grid
    else:
        param_grid=svm_grid
    for split in splits:
        for trial in trials:
            shuffled_data=shuffle(EIdf, random_state=trial)
            train, test= train_test_split(shuffled_data,test_size=split,random_state=trial)
            test_labels=test.iloc[:,-1]  
            train_labels=train.iloc[:,-1]
            train_vect=train.iloc[:,:-1]
            test_vect=test.iloc[:,:-1]
            trainMod=model.fit(train_vect,train_labels) #train model without cv
            TrainScore=trainMod.score(train_vect,train_labels)
            train_scores.append(TrainScore)
            RndSrch = RandomizedSearchCV(model, param_grid, n_iter=14, cv=5,n_jobs=6) #cross-validate
            tuned=RndSrch.fit(train_vect,train_labels)
            valid_scores.append(max(tuned.cv_results_['mean_test_score']))
            BestParams =tuned.best_params_
            TrainTuned=model.set_params(**BestParams)  #train model using tuned parameters from cv
            TrainTuned.fit(train_vect,train_labels)
            pred_labels=TrainTuned.predict(test_vect)  #test model using tuned parameters 
            test_scores.append(accuracy_score(test_labels,pred_labels))



## Compare Scores

In [10]:
## 3 models, 3 partitions, 3 trials is 27 scores per type of score
#scores 0-8: logreg, scores 9-17: neigh, scores 18-26: svm
from statistics import mean
TrAvgs=[mean(train_scores[0:8]),mean(train_scores[9:17]),mean(train_scores[18:26])]
ValAvgs=[mean(valid_scores[0:8]), mean(valid_scores[9:17]),mean(valid_scores[18:26])] 
TestAvgs=[mean(test_scores[0:8]),mean(test_scores[9:17]),mean(test_scores[18:26])]

averages=pd.DataFrame(list(zip(TrAvgs,ValAvgs,TestAvgs)),index=['LogReg','KNN','SVM'],columns=['Train','Validation','Test'])
print(averages)
print('Best model for train dataset is',list(averages.index[averages.Train==max(averages.Train)]))
print('Best model for validation dataset is',list(averages.index[averages.Validation==max(averages.Validation)]))
print('Best model for test dataset is',list(averages.index[averages.Test==max(averages.Test)]))

           Train  Validation      Test
LogReg  0.833221    0.828691  0.833130
KNN     0.849302    0.758281  0.756420
SVM     0.832547    0.824784  0.830814
Best model for train dataset is ['KNN']
Best model for validation dataset is ['LogReg']
Best model for test dataset is ['LogReg']


## Random Forest for Fun

In [14]:
from sklearn.model_selection import RandomizedSearchCV
train, test= train_test_split(EIdf,test_size=0.2,random_state=1) #train,test split
test_labels=test.iloc[:,-1]  
train_labels=train.iloc[:,-1]
train_vect=train.iloc[:,:-1]
test_vect=test.iloc[:,:-1]

param_grid = {
    'bootstrap': [True,False],
    'max_depth': [10,50,100,150,200],
    'n_estimators': [100, 200, 500, 1000]}


trainMod=randfor.fit(train_vect,train_labels)
train_score=trainMod.score(train_vect,train_labels)
RndSrch = RandomizedSearchCV(randfor, param_grid,cv = 5, n_jobs=6)
tuned=RndSrch.fit(train_vect,train_labels)
val_score=max(tuned.cv_results_['mean_test_score'])
BestParams=tuned.best_params_
TrainTuned=randfor.set_params(**BestParams)
TrainTuned.fit(train_vect,train_labels)
pred_labels=TrainTuned.predict(test_vect)

print('Training score for RandFor is:',train_score)
print('Best Validation score for RandFor is', val_score)
test_score=accuracy_score(pred_labels,test_labels)
print('Tuned test score for RandFor is:',test_score)

Training score for RandFor is: 0.999597747385358
Best Validation score for RandFor is 0.8131536604987932
Tuned test score for RandFor is: 0.8279742765273312
