# Navigation

Splitting the dataset into above-median and below-median groups for several personality traits and building BoW models separately on each of them.

In [16]:
import pandas as pd
import os as os
import json as js
import gzip
import numpy as np
import matplotlib.pyplot as plt
import pickle

#importing libraries for NLP
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import Counter

#Text Classification Material
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix

from sklearn import  linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MAHAM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Hierarchical Model

# Exploration

In [2]:
# Load the main DataFrame from a pickle file in the working directory.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('df_07_08', 'rb') as pickle_file:
    df = pickle.load(pickle_file)

In [6]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(18810, 68)

In [5]:
# Load a second pickled DataFrame and display its (rows, columns) shape.
# NOTE: the name `df2` is rebound further down by the random 50/50 split,
# so this frame is only available until that cell runs.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open('data_26_07_w_corpus', 'rb') as pickle_file:
    df2 = pickle.load(pickle_file)
df2.shape

(18810, 59)

In [7]:
# Median of each trait column; these are the thresholds for the splits below.
median_O, median_N, median_A = (
    df[trait].median() for trait in ("Open", "Neuro", "Agree")
)

# Rows whose score equals the median go to the "high" group (>=), so the two
# groups of a trait are not guaranteed to have the same number of rows.

# Openness: high (>= median) and low (< median)
df_H_O = df.loc[df["Open"] >= median_O]
df_L_O = df.loc[df["Open"] < median_O]

# Neuro (presumably neuroticism): high (>= median) and low (< median)
df_H_N = df.loc[df["Neuro"] >= median_N]
df_L_N = df.loc[df["Neuro"] < median_N]

# Agreeableness: high (>= median) and low (< median)
df_H_A = df.loc[df["Agree"] >= median_A]
df_L_A = df.loc[df["Agree"] < median_A]

In [8]:
# Display the three split thresholds (order: Agree, Neuro, Open).
median_A, median_N, median_O

(-0.21503994493254414, 0.3723573823917685, -0.13265988405009393)

In [9]:
import math

# Random 50/50 split, used as a baseline next to the median-based splits.
#
# The rows are shuffled first. A fixed random_state makes the shuffle -- and
# therefore df1/df2 and every number computed from them -- reproducible after
# Restart & Run All; without it each run produced a different split.
#
# NOTE: `df` is rebound to the shuffled frame (with a fresh 0..n-1 index), so
# re-running this cell without re-loading `df` shuffles an already shuffled
# frame.
# NOTE: `df2` replaces the frame of the same name that was loaded from
# 'data_26_07_w_corpus' earlier in the notebook.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
half = math.ceil(len(df) / 2)  # the first half gets the extra row if len(df) is odd
df1 = df.iloc[:half, :]
df2 = df.iloc[half:, :]

In [10]:
# Pair every subset with a printable label and report its (rows, columns)
# shape so the group sizes can be compared side by side.
dataframes_names = [
    "df1", "df2",
    "df_H_O", "df_L_O",
    "df_H_N", "df_L_N",
    "df_H_A", "df_L_A",
]
dataframes = [
    df1, df2,
    df_H_O, df_L_O,
    df_H_N, df_L_N,
    df_H_A, df_L_A,
]
for name, dfx in zip(dataframes_names, dataframes):
    print(name, dfx.shape)

df1 (9405, 68)
df2 (9405, 68)
df_H_O (9405, 68)
df_L_O (9405, 68)
df_H_N (12248, 68)
df_L_N (6562, 68)
df_H_A (14964, 68)
df_L_A (3846, 68)


# Generate Results 

In [12]:
# Subsets to evaluate, each paired (by position) with a label for printing:
#   df1, df2        -> the two halves of the random split
#   df_H_X / df_L_X -> rows at-or-above / below the median of trait X
#                      (O = Open, N = Neuro, A = Agree)
dataframes = [
    df1, df2,
    df_H_O, df_L_O,
    df_H_N, df_L_N,
    df_H_A, df_L_A,
]
dataframes_names = [
    "df1", "df2",
    "df_H_O", "df_L_O",
    "df_H_N", "df_L_N",
    "df_H_A", "df_L_A",
]

In [13]:
# For every subset: build bag-of-words features from the "corpus" column,
# then train/test a logistic regression on the "good" column 30 times (one
# train/test split per seed) and print the mean test accuracy.
for dfx, name in zip(dataframes, dataframes_names):
    # Bag-of-words counts over unigrams and bigrams, vocabulary capped at 500.
    # NOTE(review): the vectorizer and the TF-IDF weights are fitted on the
    # whole subset *before* the train/test split, so test rows influence the
    # vocabulary and the idf weights. Consider fitting them on the training
    # part only (e.g. via the already imported Pipeline).
    cv = CountVectorizer(ngram_range=(1, 2), max_features=500)
    XX = cv.fit_transform(dfx["corpus"])
    XX = XX.toarray()

    # Re-weight the raw counts with TF-IDF instead of using absolute counts.
    tfidf_transformer = TfidfTransformer()
    X = tfidf = tfidf_transformer.fit_transform(XX).toarray()
    y = dfx["good"]

    # One 75/25 split per seed; the per-split accuracies are averaged below.
    acc = []
    for seed in range(30):
        split = train_test_split(X, y, random_state=seed, test_size=0.25)
        X_train, X_test, y_train, y_test = split
        # Regularization is applied by default.
        log = LogisticRegression(solver='liblinear').fit(X_train, y_train)
        accuracy = log.score(X_test, y_test)
        acc.append(accuracy)

    # Report: subset name, mean accuracy over all seeds, number of rows.
    mean_accuracy = sum(acc) / len(acc)
    print(name, " ", mean_accuracy, len(dfx))

df1   0.8186366213151925 9405
df2   0.8144274376417233 9405
df_H_O   0.813421201814059 9405
df_L_O   0.818792517006803 9405
df_H_N   0.8175919878075331 12248
df_L_N   0.8214503351614868 6562
df_H_A   0.8185957408892452 14964
df_L_A   0.7974012474012474 3846
