## 1. Get the data

In [236]:
import numpy as np
import operator
import os
import pandas as pd
from collections import defaultdict
from nltk.tokenize import TweetTokenizer
from sklearn import cross_validation
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB

# Pre-process the data
training_data = []

# Set up tokenizer, bag of words, and make a defaultdict to hold a tuple of (tweets_text, [scores]) for each
# Loop through the files
tokenizer = TweetTokenizer(strip_handles=True)
tweets_and_scores = defaultdict(tuple)
#bag_of_words = []
word_dict = defaultdict(int)

# Grab the scores
scores = pd.DataFrame.from_csv('scraped_twitter_score.csv')
print(list(scores.ix["lordhamstr"]))

# Grab the tweets and scores
for tweet_file in os.listdir('statuses'):
    if tweet_file.endswith('_output.txt'):
        # Get the handle
        beginning = 9
        ending = tweet_file.index('_output.txt')
        handle = tweet_file[beginning:ending]
        
        # Put the text and scores in a dictionary
        with open('statuses/' + tweet_file, 'r', encoding='utf8') as tweet_text:
            score = list(scores.ix[handle])
            text = tokenizer.tokenize(tweet_text.read().replace('\nTWEETLINEBREAK\n', ' '))
            #bag_of_words += text
            for word in text:
                word_dict[word] += 1
            tweets_and_scores[handle] = (text, score)
            
#bag_of_words = set(bag_of_words)
print(len(word_dict))
bag_of_words = {word:n for word,n in word_dict.items() if n in range(40,100)}.keys()
num_words = len(bag_of_words)
print(num_words)
num_observations = len(tweets_and_scores)

# From this, you have a dictionary called tweets_and_scores and a list called bag_of_words

# Convert tweets_texts into vector of words

# Make X matrix of size [len(trainingdata), bag_of_words_length]
X = np.zeros([num_observations, num_words])

# Make Y matrix of size [len(trainingdata), 5] to hold personality scores
Y_array = [np.zeros([num_observations, 1]) for x in range(0,5)]

# Fill them
index = 0
for (key, (words, scores)) in tweets_and_scores.items():
    word_index = 0
    for word in bag_of_words:
        if word in words:
            X[index][word_index] = words.count(word)
        word_index += 1
    
    score_index = 0
    for score in scores:
        if score > 50:
            Y_array[score_index][index] = 1
        else:
            Y_array[score_index][index] = 0
        score_index += 1
    
    print(index)
    index += 1

[14.0, 66.0, 2.0, 1.0, 89.0]
102893
1273
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37


## 2. Train the model
### This is for quickly testing different score thresholds

In [173]:
index = 0
for (key, (words, scores)) in tweets_and_scores.items():
    score_index = 0
    for score in scores:
        if score > 50:
            Y_array[score_index][index] = 1
        else:
            Y_array[score_index][index] = 0
        score_index += 1
    
    print(index)
    index += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37


In [238]:
from sklearn import metrics
from sklearn.cross_validation import KFold, cross_val_score, cross_val_predict
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB

df_o = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_c = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_e = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_a = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_n = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])

dfs = [df_o, df_c, df_e, df_a, df_n]

n_folds = 4
k_fold = KFold(num_observations, n_folds=n_folds, shuffle=True, random_state=0)

index = 0
for Y in Y_array:
    mnb = GaussianNB()
    predictions = cross_val_predict(mnb, X, Y.ravel(), cv=k_fold, n_jobs=1)
    dfs[index]["Gaussian"][0] = metrics.accuracy_score(Y, predictions)
    dfs[index]["Gaussian"][1] = metrics.precision_score(Y, predictions)
    dfs[index]["Gaussian"][2] = metrics.recall_score(Y, predictions)
    dfs[index]["Gaussian"][3] = metrics.f1_score(Y, predictions)
    
    mnb = MultinomialNB()
    predictions = cross_val_predict(mnb, X, Y.ravel(), cv=k_fold, n_jobs=1)
    dfs[index]["MNB"][0] = metrics.accuracy_score(Y, predictions)
    dfs[index]["MNB"][1] = metrics.precision_score(Y, predictions)
    dfs[index]["MNB"][2] = metrics.recall_score(Y, predictions)
    dfs[index]["MNB"][3] = metrics.f1_score(Y, predictions)
    
    mnb = BernoulliNB()
    predictions = cross_val_predict(mnb, X, Y.ravel(), cv=k_fold, n_jobs=1)
    dfs[index]["Bernoulli"][0] = metrics.accuracy_score(Y, predictions)
    dfs[index]["Bernoulli"][1] = metrics.precision_score(Y, predictions)
    dfs[index]["Bernoulli"][2] = metrics.recall_score(Y, predictions)
    dfs[index]["Bernoulli"][3] = metrics.f1_score(Y, predictions)
    index += 1
    
print("Folds: " + str(n_folds))
print()
for df in dfs:
    print(df)
    print()

Folds: 4

           Gaussian       MNB Bernoulli
Accuracy   0.736842  0.815789  0.710526
Precision  0.756757  0.823529       0.8
Recall     0.965517  0.965517  0.827586
F1         0.848485  0.888889  0.813559

           Gaussian       MNB Bernoulli
Accuracy   0.421053  0.605263  0.526316
Precision       0.5      0.64  0.576923
Recall     0.545455  0.727273  0.681818
F1         0.521739  0.680851     0.625

           Gaussian       MNB Bernoulli
Accuracy        0.5  0.631579  0.657895
Precision       0.5  0.580645     0.625
Recall     0.526316  0.947368  0.789474
F1         0.512821      0.72  0.697674

           Gaussian       MNB Bernoulli
Accuracy   0.789474  0.789474  0.710526
Precision       0.8  0.818182  0.821429
Recall     0.965517  0.931034  0.793103
F1            0.875  0.870968  0.807018

           Gaussian       MNB Bernoulli
Accuracy        0.5  0.605263  0.421053
Precision       0.5  0.642857       0.4
Recall     0.473684  0.473684  0.315789
F1         0.486486  0.545

In [239]:
from sklearn.externals import joblib

mnb = MultinomialNB()
mnb.fit(X, Y.ravel())
coefs = mnb.coef_[0]

bag = list(bag_of_words)

word_coefs = []
for i in range(len(bag)):
    word_coefs.append((bag[i], coefs[i]))
    
word_coefs.sort(key=lambda tup: tup[1])

predictions = mnb.predict(X)
print(metrics.accuracy_score(Y, predictions))
print(metrics.precision_score(Y, predictions))
print(metrics.recall_score(Y, predictions))
print(metrics.f1_score(Y, predictions))

# Use initial model counts for all but extroversion
joblib.dump(mnb, 'initial_model_counts.pkl')

1.0
1.0
1.0
1.0


['initial_model_counts.pkl',
 'initial_model_counts.pkl_01.npy',
 'initial_model_counts.pkl_02.npy',
 'initial_model_counts.pkl_03.npy',
 'initial_model_counts.pkl_04.npy',
 'initial_model_counts.pkl_05.npy']