## 1. Get the data

In [1]:
import numpy as np
import operator
import os
import pandas as pd
from collections import defaultdict
from nltk.tokenize import TweetTokenizer
from sklearn import cross_validation
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB

# Pre-process the data
training_data = []

# Set up tokenizer, bag of words, and make a defaultdict to hold a tuple of (tweets_text, [scores]) for each
# Loop through the files
tokenizer = TweetTokenizer(strip_handles=True)
tweets_and_scores = defaultdict(tuple)
#bag_of_words = []
word_dict = defaultdict(int)

# Grab the scores
scores = pd.DataFrame.from_csv('scraped_twitter_score.csv')
print(list(scores.ix["lordhamstr"]))

# Grab the tweets and scores
handles = list(scores.index)
for tweet_file in os.listdir('statuses'):
    if tweet_file.endswith('_output.txt'):
        # Get the handle
        beginning = 9
        ending = tweet_file.index('_output.txt')
        handle = tweet_file[beginning:ending]
        
        if handle in handles:
            # Put the text and scores in a dictionary
            with open('statuses/' + tweet_file, 'r', encoding='utf8') as tweet_text:
                score = list(scores.ix[handle])
                text = tokenizer.tokenize(tweet_text.read().replace('\nTWEETLINEBREAK\n', ' '))
                #bag_of_words += text
                for word in text:
                    word_dict[word] += 1
                tweets_and_scores[handle] = (text, score)
            
#bag_of_words = set(bag_of_words)
print(len(word_dict))
bag_of_words = {word:n for word,n in word_dict.items() if n in range(40,100)}.keys()
num_words = len(bag_of_words)
print(num_words)
num_observations = len(tweets_and_scores)

# From this, you have a dictionary called tweets_and_scores and a list called bag_of_words

# Convert tweets_texts into vector of words

# Make X matrix of size [len(trainingdata), bag_of_words_length]
X = np.zeros([num_observations, num_words + 1])

# Make Y matrix of size [len(trainingdata), 5] to hold personality scores
Y_array = [np.zeros([num_observations, 1]) for x in range(0,5)]

# Fill them
index = 0
for (key, (words, scores)) in tweets_and_scores.items():
    word_index = 0
    wordcount = 0
    wordlen = 0
    for word in bag_of_words:
        if word in words:
            X[index][word_index] = words.count(word)
            wordcount += 1
            wordlen += len(word)
        word_index += 1
    X[index][-1] = wordlen / wordcount
    
    
    score_index = 0
    for score in scores:
        if score > 50:
            Y_array[score_index][index] = 1
        else:
            Y_array[score_index][index] = 0
        score_index += 1
    
    print(index)
    index += 1

[14.0, 66.0, 2.0, 1.0, 89.0]
102893
1273
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37


## 2. Train the model
### This is for quickly testing different score thresholds

In [173]:
index = 0
for (key, (words, scores)) in tweets_and_scores.items():
    score_index = 0
    for score in scores:
        if score > 50:
            Y_array[score_index][index] = 1
        else:
            Y_array[score_index][index] = 0
        score_index += 1
    
    print(index)
    index += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37


In [2]:
from sklearn import metrics
from sklearn.cross_validation import KFold, cross_val_score, cross_val_predict
from sklearn.svm import SVR
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB

df_o = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_c = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_e = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_a = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_n = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])

dfs = [df_o, df_c, df_e, df_a, df_n]

n_folds = 4
k_fold = KFold(num_observations, n_folds=n_folds, shuffle=True, random_state=0)

index = 0
for Y in Y_array:
    mnb = GaussianNB()
    predictions = cross_val_predict(mnb, X, Y.ravel(), cv=4, n_jobs=1)
    dfs[index]["Gaussian"][0] = metrics.accuracy_score(Y, predictions)
    dfs[index]["Gaussian"][1] = metrics.precision_score(Y, predictions)
    dfs[index]["Gaussian"][2] = metrics.recall_score(Y, predictions)
    dfs[index]["Gaussian"][3] = metrics.f1_score(Y, predictions)
    
    mnb = MultinomialNB()
    predictions = cross_val_predict(mnb, X, Y.ravel(), cv=4, n_jobs=1)
    dfs[index]["MNB"][0] = metrics.accuracy_score(Y, predictions)
    dfs[index]["MNB"][1] = metrics.precision_score(Y, predictions)
    dfs[index]["MNB"][2] = metrics.recall_score(Y, predictions)
    dfs[index]["MNB"][3] = metrics.f1_score(Y, predictions)
    
    mnb = LogisticRegression()
    predictions = cross_val_predict(mnb, X, Y.ravel(), cv=4, n_jobs=1)
    dfs[index]["Bernoulli"][0] = metrics.accuracy_score(Y, predictions)
    dfs[index]["Bernoulli"][1] = metrics.precision_score(Y, predictions)
    dfs[index]["Bernoulli"][2] = metrics.recall_score(Y, predictions)
    dfs[index]["Bernoulli"][3] = metrics.f1_score(Y, predictions)
    index += 1
    
print("Folds: " + str(n_folds))
print()
for df in dfs:
    print(df)
    print()

Folds: 4

           Gaussian       MNB Bernoulli
Accuracy   0.736842  0.842105  0.763158
Precision  0.771429  0.848485  0.763158
Recall     0.931034  0.965517         1
F1          0.84375  0.903226  0.865672

           Gaussian       MNB Bernoulli
Accuracy   0.447368  0.605263  0.526316
Precision  0.521739   0.62963  0.558824
Recall     0.545455  0.772727  0.863636
F1         0.533333  0.693878  0.678571

           Gaussian       MNB Bernoulli
Accuracy        0.5  0.578947  0.368421
Precision       0.5  0.548387  0.407407
Recall     0.631579  0.894737  0.578947
F1          0.55814      0.68  0.478261

           Gaussian       MNB Bernoulli
Accuracy   0.789474  0.815789  0.763158
Precision  0.783784  0.823529  0.763158
Recall            1  0.965517         1
F1         0.878788  0.888889  0.865672

           Gaussian       MNB Bernoulli
Accuracy   0.657895  0.657895  0.631579
Precision  0.666667  0.666667  0.647059
Recall     0.631579  0.631579  0.578947
F1         0.648649  0.648

In [249]:
from sklearn.externals import joblib

index = 0
for Y in Y_array:
    mnb = MultinomialNB()
    mnb.fit(X, Y.ravel())
    coefs = mnb.coef_[0]

    bag = list(bag_of_words)

    word_coefs = []
    for i in range(len(bag)):
        word_coefs.append((bag[i], coefs[i]))

    word_coefs.sort(key=lambda tup: tup[1])

    predictions = mnb.predict(X)
    print(metrics.accuracy_score(Y, predictions))
    print(metrics.precision_score(Y, predictions))
    print(metrics.recall_score(Y, predictions))
    print(metrics.f1_score(Y, predictions))
    
    lookup = {"ocean".index(ch): ch for ch in "ocean"}
    print(lookup)

    # Use initial model counts for all but extroversion
    joblib.dump(mnb, 'initial_model_counts_' + lookup[index] + '.pkl')
    index += 1
    joblib.dump(list(bag_of_words), 'initial_model_counts' + str(index) + '_bag_of_words.pkl')

0.973684210526
0.966666666667
1.0
0.983050847458
{0: 'o', 1: 'c', 2: 'e', 3: 'a', 4: 'n'}
0.947368421053
0.916666666667
1.0
0.95652173913
{0: 'o', 1: 'c', 2: 'e', 3: 'a', 4: 'n'}
0.947368421053
0.904761904762
1.0
0.95
{0: 'o', 1: 'c', 2: 'e', 3: 'a', 4: 'n'}
0.947368421053
0.935483870968
1.0
0.966666666667
{0: 'o', 1: 'c', 2: 'e', 3: 'a', 4: 'n'}
1.0
1.0
1.0
1.0
{0: 'o', 1: 'c', 2: 'e', 3: 'a', 4: 'n'}


In [17]:
from sklearn.feature_selection import SelectFromModel

logit = LogisticRegression()
logit.fit(X, Y.ravel())
model = SelectFromModel(logit, prefit=True)

X_new = model.transform(X)

df_o = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_c = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_e = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_a = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_n = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])

dfs = [df_o, df_c, df_e, df_a, df_n]

index = 0
for Y in Y_array:
    mnb = GaussianNB()
    predictions = cross_val_predict(mnb, X_new, Y.ravel(), cv=4, n_jobs=1)
    dfs[index]["Gaussian"][0] = metrics.accuracy_score(Y, predictions)
    dfs[index]["Gaussian"][1] = metrics.precision_score(Y, predictions)
    dfs[index]["Gaussian"][2] = metrics.recall_score(Y, predictions)
    dfs[index]["Gaussian"][3] = metrics.f1_score(Y, predictions)
    
    mnb = MultinomialNB()
    predictions = cross_val_predict(mnb, X_new, Y.ravel(), cv=4, n_jobs=1)
    dfs[index]["MNB"][0] = metrics.accuracy_score(Y, predictions)
    dfs[index]["MNB"][1] = metrics.precision_score(Y, predictions)
    dfs[index]["MNB"][2] = metrics.recall_score(Y, predictions)
    dfs[index]["MNB"][3] = metrics.f1_score(Y, predictions)
    
    mnb = LogisticRegression()
    predictions = cross_val_predict(mnb, X_new, Y.ravel(), cv=4, n_jobs=1)
    dfs[index]["Bernoulli"][0] = metrics.accuracy_score(Y, predictions)
    dfs[index]["Bernoulli"][1] = metrics.precision_score(Y, predictions)
    dfs[index]["Bernoulli"][2] = metrics.recall_score(Y, predictions)
    dfs[index]["Bernoulli"][3] = metrics.f1_score(Y, predictions)
    index += 1
    
print("Folds: " + str(n_folds))
print()
for df in dfs:
    print(df)
    print()

Folds: 4

           Gaussian       MNB Bernoulli
Accuracy   0.736842       0.5  0.736842
Precision  0.756757  0.727273  0.787879
Recall     0.965517  0.551724  0.896552
F1         0.848485  0.627451   0.83871

           Gaussian       MNB Bernoulli
Accuracy   0.421053  0.578947  0.578947
Precision       0.5  0.636364       0.6
Recall     0.545455  0.636364  0.818182
F1         0.521739  0.636364  0.692308

           Gaussian       MNB Bernoulli
Accuracy   0.552632  0.578947  0.526316
Precision      0.55  0.578947   0.52381
Recall     0.578947  0.578947  0.578947
F1         0.564103  0.578947      0.55

           Gaussian       MNB Bernoulli
Accuracy   0.763158  0.631579  0.710526
Precision  0.763158  0.758621      0.75
Recall            1  0.758621  0.931034
F1         0.865672  0.758621  0.830769

           Gaussian       MNB Bernoulli
Accuracy   0.710526  0.763158  0.789474
Precision  0.681818    0.8125  0.789474
Recall     0.789474  0.684211  0.789474
F1         0.731707  0.742

### Using PCA to reduce dimensionality

In [46]:
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
orubt
pca = PCA(n_components=7)
pca.fit(X)
X_new = pca.transform(X)

df_o = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_c = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_e = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_a = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])
df_n = pd.DataFrame(columns=["Gaussian", "MNB", "Bernoulli"], index=["Accuracy", "Precision", "Recall", "F1"])

dfs = [df_o, df_c, df_e, df_a, df_n]

index = 0
for Y in Y_array:
    mnb = LinearSVC()
    predictions = cross_val_predict(mnb, X_new, Y.ravel(), cv=4, n_jobs=1)
    dfs[index]["Gaussian"][0] = metrics.accuracy_score(Y, predictions)
    dfs[index]["Gaussian"][1] = metrics.precision_score(Y, predictions)
    dfs[index]["Gaussian"][2] = metrics.recall_score(Y, predictions)
    dfs[index]["Gaussian"][3] = metrics.f1_score(Y, predictions)
    
    mnb = RandomForestClassifier()
    predictions = cross_val_predict(mnb, X_new, Y.ravel(), cv=4, n_jobs=1)
    dfs[index]["MNB"][0] = metrics.accuracy_score(Y, predictions)
    dfs[index]["MNB"][1] = metrics.precision_score(Y, predictions)
    dfs[index]["MNB"][2] = metrics.recall_score(Y, predictions)
    dfs[index]["MNB"][3] = metrics.f1_score(Y, predictions)
    
    mnb = LogisticRegression()
    predictions = cross_val_predict(mnb, X_new, Y.ravel(), cv=4, n_jobs=1)
    dfs[index]["Bernoulli"][0] = metrics.accuracy_score(Y, predictions)
    dfs[index]["Bernoulli"][1] = metrics.precision_score(Y, predictions)
    dfs[index]["Bernoulli"][2] = metrics.recall_score(Y, predictions)
    dfs[index]["Bernoulli"][3] = metrics.f1_score(Y, predictions)
    index += 1

for df in dfs:
    print(df)
    print()


(38, 1274)
(38, 7)
           Gaussian       MNB Bernoulli
Accuracy   0.736842  0.684211  0.815789
Precision  0.806452  0.774194   0.84375
Recall     0.862069  0.827586  0.931034
F1         0.833333       0.8  0.885246

           Gaussian       MNB Bernoulli
Accuracy   0.631579  0.526316  0.605263
Precision  0.681818  0.583333   0.62963
Recall     0.681818  0.636364  0.772727
F1         0.681818  0.608696  0.693878

           Gaussian       MNB Bernoulli
Accuracy   0.526316  0.473684  0.526316
Precision  0.526316  0.466667  0.526316
Recall     0.526316  0.368421  0.526316
F1         0.526316  0.411765  0.526316

           Gaussian       MNB Bernoulli
Accuracy   0.763158  0.815789  0.763158
Precision    0.8125  0.823529  0.794118
Recall     0.896552  0.965517  0.931034
F1         0.852459  0.888889  0.857143

           Gaussian       MNB Bernoulli
Accuracy   0.710526  0.578947  0.684211
Precision  0.681818  0.615385  0.684211
Recall     0.789474  0.421053  0.684211
F1         0.7317

In [47]:
from sklearn.externals import joblib

index = 0
lookup = {"ocean".index(ch): ch for ch in "ocean"}
for Y in Y_array:
    logit = LogisticRegression()
    logit.fit(X_new, Y.ravel())
    joblib.dump(logit, 'model/pca_' + lookup[index] + '.pkl')
    index += 1
    
    predictions = logit.predict(X_new)
    print(metrics.accuracy_score(Y, predictions))
    print(metrics.precision_score(Y, predictions))
    print(metrics.recall_score(Y, predictions))
    print(metrics.f1_score(Y, predictions))
    print()

joblib.dump(list(bag_of_words), 'model/pca_bag_of_words.pkl')
joblib.dump(pca, 'model/pca_pca.pkl')

0.842105263158
0.870967741935
0.931034482759
0.9

0.684210526316
0.678571428571
0.863636363636
0.76

0.789473684211
0.823529411765
0.736842105263
0.777777777778

0.868421052632
0.852941176471
1.0
0.920634920635

0.815789473684
0.8
0.842105263158
0.820512820513



['model/pca_pca.pkl',
 'model/pca_pca.pkl_01.npy',
 'model/pca_pca.pkl_02.npy',
 'model/pca_pca.pkl_03.npy',
 'model/pca_pca.pkl_04.npy']