## 1. Get the data

In [4]:
import numpy as np
import operator
import os
import pandas as pd
from collections import defaultdict
from nltk.tokenize import TweetTokenizer
from sklearn import cross_validation
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB

# Pre-process the data
training_data = []

# Set up tokenizer, bag of words, and make a defaultdict to hold a tuple of (tweets_text, [scores]) for each
# Loop through the files
tokenizer = TweetTokenizer(strip_handles=True)
tweets_and_scores = defaultdict(tuple)
#bag_of_words = []
word_dict = defaultdict(int)

# Grab the scores
scores = pd.DataFrame.from_csv('scraped_twitter_score.csv')
print(list(scores.ix["lordhamstr"]))

handles_with_scores = list(scores.index)
# Grab the tweets and scores
for tweet_file in os.listdir('statuses'):
    if tweet_file.endswith('_output.txt'):
        # Get the handle
        beginning = 9
        ending = tweet_file.index('_output.txt')
        handle = tweet_file[beginning:ending]
        
        if handle in handles_with_scores:
            # Put the text and scores in a dictionary
            with open('statuses/' + tweet_file, 'r', encoding='utf8') as tweet_text:
                score = list(scores.ix[handle])
                text = tokenizer.tokenize(tweet_text.read().replace('\nTWEETLINEBREAK\n', ' '))
                #bag_of_words += text
                #for word in text:
                #    word_dict[word] += 1
                tweets_and_scores[handle] = (text, score)
            
#bag_of_words = set(bag_of_words)
print(len(word_dict))
#bag_of_words = {word:n for word,n in word_dict.items() if n in range(40,100)}.keys()
#bag_of_words = list({word:n for word,n in word_dict.items() if n > 2}.keys())
bag_of_words = ['interview', 'I', 'video', '😊', 'lol', 'Boughton', '(', 'na', 'beer', 'and', '🐍', ':', 'Singapore', 'fucking', 'on', '-', '?', '#songoftheday', 'of', 'a', "I'm", 'you', 'that', '"', '/', 'i', "'", ')', 'single', 'me', ';', 'for', 'The', '’', '*', 'my', '#seizulogic', 'like', 'am', 'Facebook', 'o', '❤', 'ProKabaddi', '#seizuBOTty', 'is', '.', '🔥', ',', '..', 'LOL', 'your', '...', '!']
num_words = len(bag_of_words)
print(num_words)
num_observations = len(tweets_and_scores)

# From this, you have a dictionary called tweets_and_scores and a list called bag_of_words

# Convert tweets_texts into vector of words

# Make X matrix of size [len(trainingdata), bag_of_words_length]
X = np.zeros([num_observations, num_words + 1])

# Make Y matrix of size [len(trainingdata), 5] to hold personality scores
Y_array = [np.zeros([num_observations, 1]) for x in range(0,5)]

word_positions = defaultdict(int)
index = 0
for word in bag_of_words:
    word_positions[word] = index
    index += 1

# Fill them
index = 0
for (key, (words, scores)) in tweets_and_scores.items():
    word_index = 0
    word_length = 0
    for word in words:
        X[index][word_positions[word]] += 1
        word_length += len(word)
        
    X[index][-1] = word_length / len(words)
    
    score_index = 0
    for score in scores:
        Y_array[score_index][index] = score
        score_index += 1
    
    print(index)
    index += 1

[14.0, 66.0, 2.0, 1.0, 89.0]
0
53
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37


## 2. Train the model
### This is for quickly testing different score thresholds

In [173]:
index = 0
for (key, (words, scores)) in tweets_and_scores.items():
    score_index = 0
    for score in scores:
        if score > 50:
            Y_array[score_index][index] = 1
        else:
            Y_array[score_index][index] = 0
        score_index += 1
    
    print(index)
    index += 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37


In [18]:
from sklearn import metrics
from sklearn.cross_validation import KFold, cross_val_score, cross_val_predict
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.naive_bayes import BernoulliNB, GaussianNB
import matplotlib.pyplot as plt


df_o = pd.DataFrame(columns=["SVR", "OLS", "Lasso"], index=["MAE", "MSQE", "MedAE", "R2"])
df_c = pd.DataFrame(columns=["SVR", "OLS", "Lasso"], index=["MAE", "MSQE", "MedAE", "R2"])
df_e = pd.DataFrame(columns=["SVR", "OLS", "Lasso"], index=["MAE", "MSQE", "MedAE", "R2"])
df_a = pd.DataFrame(columns=["SVR", "OLS", "Lasso"], index=["MAE", "MSQE", "MedAE", "R2"])
df_n = pd.DataFrame(columns=["SVR", "OLS", "Lasso"], index=["MAE", "MSQE", "MedAE", "R2"])

dfs = [df_o, df_c, df_e, df_a, df_n]

n_folds = 4
k_fold = KFold(num_observations, n_folds=n_folds, shuffle=True, random_state=0)

if False:

    index = 0
    for Y in Y_array:
        mnb = SVR()
        predictions = cross_val_predict(mnb, X, Y.ravel(), cv=4)
        dfs[index]["SVR"][0] = metrics.mean_absolute_error(Y, predictions)
        dfs[index]["SVR"][1] = metrics.mean_squared_error(Y, predictions)
        dfs[index]["SVR"][2] = metrics.median_absolute_error(Y, predictions)
        dfs[index]["SVR"][3] = metrics.r2_score(Y, predictions)

        mnb = LinearRegression()
        predictions = cross_val_predict(mnb, X, Y.ravel(), cv=4)
        dfs[index]["OLS"][0] = metrics.mean_absolute_error(Y, predictions)
        dfs[index]["OLS"][1] = metrics.mean_squared_error(Y, predictions)
        dfs[index]["OLS"][2] = metrics.median_absolute_error(Y, predictions)
        dfs[index]["OLS"][3] = metrics.r2_score(Y, predictions)

        y = Y
        fig, ax = plt.subplots()
        ax.scatter(y, predictions)
        ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
        ax.set_xlabel('Measured')
        ax.set_ylabel('Predicted')
        plt.show()

        #mnb = Lasso(max_iter=10000)
        #predictions = cross_val_predict(mnb, X, Y.ravel(), cv=4)
        dfs[index]["Lasso"][0] = metrics.mean_absolute_error(Y, predictions)
        dfs[index]["Lasso"][1] = metrics.mean_squared_error(Y, predictions)
        dfs[index]["Lasso"][2] = metrics.median_absolute_error(Y, predictions)
        dfs[index]["Lasso"][3] = metrics.r2_score(Y, predictions)
        index += 1


        svr = SVR()
        ols = LinearRegression()
        lasso = Lasso(max_iter=5000)

        print(X.shape)
        print()

    index = 0
    for Y in Y_array:
        #lasso.fit(X,Y.ravel())
        #model = SelectFromModel(lasso, prefit=True)
        #X_new = model.transform(X)

        #lasso.fit(X, Y.ravel())
        svr.fit(X,Y.ravel())
        ols.fit(X,Y.ravel())
        predict_svr = svr.predict(X)
        predict_ols = ols.predict(X)
        #predict_lasso = lasso.predict(X)


        dfs[index]["SVR"][0] = metrics.mean_absolute_error(Y, predict_svr)
        dfs[index]["SVR"][1] = metrics.mean_squared_error(Y, predict_svr)
        dfs[index]["SVR"][2] = metrics.median_absolute_error(Y, predict_svr)
        dfs[index]["SVR"][3] = metrics.r2_score(Y, predict_svr)

        dfs[index]["OLS"][0] = metrics.mean_absolute_error(Y, predict_ols)
        dfs[index]["OLS"][1] = metrics.mean_squared_error(Y, predict_ols)
        dfs[index]["OLS"][2] = metrics.median_absolute_error(Y, predict_ols)
        dfs[index]["OLS"][3] = metrics.r2_score(Y, predict_ols)

        dfs[index]["Lasso"][0] = metrics.mean_absolute_error(Y, predict_lasso)
        dfs[index]["Lasso"][1] = metrics.mean_squared_error(Y, predict_lasso)
        dfs[index]["Lasso"][2] = metrics.median_absolute_error(Y, predict_lasso)
        dfs[index]["Lasso"][3] = metrics.r2_score(Y, predict_lasso)

        index += 1

    #print(list(map(lambda x: bag_of_words[x], model.get_support(indices=True))))

index = 0
Y_array_new = [np.zeros([num_observations, 1]) for x in range(0,5)]
for i in range(len(Y_array_new)):
    Y_array_new[i] = [y > 50 for y in Y_array[i]]
    
    mnb = LinearRegression()
    predictions = cross_val_predict(mnb, X, Y_array[i].ravel(), cv=4)
    prediction_new = [pred > 50 for pred in predictions]
    dfs[index]["SVR"][0] = metrics.accuracy_score(Y_array_new[i], prediction_new)
    dfs[index]["SVR"][1] = metrics.precision_score(Y_array_new[i], prediction_new)
    dfs[index]["SVR"][2] = metrics.recall_score(Y_array_new[i], prediction_new)
    dfs[index]["SVR"][3] = metrics.f1_score(Y_array_new[i], prediction_new)

    index += 1
    

print()
for df in dfs:
    print(df)
    print()


            SVR  OLS Lasso
MAE    0.605263  NaN   NaN
MSQE       0.75  NaN   NaN
MedAE  0.724138  NaN   NaN
R2     0.736842  NaN   NaN

            SVR  OLS Lasso
MAE    0.447368  NaN   NaN
MSQE   0.521739  NaN   NaN
MedAE  0.545455  NaN   NaN
R2     0.533333  NaN   NaN

            SVR  OLS Lasso
MAE    0.605263  NaN   NaN
MSQE   0.590909  NaN   NaN
MedAE  0.684211  NaN   NaN
R2     0.634146  NaN   NaN

            SVR  OLS Lasso
MAE    0.657895  NaN   NaN
MSQE   0.785714  NaN   NaN
MedAE  0.758621  NaN   NaN
R2      0.77193  NaN   NaN

            SVR  OLS Lasso
MAE    0.657895  NaN   NaN
MSQE       0.65  NaN   NaN
MedAE  0.684211  NaN   NaN
R2     0.666667  NaN   NaN



In [20]:
from sklearn.externals import joblib

lookup = "ocean"
index = 0
for Y in Y_array:
    mnb = LinearRegression()
    mnb.fit(X, Y.ravel())
    coefs = mnb.coef_[0]

    bag = list(bag_of_words)

    # Use initial model counts for all but extroversion
    joblib.dump(mnb, 'regression_' + lookup[index] + '.pkl')
    index += 1
joblib.dump(list(bag_of_words), 'regression__bag_of_words.pkl')

['regression__bag_of_words.pkl']