# MNB Sentiment Scoring Model

<p> This is a process for training a Multinomial Naive Bayes sentiment classification model using tweets from a Kaggle competition. The trained model will be saved and exported at the end for reuse. </p>

## Packages

In [85]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy.stats import itemfreq
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import re
import os

## Import Training and Test Data

#### Our training data consists of the following attributes:
<ul> 
    <li><b>target:</b> the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)</li>
    <li><b>ids:</b> The id of the tweet ( 2087)</li>
    <li><b>date:</b> the date of the tweet (Sat May 16 23:58:44 UTC 2009)</li>
    <li><b>flag:</b> The query (lyx). If there is no query, then this value is NO_QUERY.</li>
    <li><b>user:</b> the user that tweeted (robotickilldozr)</li>
    <li><b>text:</b> the text of the tweet (Lyx is cool)</li>
</ul>

In [86]:
filename = "../../data/training_XL.csv"
data_set = pd.read_csv(filename, delimiter=',', encoding='ISO-8859-1', header=None)

## Process Data

In [92]:
data_set.columns = ["target","ids","date","flag","user","text"]
#Shuffle the data (get sample frac=1 means that we will use 100% of data for sample)
data_set = data_set.sample(frac=1).reset_index(drop=True)
y=data_set['target'].values
X_dirt=data_set['text'].values
data_set.head()

numpy.ndarray

#### Regex Clean Data

In [98]:
#regex search pattern
regex = re.compile('[^a-zA-Z ]')

X = []
for i in X_dirt:
    t = regex.sub('', i)
    X.append(t)

X = np.asarray(X)

In [100]:
print(type(X))
print(X.shape)

<class 'numpy.ndarray'>
(1600000,)


## Prepare Data For Holdout Test
<p> Remember that (X = Text) and (y = Sentiment Score)

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
print(X_train[0])
print(y_train[0])
print(X_test[0])
print(y_test[0])

(1072000,) (1072000,) (528000,) (528000,)
Loads of work but loads of sun  I have to be more disciplined now than I was at the weekend
0
no FoxW channel in hotel cant see my LAA live 
0


In [102]:
training_labels = set(y_train)
print(training_labels)
training_category_dist = np.unique(y_train, return_counts=True)
print(training_category_dist)

{0, 4}
(array([0, 4]), array([536297, 535703]))


## Vectorization

In [119]:
#  unigram boolean vectorizer, set minimum document frequency to 5
unigram_bool_vectorizer = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')

#  unigram term frequency vectorizer, set minimum document frequency to 5
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')

#  unigram and bigram term frequency vectorizer, set minimum document frequency to 5
gram12_count_vectorizer = CountVectorizer(encoding='latin-1', ngram_range=(1,3), min_df=5, stop_words='english')

#  unigram tfidf vectorizer, set minimum document frequency to 5
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', ngram_range=(1,2), use_idf=False, min_df=5, stop_words='english')


### Vectorize Training Data

In [120]:
# fit vocabulary in training documents and transform the training documents into vectors
X_train_vec = unigram_tfidf_vectorizer.fit_transform(X_train)
X_test_vec = unigram_tfidf_vectorizer.transform(X_test)

# check the content of a document vector
print(X_train_vec.shape)
print(X_train_vec[0].toarray())

# check the size of the constructed vocabulary
print(len(unigram_tfidf_vectorizer.vocabulary_))

# print out the first 10 items in the vocabulary
print(list(unigram_tfidf_vectorizer.vocabulary_.items())[:10])

# check word index in vocabulary
print(unigram_tfidf_vectorizer.vocabulary_.get('tempt'))

(1072000, 194866)
[[0. 0. 0. ... 0. 0. 0.]]
194866
[('loads', 96874), ('work', 188236), ('sun', 156233), ('disciplined', 37270), ('weekend', 183884), ('loads work', 96891), ('work loads', 188642), ('school', 141928), ('monday', 109917), ('summer', 155972)]
160013


## Training the MNB Model

#### Trouble breaking 77% Hold-out test accuracy, need 85% for production, try linear SVM or Bernoulli

In [121]:
# initialize the MNB model
nb_clf= MultinomialNB()
# use the training data to train the MNB model
nb_clf.fit(X_train_vec,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [122]:
feature_ranks = sorted(zip(nb_clf.feature_log_prob_[1], unigram_tfidf_vectorizer.get_feature_names()))
very_negative_features = feature_ranks[-10:]
v_pos_f = feature_ranks[:10]
print(very_negative_features)
print('\n',v_pos_f)

[(-5.569003212569699, 'time'), (-5.500984520362213, 'going'), (-5.428849628370262, 'lol'), (-5.369172669441582, 'like'), (-5.170577308216231, 'thanks'), (-5.153437617404654, 'day'), (-5.046491785012854, 'love'), (-4.865424418332758, 'just'), (-4.800694992869234, 'good'), (-4.6942253226489825, 'im')]

 [(-14.315145048950027, 'aaaaaaa'), (-14.315145048950027, 'aaaaaaaa'), (-14.315145048950027, 'aaaaaaaaa'), (-14.315145048950027, 'aaron carter'), (-14.315145048950027, 'aaroncarter random'), (-14.315145048950027, 'aaronfuller'), (-14.315145048950027, 'aarrgghh'), (-14.315145048950027, 'abdomen'), (-14.315145048950027, 'abdominal'), (-14.315145048950027, 'abdominal pain')]


## Evaluate MNB Model

#### Hold-out Test

In [123]:
nb_clf.score(X_test_vec,y_test)

0.776280303030303

#### Confusion Matrix

In [117]:
y_pred = nb_clf.fit(X_train_vec, y_train).predict(X_test_vec)
cm=confusion_matrix(y_test, y_pred, labels=[0,4])
print(cm)

[[204291  59412]
 [ 60473 203824]]


#### Classification Report

In [118]:
print(precision_score(y_test, y_pred, average=None))
print(recall_score(y_test, y_pred, average=None))

from sklearn.metrics import classification_report
target_names = ['0','4']
print(classification_report(y_test, y_pred, target_names=target_names))

[0.77159659 0.77430139]
[0.77470108 0.77119301]
             precision    recall  f1-score   support

          0       0.77      0.77      0.77    263703
          4       0.77      0.77      0.77    264297

avg / total       0.77      0.77      0.77    528000



#### Cross Validation Testing

In [72]:
# cross validation
nb_clf_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1',ngram_range=(1,2), min_df=5, stop_words='english' )),('nb', MultinomialNB())])
scores = cross_val_score(nb_clf_pipe, X, y, cv=3)
avg=sum(scores)/len(scores)
print(avg)

0.7737912494289798


## Use Model on Tweets

### Read in data

In [29]:
choose_file = '201811041600'

# Read in data
with open('../../data/ABX_Tweets/data_{}.json'.format(choose_file), 'r') as f:
    abx_dict = json.load(f)


### Create list of tweets to score

In [30]:
test_tweets = []
#i = 0
for r in abx_dict['results']:
    #print(str(i) + " : " + r['text']+"\n") # uncomment this line if you want to print the tweet text
    #i += 1
    test_tweets.append(r['text'])

print(test_tweets)

['#Eljaaly_cases\n63 yo M, Hx: seizure on valproic acid. Developed ventriculitis\U0001f9e0 after placement of extraventricular… https://t.co/1oGqOS87De', 'Working on the Syllabus for OB/GI for EM Review RedefinED:  #EMReviewRedefinED  #OhioACEP\n\nBoard Pearl: Abx to avoi… https://t.co/gLDKDiYzrG', 'ABX: Biceps/Triceps Superset. #abxfitness #paleogenicathlete #NjJugganaut #time2eat #resistancebandtraining… https://t.co/WzScGvNTx3', "RT @AsobiCoin: Buy over 1$ worth of ABX and get $10 QASH airdrop!?🤔\n\nYou can't miss this chance‼️😎😎\n\n1 Create an account at Liquid with thi…", '@xba_a_aa_a_abx まぁそうだよな\nそうなるとどうせPC利用するから無用の長物か', '#QRCODE Design-QR-Codes. Im Jahre 2008 kam ein findiger Kopf auf die Idee, die Fehlerkorrektur-Informationen von QR… https://t.co/5PSqwV1jdl', "RT @AsobiCoin: Limited supply‼️\nDon't miss this chance😆⬇️\n\n1 Create an account at Liquid with this link⬇️\nhttps://t.co/wYaGZiKg2i …\n\n2 Buy…", '@xba_a_aa_a_abx リツイート毎回ありがとうございます(^○^)', '$AAL $AAPL $ABX $ADUS $AGN $A

### Clean Tweets

In [31]:
#regex search pattern

regex = re.compile('[^a-zA-Z ]')

# emoji_pattern = re.compile(
#     u"(\ud83d[\ude00-\ude4f])|"  # emoticons
#     u"(\ud83c[\udf00-\uffff])|"  # symbols & pictographs (1 of 2)
#     u"(\ud83d[\u0000-\uddff])|"  # symbols & pictographs (2 of 2)
#     u"(\ud83d[\ude80-\udeff])|"  # transport & map symbols
#     u"(\ud83c[\udde0-\uddff])"  # flags (iOS)
#     "+", flags=re.UNICODE)


test_tweets_processed = []
for i in test_tweets:
    t = regex.sub('', i)
    test_tweets_processed.append(t)
print(test_tweets_processed)


['#Eljaalycases yo M Hx seizure on valproic acid Developed ventriculitis after placement of extraventricular httpstcooGqOSDe', 'Working on the Syllabus for OBGI for EM Review RedefinED  #EMReviewRedefinED  #OhioACEPBoard Pearl Abx to avoi httpstcogLDKDiYzrG', 'ABX BicepsTriceps Superset #abxfitness #paleogenicathlete #NjJugganaut #timeeat #resistancebandtraining httpstcoWzScGvNTx', 'RT AsobiCoin Buy over  worth of ABX and get  QASH airdropYou cant miss this chance Create an account at Liquid with thi', 'xbaaaaaabx PC', '#QRCODE DesignQRCodes Im Jahre  kam ein findiger Kopf auf die Idee die FehlerkorrekturInformationen von QR httpstcoPSqwVjdl', 'RT AsobiCoin Limited supplyDont miss this chance Create an account at Liquid with this linkhttpstcowYaGZiKgi  Buy', 'xbaaaaaabx ', 'AAL AAPL ABX ADUS AGN AGS ALLK ALLO AMD AN ARMO ATVI AVGO AZN BA BHC BJRI BKE BKNG BMCH BMRN httpstcoixAtUtB', 'isinhaabx O que aconteceu nega Vc t bem', 'abx    ', 'ABXGloria I feel the sameMany sound engineers are

### Vectorize Tweets

In [32]:
X_tweet_test = unigram_tfidf_vectorizer.transform(test_tweets_processed)

In [39]:
y_pred=nb_clf.predict(X_tweet_test)
output = open('../../data/predictions/{}_output.csv'.format(choose_file), 'w')
for x, value in enumerate(y_pred):
  output.write(str(value) + '\n') 
output.close()

mean_sentiscore = sum(y_pred)/len(y_pred)
print(mean_sentiscore)

2.8


### Do above for whole directory, return two lists (date and mean_sentiscore)

In [65]:
folder_name = "TSLA_Tweets"
file_list = os.listdir("../../data/{}".format(folder_name))

In [66]:
mean_list = []
pred_df_list = []

for i in file_list:
    # Read in data
    with open('../../data/{}/{}'.format(folder_name,i), 'r') as f:
        abx_dict = json.load(f)
    
    # Create list of tweet text
    test_tweets = []
    for r in abx_dict['results']:
        test_tweets.append(r['text'])
        
    # Use regex to clean data, return cleaned list as 'test_tweets_processed'
    regex = re.compile('[^a-zA-Z# ]')
    test_tweets_processed = []
    for z in test_tweets:
        t = regex.sub('', z)
        test_tweets_processed.append(t)
    
    # Vectorize list of tweets
    X_tweet_test = unigram_tfidf_vectorizer.transform(test_tweets_processed)
    
    # Apply model to tweets
    y_pred=nb_clf.predict(X_tweet_test)
    
    # Output results csv
#     output = open('../../data/predictions/{}_pred.csv'.format(i), 'w')
#     for x, value in enumerate(y_pred):
#       output.write(str(value) + '\n') 
#     output.close()

    mean_sentiscore = sum(y_pred)/len(y_pred)
    mean_list.append(mean_sentiscore)
    
    # create dataframe of prediction results and text
    model_out_df = pd.DataFrame(
    {'text': test_tweets_processed,
     'sentiment': y_pred,
    })
    
    # add dataframe with prediction results to list of dataframes
    pred_df_list.append(model_out_df)
        

#### Export CSV of metascores for all files and dates

In [67]:
# Create dataframe of sentiment means and file name for means
dates_only =  []
for x in file_list:
    dates_only.append(x.replace('.json', '').replace('data_', ''))

print (dates_only)

means_df = pd.DataFrame(
    {'filename': file_list,
     'sentiment': mean_list,
     'date' : dates_only
    })

means_df.to_csv("../../data/{}_mean_senti.csv".format(folder_name))

['201811301600', '201811041600', '201811051600', '201811191600', '201811181600', '201811271600', '201811261600', '201812011600', '201811131600', '201811121600', '201811201600', '201811211600', '201811141600', '201811151600', '201811091600', '201811081600', '201811101600', '201811111600', '201811241600', '201811251600', '201812031600', '201812021600', '201811071600', '201811061600', '201811171600', '201811161600', '201811291600', '201811281600', '201811231600', '201811221600']


In [68]:
training_labels = set(y_train)
print(training_labels)
from scipy.stats import itemfreq
training_category_dist = itemfreq(y_train)
print(training_category_dist)

{0, 4}
[[     0 535984]
 [     4 536016]]


`itemfreq` is deprecated and will be removed in a future version. Use instead `np.unique(..., return_counts=True)`
  after removing the cwd from sys.path.


In [126]:
pred_df_list[0].to_csv("test.csv")