In [8]:
from joblib import Parallel, delayed
import multiprocessing as mp
import pandas as pd
import random
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

In [2]:
random.seed(20191024)

In [5]:
sentiment140 = pd.read_csv('../data/sentiment140_clean.csv', encoding='ISO-8859-1')
sentiment140.head()

Unnamed: 0,sentiment,text
0,2,follow @MonicaAyesha such a prat she is ha lo...
1,2,@AnnelieSch26 well that is a very good quote
2,2,@headstop Thanks
3,2,half hour til the movie awards so excited!!
4,0,jager bombs by myself


In [6]:
# Sample 500 records for model test set
sample = sentiment140.sample(500)

In [10]:
%%time
# Analyze sentiments in parallel
def process_sentiment(tweet):
    return TextBlob(tweet, analyzer=NaiveBayesAnalyzer()).sentiment[0]

n_cores = mp.cpu_count()
sentiments = Parallel(n_jobs=n_cores, verbose=2)(delayed(process_sentiment)(tweet) for tweet in sample['text'])

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   38.1s
[Parallel(n_jobs=8)]: Done 146 tasks      | elapsed:  2.9min
[Parallel(n_jobs=8)]: Done 349 tasks      | elapsed:  7.0min


CPU times: user 1.14 s, sys: 180 ms, total: 1.32 s
Wall time: 9min 57s


[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed: 10.0min finished


In [14]:
sample['pred']=[0 if sent=='neg' else 2 if sent=='pos' else 1 for sent in sentiments]

In [20]:
accuracy_baseline = 1.0 * sum(sample['sentiment']==2) / len(sample)
accuracy_model = 1.0 * sum(sample['sentiment']==sample['pred']) / len(sample)

In [22]:
print('Accuracy of {:.2%} when predicting all positives.'.format(accuracy_baseline))
print('Model accuracy of {:.2%}'.format(accuracy_model))

Accuracy of 50.80% when predicting all positives.
Model accuracy of 54.80%
