In [27]:
import pandas as pd
import numpy as np

In [28]:
# Read the small tweet dataset from disk and preview its first three rows
small_dataset_path = "data/twitter_dataset_small_w_bart_preds.csv"
df_small = pd.read_csv(small_dataset_path)
df_small.head(n=3)

Unnamed: 0,is_positive,id,datetime,user,message,bart_is_positive
0,0,2200003313,Tue Jun 16 18:18:13 PDT 2009,DEWGetMeTho77,@Nkluvr4eva My poor little dumpling In Holmde...,0.005163
1,0,1467998601,Mon Apr 06 23:11:18 PDT 2009,Young_J,I'm off too bed. I gotta wake up hella early t...,0.33943
2,0,2300049112,Tue Jun 23 13:40:12 PDT 2009,dougnawoschik,I havent been able to listen to it yet My spe...,0.065703


In [29]:
# Class balance check: split the tweets on the is_positive label (1 = positive, 0 = negative)
positive_tweets = df_small.loc[df_small["is_positive"] == 1]
negative_tweets = df_small.loc[df_small["is_positive"] == 0]

# each row is one tweet, so the length of a subset is its tweet count
no_positive_tweets = len(positive_tweets)
no_negative_tweets = len(negative_tweets)

print(f"Number of positive tweets: {no_positive_tweets}")
print(f"Number of negative tweets: {no_negative_tweets}")

Number of positive tweets: 10058
Number of negative tweets: 9942


In [30]:
# Inspect the message text of the first ten positive tweets
positive_tweets["message"].head(10).values

array(['on lunch....dj should come eat with me ',
       "@TamaraSchilling Adventure - That's what we all need in our life. I am glad you had a great week. Thanks for yr response ",
       "@PerezHilton Zach makes me pee sitting down! And I'm a grown gay man! ",
       'to sum up my day in one word ......... kackered! ',
       '@k9wkj Great minds think alike ',
       '@LochNessCullen thanks for the follow!  how are you nessie? ',
       '@SallyGreene @andrewtc04 @Syondeli thanks guys, added a couple of love handle inches after tayyabs ',
       "i'm feeling quite sleepy today, wish i could stay in bed today...but OK! is my LAST YEAR, so let's go to school ",
       'Whaddup Whaddup Whaddup Whaddup Whaddup  I Got white girl swag from MY HEAD TO MY shOES Whaddup @yungla ',
       '@tom_e_white http://my.safaribooksonline.com/9780596521974 -- congrats on getting out the Hadoop book (esp the ZooKeeper chapter  )'],
      dtype=object)

In [31]:
# Then inspect the message text of the first ten negative tweets
negative_tweets["message"].head(10).values

array(['@Nkluvr4eva My poor little dumpling  In Holmdel vids he was really trying...Hope he dont try to hard tonight xx',
       "I'm off too bed. I gotta wake up hella early tomorrow morning. ",
       'I havent been able to listen to it yet  My speakers are busted',
       'now remembers why solving a relatively big equation with two unknowns is a total pain in the butt ',
       'Ate too much, feel sick ',
       'Tried to purchase a parked domain through GoDaddy. $70 down the drain  kind of like gambling...',
       'Just got back from VA Tech Equine Medical Center...my poor Lilly has to be on stall rest for at least another month ',
       "can't log in to my other twitter account. super bummed ",
       "is very tired and doesn't want to clean anymore ",
       "so sad...   tough game, went into PK's, screw you."], dtype=object)

In [55]:
# Benchmark: accuracy of the BART model's predictions against the is_positive labels.
# The BART score is continuous, so we try several cut-off thresholds for turning it
# into a positive (1) / negative (0) prediction and report the accuracy at each one.
thresholds = np.arange(0.1, 1, 0.1)
tresholds = thresholds  # original (misspelled) name kept as an alias in case other cells still use it

for t in thresholds:
    # A tweet is predicted positive when its BART score is strictly above the threshold.
    # NOTE: these two helper columns are overwritten on every iteration, so after the
    # loop df_small holds the predictions for the last threshold only.
    df_small['bart_pred'] = np.where(df_small['bart_is_positive'] > t, 1, 0)
    df_small['match_y_bart_pred'] = df_small.bart_pred.eq(df_small.is_positive)
    # mean of the boolean match column = fraction of correct predictions; scaled to percent
    bart_accuracy = df_small.match_y_bart_pred.mean()*100
    # Explicit format specs keep floating-point noise (e.g. 0.30000000000000004) out of the report
    print('BART accuracy at threshold {:.1f}: {:.3f}'.format(t, bart_accuracy))

BART accuracty at threshold 0.1: 75.75
BART accuracty at threshold 0.2: 75.25500000000001
BART accuracty at threshold 0.30000000000000004: 74.14
BART accuracty at threshold 0.4: 72.885
BART accuracty at threshold 0.5: 71.00999999999999
BART accuracty at threshold 0.6: 69.11
BART accuracty at threshold 0.7000000000000001: 66.445
BART accuracty at threshold 0.8: 62.955000000000005
BART accuracty at threshold 0.9: 57.835
