In [296]:
import pandas as pd
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split , cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, recall_score
pd.set_option ('display.max_colwidth', None)

In [149]:
df = pd.read_csv('data/judge-1377884607_tweet_product_company.csv',encoding='latin1')
df.isna().sum()

tweet_text                                               1
emotion_in_tweet_is_directed_at                       5802
is_there_an_emotion_directed_at_a_brand_or_product       0
dtype: int64

There are 5802 missing values telling what the tweet is directed at. A way to combat this would be to look at the tweet and look for keywords such as iphone or ipad and assign who it is directed to that way.

In [150]:
#Take out puncuation and other unnecessary characters
df['tweet_text'] = df['tweet_text'].str.replace(r'[^\w\s]', '')
df['tweet_text'] = df['tweet_text'].str.replace('quot', '')
# Make lower case and split
df['tweet_text'] = df['tweet_text'].str.lower()
df['tweet_text'] = df['tweet_text'].str.split(' ')
# Drop the 1 NA tweet we have
df = df.dropna(subset=['tweet_text'])

Here we do some basic cleaning of the data set, removing puncuation making everything lower case and splitting the data.

In [151]:
# Make a blank column to set up for some data cleaning
df['test'] = ''
# Fill in the values with a string na to avoid errors
df.fillna('na',inplace=True)


In [152]:
# This salvages alot of our data

apple = ['iphone','ipad','apple','apples','ipads']
android = ['android','google','androids','googles']

# Loop through the tweet column and look for keywords to assign product to
for ind , val in enumerate(df['tweet_text']):
    if df['emotion_in_tweet_is_directed_at'].values[ind] == 'na':
        for y in val:
            if y in apple:
                df['test'].values[ind] = y
            elif y in android:
                df['test'].values[ind] = y
                
    else:
        df['test'].values[ind] = df['emotion_in_tweet_is_directed_at'].values[ind]
      

Since there were such a high amount of NA values in the emotion tweeted at column, we thought it was best to try to salvage as much data as possible. We looked for keywords in the tweets and assigned a product to them based on what they 

In [153]:
#make everything lower case to make values match
df['test'] = df['test'].map(lambda x : x.lower())
df['test'].unique()

array(['iphone', 'ipad or iphone app', 'ipad', 'google', 'android',
       'apple', 'android app', 'other google product or service', '',
       'googles', 'ipads', 'apples', 'other apple product or service',
       'androids'], dtype=object)

In [154]:
#This cell replaces words with the rightful product to reduce unique values
df['test'] = df['test'].replace({'googles': 'google', 'apples': 'apple', 'androids': 'android','ipads':'ipad'})

In [155]:
df['test'].unique()

array(['iphone', 'ipad or iphone app', 'ipad', 'google', 'android',
       'apple', 'android app', 'other google product or service', '',
       'other apple product or service'], dtype=object)

We went from 5802 NA values to almost under 900! The remainder tweets do not mention anything about either product, and therefore are not useful for our analysis.

In [156]:
df['is_there_an_emotion_directed_at_a_brand_or_product'].unique()

array(['Negative emotion', 'Positive emotion',
       'No emotion toward brand or product', "I can't tell"], dtype=object)

Looking at the target of our dataset, there seems to be four different values. Negative emotion, positive emotion, no emotion and I can't tell. Let's get a closer look at these 'I can't tell' labels.

In [157]:
df.loc[df['is_there_an_emotion_directed_at_a_brand_or_product'] == "I can't tell"]

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,test
90,"[thanks, to, mention, for, publishing, the, news, of, mention, new, medical, apps, at, the, sxswi, conf, blog, link, sxsw, sxswh]",na,I can't tell,
102,"[ûïmention, apple, has, opened, a, popup, store, in, austin, so, the, nerds, in, town, for, sxsw, can, get, their, new, ipads, link, wow]",na,I can't tell,ipad
237,"[just, what, america, needs, rt, mention, google, to, launch, major, new, social, network, called, circles, possibly, today, link, sxsw]",na,I can't tell,google
341,"[the, queue, at, the, apple, store, in, austin, is, four, blocks, long, crazy, stuff, sxsw]",na,I can't tell,apple
368,"[hope, its, better, than, wave, rt, mention, buzz, is, googles, previewing, a, social, networking, platform, at, sxsw, link]",na,I can't tell,google
...,...,...,...,...
9020,"[its, funny, watching, a, room, full, of, people, hold, their, ipad, in, the, air, to, take, a, photo, like, a, room, full, of, tablets, staring, you, down, sxsw]",na,I can't tell,ipad
9032,"[mention, yeah, we, have, mention, , google, has, nothing, on, us, , sxsw]",na,I can't tell,google
9037,"[mention, yes, the, google, presentation, was, not, exactly, what, i, was, expecting, sxsw]",na,I can't tell,google
9058,"[do, you, know, what, apple, is, really, good, at, making, you, feel, bad, about, your, xmas, present, , seth, meyers, on, ipad2, sxsw, doyoureallyneedthat]",na,I can't tell,apple


Since this makes up such a small amount of our data, we will move the I can't tell value to that of neutral, along with the value 'No emotion toward brand or product'.

In [None]:
emotion_dict = {'No emotion toward brand or product' : 'Neutral emotion', "I can't tell": 'Neutral emotion', 'Positive emotion': 'Positive emotion',
               'Negative emotion': 'Negative emotion'}
df['is_ther'] = df['Emotion'].map(emotion_dict)


In [159]:
emotion_dict = {'No emotion toward brand or product' : 'Neutral emotion', "I can't tell": 'Neutral emotion', 'Positive emotion': 'Positive emotion',
               'Negative emotion': 'Negative emotion'}
df['is_there_an_emotion_directed_at_a_brand_or_product'] = df['is_there_an_emotion_directed_at_a_brand_or_product'].map(emotion_dict)


In [160]:
df['is_there_an_emotion_directed_at_a_brand_or_product'].unique()

array(['Negative emotion', 'Positive emotion', 'Neutral emotion'],
      dtype=object)

In [161]:
df['product'] = df['test'].map(company_search)


In [162]:
def company_search(test):
    if test == 'iphone':
        return 'apple'
    elif test == 'apple':
        return 'apple'
    elif test == 'ipad':
        return 'apple'
    elif test == 'ipad or iphone app':
        return 'apple'
    elif test == 'itunes':
        return 'apple'
    elif test == 'other apple product or service':
        return 'apple'
    elif test == 'ios':
        return 'apple'
    elif test == 'ipadquot':
        return 'apple'
    elif test == 'applequot':
        return 'apple'
    elif test == 'iphonequot':
        return 'apple'
    elif test == 'quotiphone':
        return 'apple'
    elif test == 'iosquot':
        return 'apple'
    elif test =='quotipad':
        return 'apple'
    elif test == 'quotapple':
        return 'apple'
    elif test =='google':
        return 'google'
    elif test == 'android':
        return 'google'
    elif test == 'other google product or service':
        return 'google'
    elif test == 'android app':
        return 'google'
    elif test == 'quotgoogle':
        return 'google'
    elif test == 'googlequot':
        return 'google'
    elif test == 'androidquot':
        return 'google'
    else:
        return 'unknown'

In [163]:
df.loc[df['product'] == 'unknown']

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product,test,product
51,"[ûïmention, link, lt, help, me, forward, this, doc, to, all, anonymous, accounts, techiesamp, ppl, who, can, help, us, jam, libya, sxsw]",na,Neutral emotion,,unknown
52,"[¼, what, _, link, ã_, edchat, musedchat, sxsw, sxswi, classical, newtwitter]",na,Neutral emotion,,unknown
53,"[mention, mention, on, the, locationbased, fast, fun, and, future, , link, via, mention, sxsw]",na,Neutral emotion,,unknown
66,"[at, sxsw, mention, , mention, wanna, buy, you, a, drink, 7pm, at, fado, on, 4th, link, join, us]",na,Neutral emotion,,unknown
71,"[chilcott, mention, sxsw, stand, talking, with, blogger, staff, too, late, to, win, competition, for, best, tweet, mentioning, mention, so, no, tshirt]",na,Neutral emotion,,unknown
...,...,...,...,...,...
8932,"[z6, no, news, is, good, news, link, codes, valid, 40075959p, 031111, infektd, sxsw, zlf]",na,Neutral emotion,,unknown
8936,"[client, news, mention, releases, dope, melodies, amp, heavy, bass, amp, invades, sxsw, gt, link]",na,Neutral emotion,,unknown
8970,"[this, is, my, 5th, year, downloading, the, sxsw, music, torrent, link, all, free, and, legal, great, music]",na,Neutral emotion,,unknown
9024,"[by, the, way, were, looking, for, a, spanishspeaking, trend, scout, based, in, austin, gt, link, sxsw]",na,Neutral emotion,,unknown


# Prepare for Testing

In [204]:
target_dict = {'Negative emotion': 0, 'Neutral emotion': 1, 'Positive emotion': 2}
df_test = df.copy()
df_test['target'] = df['is_there_an_emotion_directed_at_a_brand_or_product'].map(target_dict)

In [210]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8177 entries, 0 to 9092
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweet_text  8177 non-null   object
 1   product     8177 non-null   object
 2   target      8177 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 255.5+ KB


In [206]:
df_test = df_test[df_test['product'] != 'unknown']

In [207]:

df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8177 entries, 0 to 9092
Data columns (total 6 columns):
 #   Column                                              Non-Null Count  Dtype 
---  ------                                              --------------  ----- 
 0   tweet_text                                          8177 non-null   object
 1   emotion_in_tweet_is_directed_at                     8177 non-null   object
 2   is_there_an_emotion_directed_at_a_brand_or_product  8177 non-null   object
 3   test                                                8177 non-null   object
 4   product                                             8177 non-null   object
 5   target                                              8177 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 447.2+ KB


In [208]:
df_test.drop(['emotion_in_tweet_is_directed_at','test','is_there_an_emotion_directed_at_a_brand_or_product'],axis=1,inplace=True)


In [237]:
X = df_test['tweet_text']
y = df_test['target']

In [269]:
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y)


In [270]:
X_train = X_train.str.join(' ')
X_test = X_test.str.join(' ')

In [271]:
y_train

1079    1
7146    1
3567    1
8667    1
588     1
       ..
6200    1
8071    1
4641    1
6180    2
142     2
Name: target, Length: 6132, dtype: int64

In [272]:
X_train

1079                                                                                                 ive seen several people working on ipad 2s at sxsw today
7146                                  sxsw rumor google employees are given a travel budget per city if they use less they can bank the diff for future trips
3567                       mention i try to stay awake d im trying to get ustream working on iphone see your badgepicture now is dat right kickoffparty  sxsw
8667                                                                                     google launching new social network called cicles at sxsw today link
588                                                        diller on google tv the first product wasnt good it wasnt a consumer product basically sxsw diller
                                                                                ...                                                                          
6200    rt mention just a friendly reminder for sxsw

In [273]:
cv = CountVectorizer()

X_t_vec = cv.fit_transform(X_train)

X_t_vec = cv.fit_transform(X_train)
X_t_vec = pd.DataFrame.sparse.from_spmatrix(X_t_vec)
X_t_vec.columns = sorted(cv.vocabulary_)
X_t_vec.set_index(y_train.index, inplace=True)

In [274]:
X_val_vec = cv.transform(X_test)
X_val_vec  = pd.DataFrame.sparse.from_spmatrix(X_val_vec)
X_val_vec.columns = sorted(cv.vocabulary_)
X_val_vec.set_index(y_test.index, inplace=True)

In [275]:
mnb = MultinomialNB()

In [276]:
mnb.fit(X_t_vec,y_train)

MultinomialNB()

In [285]:
y_hat = mnb.predict(X_t_vec)


In [286]:
cm = confusion_matrix(y_train, y_hat)
cm

array([[ 166,  163,   95],
       [  21, 3190,  277],
       [   2,  516, 1702]], dtype=int64)

In [283]:
def metrics(y_true,y_preds):
    '''Gives recall,precision,accuracy,F1 score and confusion matrix'''
    return print(f'''Recall Score: {recall_score(y_true,y_preds)}
Precision Score: {precision_score(y_true,y_preds)}
Accuracy Score:{accuracy_score(y_true,y_preds)}
F1 Score: {f1_score(y_true,y_preds)}
Confusion Matrix: 
{confusion_matrix(y_true,y_preds)}''')


In [295]:
accuracy_score(y_train,y_hat)

0.824853228962818

In [297]:
cross_validate(mnb,X_t_vec,y_train)

{'fit_time': array([2.48398972, 2.51967573, 2.24389744, 2.24685454, 2.53409481]),
 'score_time': array([0.37696266, 0.36453629, 0.64675856, 0.37896633, 0.36654329]),
 'test_score': array([0.64221679, 0.64384678, 0.63621533, 0.65579119, 0.65497553])}