In [1]:
import re
import string
from collections import defaultdict

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,\
HashingVectorizer
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

#from src.confusion import plot_confusion_matrix

In [2]:
# Load the raw tweet corpus from the CSV that sits next to the notebook.
# NOTE(review): 'unicode_escape' also interprets backslash sequences found in
# the text - confirm this is the intended codec for the file.
df = pd.read_csv(
    'phase-4-dataset.csv',
    encoding='unicode_escape',
)
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


## How many rows are we working with?

In [3]:
# .shape is a (row count, column count) tuple.
df.shape

(9093, 3)

Making the label column name more manageable, I mean come on!

In [4]:
# Shorten the unwieldy target column name to "emotion".
# Rebinding (instead of inplace=True) keeps the cell chainable; rename() ignores
# missing keys by default, so re-running the cell is harmless.
df = df.rename(
    columns={"is_there_an_emotion_directed_at_a_brand_or_product": "emotion"}
)

Let's transform the words of that column into numbers for ourselves and store them in a new `label` column while we're at it

In [5]:
# Encode the emotion strings as integer class ids in a new `label` column.
# LabelEncoder is imported with the rest of the dependencies in the first cell.
# `le` is kept around so the ids can be mapped back to their text
# (le.classes_ / le.inverse_transform).
le = LabelEncoder()
df['label'] = le.fit_transform(df['emotion'])  # fit + transform in one pass
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,emotion,label
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion,1
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion,3
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion,3
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion,1
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion,3


Seeing how many labels we're working with here

In [6]:
# Distinct encoded class ids, in order of first appearance.
df['label'].unique()

array([1, 3, 2, 0])

## any nulls?

In [7]:
# Count missing values per column, then add the per-column counts together.
nulls_per_column = df.isnull().sum()
nulls_per_column.sum()

5803

### Let's see where all the 5803 nulls are

In [8]:
# info() lists the non-null count per column, which shows where the nulls are.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9093 entries, 0 to 9092
Data columns (total 4 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   tweet_text                       9092 non-null   object
 1   emotion_in_tweet_is_directed_at  3291 non-null   object
 2   emotion                          9093 non-null   object
 3   label                            9093 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 284.3+ KB


### taking a look at a random tweet from the corpus

In [9]:
# Single label-based lookup: row 43, tweet_text column.
df.loc[43, 'tweet_text']

'Mashable! - The iPad 2 Takes Over SXSW [VIDEO] #ipad #sxsw #gadgets {link}'

### Let's see how it compares to one further down the rows

In [10]:
# Single label-based lookup: row 2345, tweet_text column.
df.loc[2345, 'tweet_text']

'@mention check it. RT @mention #SXSW FREE App Festival Explorer: find the bands you want to see from your music tastes {link}'

### What is the target for tweet 2345?

In [11]:
# Target value recorded for row 2345.
df.loc[2345, 'emotion']

'Positive emotion'

### Let's look at another random tweet followed by its target

In [12]:
# Single label-based lookup: row 6543, tweet_text column.
df.loc[6543, 'tweet_text']

'RT @mention RT @mention Google to Launch Major New Social Network Called Circles, {link} #sxsw #nptech'

In [13]:
# Target value recorded for row 6543.
df.loc[6543, 'emotion']

'Positive emotion'

## It looks like whenever someone types the "@" symbol it is translated to the phrase "@mention" instead of "@-the-person-or-company." Probably this is to protect the public so we can use this corpus and its documents freely

#### 'RT' Seems to indicate the document is a retweet

### Getting ready to clean the data

In [14]:
# Keep one tweet (row 68) around as a scratch example for the cleaning steps.
sample_tweet = df.loc[68, 'tweet_text']

#### Split our sample tweet

In [15]:
# split() with no arguments breaks the string on runs of whitespace, so
# punctuation stays attached to its word.
sample_tweet.split()

['Boooo!',
 'RT',
 '@mention',
 'Flipboard',
 'is',
 'developing',
 'an',
 'iPhone',
 'version,',
 'not',
 'Android,',
 'says',
 '@mention',
 '#sxsw']

## Let's create a function that removes those pesky @ symbols and the like

In [16]:
def remove_junk(text):
    """Strip Twitter-specific noise and basic punctuation from one tweet.

    Removed, in order: @handles, the retweet marker ``RT`` (only as a
    standalone word followed by whitespace), the ``[VIDEO]`` and ``{link}``
    placeholders, http(s) URLs, and the punctuation characters
    ``, . ? * $ ' " ! ( ) : _ / - = ^ ;``.  Hashtags and casing are left
    untouched, and whitespace is not collapsed.

    Parameters
    ----------
    text : object
        The tweet. Non-string values are converted with ``str()``.

    Returns
    -------
    str
        The cleaned tweet. A missing value (``None`` or a float NaN) yields
        ``''`` rather than the literal word ``'None'`` / ``'nan'``.
    """
    # A missing tweet must not turn into the token "nan" in the corpus.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # @mentions; "_" is included so a handle such as @jesse_dee goes as a whole
    text = re.sub(r'@[A-Za-z0-9_#]+', '', text)
    # retweet marker; \b stops it from eating the end of words like "SMART "
    text = re.sub(r'\bRT[\s]+', '', text)
    text = re.sub(r'\[VIDEO\]', '', text)  # [VIDEO] placeholder
    text = re.sub(r'\{link\}', '', text)  # {link} placeholder
    text = re.sub(r'https?:\/\/\S+', '', text)  # actual links
    # basic punctuation ("#" is deliberately not in the set, hashtags survive)
    text = re.sub(r'[\,\.\?\*\$\'\"\!\(\)\:\_\/\-\=\^\;]+', '', text)
    return text
    
    

### Now to remove all of those @mentions

In [17]:
# Clean every tweet and write the result back over the original column.
cleaned_tweets = df['tweet_text'].apply(remove_junk)
df['tweet_text'] = cleaned_tweets

In [18]:
# First ten rows, to eyeball the effect of the cleaning step.
df.head(10)

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,emotion,label
0,I have a 3G iPhone After 3 hrs tweeting at #R...,iPhone,Negative emotion,1
1,Know about Awesome iPadiPhone app that youl...,iPad or iPhone App,Positive emotion,3
2,Can not wait for #iPad 2 also They should sal...,iPad,Positive emotion,3
3,I hope this years festival isnt as crashy as ...,iPad or iPhone App,Negative emotion,1
4,great stuff on Fri #SXSW Marissa Mayer Google...,Google,Positive emotion,3
5,New iPad Apps For #SpeechTherapy And Communic...,,No emotion toward brand or product,2
6,,,No emotion toward brand or product,2
7,#SXSW is just starting #CTIA is around the cor...,Android,Positive emotion,3
8,Beautifully smart and simple idea wrote about ...,iPad or iPhone App,Positive emotion,3
9,Counting down the days to #sxsw plus strong Ca...,Apple,Positive emotion,3


In [19]:
# Cleaned text of row 23.
df.loc[23, 'tweet_text']

'Photo Just installed the #SXSW iPhone app which is really nice '

## Let's see if {link} was removed from the above tweet 6543

In [20]:
# Row 6543 again, now after cleaning.
df.loc[6543, 'tweet_text']

'Google to Launch Major New Social Network Called Circles  #sxsw #nptech'

### Okay, {link} is gone, but 'Photo' and some other possible special identifiers are still there; we'll get rid of them

In [21]:
# Cleaned text of row 35.
df.loc[35, 'tweet_text']

'At #sxsw  Oooh Google to Launch Major New Social Network Called Circles Possibly Today '

## Let's write a function that returns the entire tweet, so we can print it and see any hidden pitfalls in the form of characters

In [22]:
def full_tweet(num):
    """Return the complete tweet text stored under index label ``num``.

    The value is read from the ``tweet_text`` column of the notebook-level
    ``df`` frame.
    """
    tweets = df.tweet_text
    return tweets[num]

In [23]:
# Hand-picked row labels to spot-check, printed one tweet per line in this order.
rows_to_inspect = [
    76, 999, 31, 1005, 5555, 7213, 9003, 6034, 2229, 75, 643, 49, 8520,
    5688, 489, 6254, 967, 4170, 7742, 5657, 8437, 2240, 1006, 543, 993, 6079,
]
for row_label in rows_to_inspect:
    print(full_tweet(row_label))

I love my  iPhone case from #Sxsw but I cant get my phone out of it #fail
New post Per this rumor Google may preview its big social strategy at an 80sthemed costume party at #SXSW Yep 
You must have this app for your iPad if you are going to #SXSW  #hollergram
I need to play this game on my #android  #SXSW 
Best iPad design Big chunky elements Generous space Clarity trumps density Tap quality trumps tap quantity #tapworthy #sxsw
Apple announces popup store at #SXSW See our take on the future of popup stores here 
Austin my pal  lost her phone please help I left my WHITE iPhone 4G in a cab in Austin at #SXSW Internet please help
Horror stories from women in tech  #sxsw #9to5
Marissa Mayer Google maps should have better customer service quicker responses #sxsw #FH
Holla At google party Best ever Get your butt over here #sxsw
Ok nerds I totally am Bateman You want me to sign your iPad 2 #sxsw
wooooo ÛÏ Apple store downtown Austin open til Midnight #sxswÛ
Mom  quick  apply for a job  th

#### Looks like there are stray characters such as +, Û, Ï, ^ and ; — and that's just in the tweets we can see

In [24]:
# Last row of the frame; one lookup instead of chained indexing.
df.loc[9092, 'tweet_text']

'\x8cÏ¡\x8eÏà\x8aü\x8b\x81Ê\x8b\x81Î\x8b\x81Ò\x8b\x81£\x8b\x81Á\x8bââ\x8b\x81\x8b\x81£\x8b\x81\x8f\x8bâ\x8bÛâGoogle Tests \x89ÛÏCheckin Offers\x89Û\x9d At #SXSW '