In [73]:
import pandas as pd
import matplotlib.pyplot as plt

from textblob import TextBlob
import seaborn as sns


import re  
import nltk
from nltk.tokenize import word_tokenize
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk import FreqDist
from nltk.sentiment.vader import SentimentIntensityAnalyzer


nltk.download('wordnet')
nltk.download('vader_lexicon')

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/williamsa/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/williamsa/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [74]:
df = pd.read_csv('../data/judge_1377884607_tweet_product_company.csv')

In [75]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [76]:
df.rename(columns = {"is_there_an_emotion_directed_at_a_brand_or_product": "emotion"}, inplace = True)

In [77]:
df.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [78]:
df['emotion'].value_counts()

No emotion toward brand or product    5156
Positive emotion                      2869
Negative emotion                       545
I can't tell                           151
Name: emotion, dtype: int64

In [79]:
df = df.drop(df[df['emotion'] == "I can't tell"].index)

In [80]:
df['emotion'].value_counts()

No emotion toward brand or product    5156
Positive emotion                      2869
Negative emotion                       545
Name: emotion, dtype: int64

In [81]:
df['emotion_in_tweet_is_directed_at'].value_counts()

iPad                               906
Apple                              638
iPad or iPhone App                 451
Google                             411
iPhone                             287
Other Google product or service    281
Android App                         78
Android                             74
Other Apple product or service      34
Name: emotion_in_tweet_is_directed_at, dtype: int64

In [82]:
df['emotion_in_tweet_is_directed_at'].isna().sum()

5410

In [83]:
df.drop(columns = 'emotion_in_tweet_is_directed_at', inplace = True)

In [84]:
df.dropna(inplace = True)

In [85]:
df.head()

Unnamed: 0,tweet_text,emotion
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion


In [86]:
def count_chars(text):
    return len(text)
    

In [87]:
def count_words(text):
    return len(text.split())

In [88]:
def count_capital_chars(text):
    count=0
    for i in text:
        if i.isupper():
            count+=1
    return count

In [89]:
def count_capital_words(text):
    return sum(map(str.isupper,text.split()))

In [90]:
def count_words_in_quotes(text):
    x = re.findall("'.'|''.'", text)
    count=0
    if x is None:
        return 0
    else:
        for i in x:
            t=i[1:-1]
            count+=count_words(t)
        return count

In [91]:
def count_sent(text):
    return len(nltk.sent_tokenize(text))

In [92]:
def count_unique_words(text):
    return len(set(text.split()))

In [93]:
def count_htags(text):
    x = re.findall(r'(#w[A-Za-z0-9]*)', text)
    return len(x) 

In [94]:
def count_mentions(text):
    x = re.findall(r'(@w[A-Za-z0-9]*)', text)
    return len(x)

In [95]:
def count_stopwords(text):
    stop_words = set(stopwords.words('english'))  
    word_tokens = word_tokenize(text)
    stopwords_x = [w for w in word_tokens if w in stop_words]
    return len(stopwords_x)

In [96]:
df['char_count'] = df["tweet_text"].apply(lambda x:count_chars(x))
df['word_count'] = df["tweet_text"].apply(lambda x:count_words(x))
df['sent_count'] = df["tweet_text"].apply(lambda x:count_sent(x))
df['capital_char_count'] = df["tweet_text"].apply(lambda x:count_capital_chars(x))
df['capital_word_count'] = df["tweet_text"].apply(lambda x:count_capital_words(x))
df['quoted_word_count'] = df["tweet_text"].apply(lambda x:count_words_in_quotes(x))
df['stopword_count'] = df["tweet_text"].apply(lambda x:count_stopwords(x))
df['unique_word_count'] = df["tweet_text"].apply(lambda x:count_unique_words(x))
df['htag_count'] = df["tweet_text"].apply(lambda x:count_htags(x))
df['mention_count'] = df["tweet_text"].apply(lambda x:count_mentions(x))
df['avg_wordlength'] = df['char_count']/df['word_count']
df['avg_sentlength'] = df['word_count']/df['sent_count']
df['unique_vs_words'] = df['unique_word_count']/df['word_count']
df['stopwords_vs_words'] = df['stopword_count']/df['word_count']

In [97]:
df

Unnamed: 0,tweet_text,emotion,char_count,word_count,sent_count,capital_char_count,capital_word_count,quoted_word_count,stopword_count,unique_word_count,htag_count,mention_count,avg_wordlength,avg_sentlength,unique_vs_words,stopwords_vs_words
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,127,23,5,15,4,0,7,21,0,1,5.521739,4.600000,0.913043,0.304348
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,139,22,3,10,1,0,7,22,0,0,6.318182,7.333333,1.000000,0.318182
2,@swonderlin Can not wait for #iPad 2 also. The...,Positive emotion,79,15,2,7,1,0,6,15,0,0,5.266667,7.500000,1.000000,0.400000
3,@sxsw I hope this year's festival isn't as cra...,Negative emotion,82,15,2,2,1,0,5,12,0,0,5.466667,7.500000,0.800000,0.333333
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,131,17,1,14,1,0,1,17,0,0,7.705882,17.000000,1.000000,0.058824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8716,Ipad everywhere. #SXSW {link},Positive emotion,29,4,2,5,1,0,0,4,0,0,7.250000,2.000000,1.000000,0.000000
8717,"Wave, buzz... RT @mention We interrupt your re...",No emotion toward brand or product,125,18,1,4,1,0,2,18,0,0,6.944444,18.000000,1.000000,0.111111
8718,"Google's Zeiger, a physician never reported po...",No emotion toward brand or product,145,19,3,9,2,0,2,19,0,0,7.631579,6.333333,1.000000,0.105263
8719,Some Verizon iPhone customers complained their...,No emotion toward brand or product,140,23,2,10,1,0,7,23,0,0,6.086957,11.500000,1.000000,0.304348


In [98]:
# the following code is courtesy of Dr. Praveen Gowtham
# additional argument sets cut off minimum length for tokenized text at which function converts to null string.
def process_tweet(tweet_text, min_length):
    
    # get common stop words that we'll remove during tokenization/text normalization
    stop_words = stopwords.words('english')
    stop_words += ['rt', 'link', 'apple', 'google', 'ipad', 'quot', 'iphone', 'app', 'android']

    #initialize lemmatizer
    wnl = WordNetLemmatizer()

    # helper function to change nltk's part of speech tagging to a wordnet format.
    def pos_tagger(nltk_tag):
        if nltk_tag.startswith('J'):
            return wordnet.ADJ
        elif nltk_tag.startswith('V'):
            return wordnet.VERB
        elif nltk_tag.startswith('N'):
            return wordnet.NOUN
        elif nltk_tag.startswith('R'):
            return wordnet.ADV
        else:         
            return None
   

    # lower case everything
    tweet_lower = tweet_text.lower()

    #remove mentions, hashtags, and urls, strip whitspace and breaks
    tweet_lower = re.sub(r"@[a-z0-9_]+|#[a-z0-9_]+|http\S+", "", tweet_lower).strip().replace("\r", "").replace("\n", "").replace("\t", "")
    
    
    # remove stop words and punctuations 
    tweet_norm = [x for x in word_tokenize(tweet_lower) if ((x.isalpha()) & (x not in stop_words)) ]

    #  POS detection on the result will be important in telling Wordnet's lemmatizer how to lemmatize
    
    # creates list of tuples with tokens and POS tags in wordnet format
    wordnet_tagged = list(map(lambda x: (x[0], pos_tagger(x[1])), pos_tag(tweet_norm))) 

    # now we are going to have a cutoff here. any tokenized cocument with length < min length will be removed from corpus
    if len(wordnet_tagged) <= min_length:
        return ''
    else:
         # rejoins lemmatized sentence 
         tweet_norm = " ".join([wnl.lemmatize(x[0], x[1]) for x in wordnet_tagged if x[1] is not None])
         return tweet_norm.split(" ")

In [99]:
df['tweet_tokens'] = df['tweet_text'].apply(process_tweet, args = [5])

In [100]:
df = df.drop(df[df['tweet_tokens'] == ""].index)

In [101]:
df

Unnamed: 0,tweet_text,emotion,char_count,word_count,sent_count,capital_char_count,capital_word_count,quoted_word_count,stopword_count,unique_word_count,htag_count,mention_count,avg_wordlength,avg_sentlength,unique_vs_words,stopwords_vs_words,tweet_tokens
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,127,23,5,15,4,0,7,21,0,1,5.521739,4.600000,0.913043,0.304348,"[hrs, tweet, dead, need, upgrade, plugin, stat..."
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,139,22,3,10,1,0,7,22,0,0,6.318182,7.333333,1.000000,0.318182,"[know, awesome, likely, appreciate, design, al..."
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,131,17,1,14,1,0,1,17,0,0,7.705882,17.000000,1.000000,0.058824,"[great, stuff, fri, mayer, tim, tech, amp, mat..."
7,"#SXSW is just starting, #CTIA is around the co...",Positive emotion,138,28,1,8,2,0,15,24,0,0,4.928571,28.000000,0.857143,0.535714,"[start, corner, hop, skip, jump, good, time, fan]"
9,Counting down the days to #sxsw plus strong Ca...,Positive emotion,88,16,1,3,0,0,5,16,0,0,5.500000,16.000000,1.000000,0.312500,"[counting, day, strong, canadian, dollar, mean..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8713,I've always used Camera+ for my iPhone b/c it ...,Positive emotion,137,25,3,13,1,0,8,23,0,0,5.480000,8.333333,0.920000,0.320000,"[always, use, image, stabilizer, mode, suggest..."
8714,Google says: want to give a lightning talk to ...,No emotion toward brand or product,140,24,2,7,1,0,8,21,0,0,5.833333,12.000000,0.875000,0.333333,"[say, want, give, lightning, talk, audience, t..."
8717,"Wave, buzz... RT @mention We interrupt your re...",No emotion toward brand or product,125,18,1,4,1,0,2,18,0,0,6.944444,18.000000,1.000000,0.111111,"[wave, buzz, interrupt, regularly, schedule, g..."
8718,"Google's Zeiger, a physician never reported po...",No emotion toward brand or product,145,19,3,9,2,0,2,19,0,0,7.631579,6.333333,1.000000,0.105263,"[zeiger, physician, never, report, potential, ..."


In [102]:
df['emotion']

0                         Negative emotion
1                         Positive emotion
4                         Positive emotion
7                         Positive emotion
9                         Positive emotion
                       ...                
8713                      Positive emotion
8714    No emotion toward brand or product
8717    No emotion toward brand or product
8718    No emotion toward brand or product
8719    No emotion toward brand or product
Name: emotion, Length: 6102, dtype: object

In [103]:
labelencoder = LabelEncoder()

df['sentiment'] =  labelencoder.fit_transform(df['emotion'])

df

Unnamed: 0,tweet_text,emotion,char_count,word_count,sent_count,capital_char_count,capital_word_count,quoted_word_count,stopword_count,unique_word_count,htag_count,mention_count,avg_wordlength,avg_sentlength,unique_vs_words,stopwords_vs_words,tweet_tokens,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,127,23,5,15,4,0,7,21,0,1,5.521739,4.600000,0.913043,0.304348,"[hrs, tweet, dead, need, upgrade, plugin, stat...",0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,139,22,3,10,1,0,7,22,0,0,6.318182,7.333333,1.000000,0.318182,"[know, awesome, likely, appreciate, design, al...",2
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,131,17,1,14,1,0,1,17,0,0,7.705882,17.000000,1.000000,0.058824,"[great, stuff, fri, mayer, tim, tech, amp, mat...",2
7,"#SXSW is just starting, #CTIA is around the co...",Positive emotion,138,28,1,8,2,0,15,24,0,0,4.928571,28.000000,0.857143,0.535714,"[start, corner, hop, skip, jump, good, time, fan]",2
9,Counting down the days to #sxsw plus strong Ca...,Positive emotion,88,16,1,3,0,0,5,16,0,0,5.500000,16.000000,1.000000,0.312500,"[counting, day, strong, canadian, dollar, mean...",2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8713,I've always used Camera+ for my iPhone b/c it ...,Positive emotion,137,25,3,13,1,0,8,23,0,0,5.480000,8.333333,0.920000,0.320000,"[always, use, image, stabilizer, mode, suggest...",2
8714,Google says: want to give a lightning talk to ...,No emotion toward brand or product,140,24,2,7,1,0,8,21,0,0,5.833333,12.000000,0.875000,0.333333,"[say, want, give, lightning, talk, audience, t...",1
8717,"Wave, buzz... RT @mention We interrupt your re...",No emotion toward brand or product,125,18,1,4,1,0,2,18,0,0,6.944444,18.000000,1.000000,0.111111,"[wave, buzz, interrupt, regularly, schedule, g...",1
8718,"Google's Zeiger, a physician never reported po...",No emotion toward brand or product,145,19,3,9,2,0,2,19,0,0,7.631579,6.333333,1.000000,0.105263,"[zeiger, physician, never, report, potential, ...",1


In [104]:
df.head()

Unnamed: 0,tweet_text,emotion,char_count,word_count,sent_count,capital_char_count,capital_word_count,quoted_word_count,stopword_count,unique_word_count,htag_count,mention_count,avg_wordlength,avg_sentlength,unique_vs_words,stopwords_vs_words,tweet_tokens,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,127,23,5,15,4,0,7,21,0,1,5.521739,4.6,0.913043,0.304348,"[hrs, tweet, dead, need, upgrade, plugin, stat...",0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,139,22,3,10,1,0,7,22,0,0,6.318182,7.333333,1.0,0.318182,"[know, awesome, likely, appreciate, design, al...",2
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,131,17,1,14,1,0,1,17,0,0,7.705882,17.0,1.0,0.058824,"[great, stuff, fri, mayer, tim, tech, amp, mat...",2
7,"#SXSW is just starting, #CTIA is around the co...",Positive emotion,138,28,1,8,2,0,15,24,0,0,4.928571,28.0,0.857143,0.535714,"[start, corner, hop, skip, jump, good, time, fan]",2
9,Counting down the days to #sxsw plus strong Ca...,Positive emotion,88,16,1,3,0,0,5,16,0,0,5.5,16.0,1.0,0.3125,"[counting, day, strong, canadian, dollar, mean...",2


In [105]:
pd.to_pickle(df, '../data/df.pkl')

In [106]:
processed_df = df

In [107]:
processed_df.head()

Unnamed: 0,tweet_text,emotion,char_count,word_count,sent_count,capital_char_count,capital_word_count,quoted_word_count,stopword_count,unique_word_count,htag_count,mention_count,avg_wordlength,avg_sentlength,unique_vs_words,stopwords_vs_words,tweet_tokens,sentiment
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Negative emotion,127,23,5,15,4,0,7,21,0,1,5.521739,4.6,0.913043,0.304348,"[hrs, tweet, dead, need, upgrade, plugin, stat...",0
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Positive emotion,139,22,3,10,1,0,7,22,0,0,6.318182,7.333333,1.0,0.318182,"[know, awesome, likely, appreciate, design, al...",2
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Positive emotion,131,17,1,14,1,0,1,17,0,0,7.705882,17.0,1.0,0.058824,"[great, stuff, fri, mayer, tim, tech, amp, mat...",2
7,"#SXSW is just starting, #CTIA is around the co...",Positive emotion,138,28,1,8,2,0,15,24,0,0,4.928571,28.0,0.857143,0.535714,"[start, corner, hop, skip, jump, good, time, fan]",2
9,Counting down the days to #sxsw plus strong Ca...,Positive emotion,88,16,1,3,0,0,5,16,0,0,5.5,16.0,1.0,0.3125,"[counting, day, strong, canadian, dollar, mean...",2


In [108]:
#This is an extremely important cell that must remain in the final version
la_vaca = 'mooo'