In [1]:
# Imports. nothing to see here.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import string

from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec
from collections import Counter

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, plot_confusion_matrix, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# EDA

In [2]:
# Import and inspect data

data = pd.read_csv('data.csv', encoding = "ISO-8859-1")

data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [3]:
# Read content. These are a dataset of Tweets from SXSW in Austin from 2011.

data.iloc[0,0]

'.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'

In [4]:
# Check for missing values in the primary column and remove the one we find.

print(data['tweet_text'].isna().sum())

data = data[~data['tweet_text'].isna()]

1


In [5]:
# Check for duplicated rows and preserve unique entries.

a = len(data)
data = data.drop_duplicates()
b = len(data)
print('# Number of duplicate rows dropped: {}'.format(a-b))

# Number of duplicate rows dropped: 22


In [6]:
# Explore and simplify. We're defining our project goals more closely here.

print(data['emotion_in_tweet_is_directed_at'].value_counts())

company = {'iPad': 'Apple',
            'Apple': 'Apple',
            'iPad or iPhone App': 'Apple',
            'Google': 'Google',
            'iPhone': 'Apple',
            'Other Google product or service': 'Google',
            'Android App': 'Google',
            'Android': 'Google',
            'Other Apple product or service': 'Apple'}
            
data['emotion_in_tweet_is_directed_at'] = data['emotion_in_tweet_is_directed_at'].map(company)

iPad                               945
Apple                              659
iPad or iPhone App                 469
Google                             428
iPhone                             296
Other Google product or service    293
Android App                         80
Android                             77
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64


In [7]:
# Simplify column names for convenience

data.rename(columns={'tweet_text': 'text', 'emotion_in_tweet_is_directed_at': 'brand', 'is_there_an_emotion_directed_at_a_brand_or_product': 'feelings'}, inplace=True)

In [8]:
# Here we inspect and encode the labels for our target column. We also define the boundaries of our task, narrowing our dataset to entries with clear positive or negative expressions.

print('Original dataset values:\n', data['feelings'].value_counts(), '\n')

feels = {'Negative emotion': 0,
        'Positive emotion': 1,
        'No emotion toward brand or product': 2,
        "I can't tell": 3}

data['feelings'] = data['feelings'].map(feels)

data = data[data['feelings'] <= 1]

print('Encoded and chosen dataset values:\n', data['feelings'].value_counts(), '\n')

print('Total entries:', len(data))

Original dataset values:
 No emotion toward brand or product    5375
Positive emotion                      2970
Negative emotion                       569
I can't tell                           156
Name: feelings, dtype: int64 

Encoded and chosen dataset values:
 1    2970
0     569
Name: feelings, dtype: int64 

Total entries: 3539


# Now the NLP begins.

In [9]:
# Create tools to process the Tweets.

tokenizer = RegexpTokenizer(r'\w+')

stops = stopwords.words('english')
stops += list(string.punctuation)
stops.extend(['sxsw', 'sxswi', 'quot', 'mention', 'link', 'rt', 'amp', 'http', 'sxswrt', 'google', 'googles', 'app', 'apps', 'android', 'austin', 'quotgoogle', 'new', 'today', 'one', 'apple', 'ipad', 'iphone', 'ipad2', 'apples', 'quotapple','store'])

In [10]:
# Use our new tools!
############### Tokens still in list form. Must be addressed #############

data['tokens'] = data['text'].apply(tokenizer.tokenize)

data['tokens'] = data['tokens'].apply(lambda x: [word.lower() for word in x if word not in stops])
data['tokens'] = data['tokens'].apply(lambda x: [word.lower() for word in x if word not in stops])

data

Unnamed: 0,text,brand,feelings,tokens
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Apple,0,"[wesley83, 3g, 3, hrs, tweeting, rise_austin, ..."
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Apple,1,"[jessedee, know, fludapp, awesome, likely, app..."
2,@swonderlin Can not wait for #iPad 2 also. The...,Apple,1,"[swonderlin, wait, 2, also, sale]"
3,@sxsw I hope this year's festival isn't as cra...,Apple,0,"[hope, year, festival, crashy, year]"
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,1,"[sxtxstate, great, stuff, fri, marissa, mayer,..."
...,...,...,...,...
9077,@mention your PR guy just convinced me to swit...,Apple,1,"[pr, guy, convinced, switch, back, great, cove..."
9079,&quot;papyrus...sort of like the ipad&quot; - ...,Apple,1,"[papyrus, sort, like, nice, lol, lavelle]"
9080,Diller says Google TV &quot;might be run over ...,Google,0,"[diller, says, tv, might, run, playstation, xb..."
9085,I've always used Camera+ for my iPhone b/c it ...,Apple,1,"[always, used, camera, b, c, image, stabilizer..."


# Creating functions for user interface

In [11]:
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    return [lemmatizer.lemmatize(w) for w in text]

def lemmatize():
    data['lemm'] = data['tokens'].apply(lemmatize_text)
    data['lemm'] = data['lemm'].apply(lambda x: ' '.join(x))

In [12]:
def CV(X_train, X_test):
    count_vectorizer = CountVectorizer()
    X_train_counts = count_vectorizer.fit_transform(X_train)
    X_test_counts = count_vectorizer.transform(X_test)
    return X_train_counts, X_test_counts

def tf_idf(X_train, X_test):
    tfidf = TfidfVectorizer()
    X_train_counts = tfidf.fit_transform(X_train)
    X_test_counts = tfidf.transform(X_test)
    return X_test_counts, X_train_counts


In [13]:
def smote(X_train_counts, y_train):
    smote = SMOTE()
    X_train_counts, y_train = smote.fit_sample(X_train_counts, y_train)
    return X_train_counts, y_train

def TTS(col):
    X_train, X_test, y_train, y_test = train_test_split(data[col], data['feelings'])
    return X_train, X_test, y_train, y_test

In [14]:
def logreg(X_train_counts, y_train, X_test_counts):
    clf = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', 
                         multi_class='multinomial', n_jobs=-1, random_state=40)
    clf.fit(X_train_counts, y_train)
    y_predicted_counts = clf.predict(X_test_counts)
    return y_predicted_counts

In [15]:
def rf(X_train_counts, y_train, X_test_counts):
    rf = RandomForestClassifier()
    rf.fit(X_train_counts, y_train)
    y_predicted_counts = rf.predict(X_test_counts)
    return y_predicted_counts

In [16]:
def multiNB(X_train_counts, y_train, X_test_counts):
    nb = MultinomialNB()
    nb.fit(X_train_counts, y_train)
    y_predicted_counts = nb.predict(X_test_counts)
    return y_predicted_counts

In [17]:
def classify(y_test, y_predicted_counts):
    print('Classification Report - TEST')
    print('--------------------------------------------------------------------------')
    print(classification_report(y_test, y_predicted_counts))
    # Confusion Matrix
    print('--------------------------------------------------------------------------')
    print('Confusion Matrix - TEST')
    print('--------------------------------------------------------------------------')
    print(pd.crosstab(y_test, y_predicted_counts, rownames=['True'], colnames=['Predicted'], margins=True))
    print('--------------------------------------------------------------------------')

In [18]:
def user_models(data):
    col = None
    model = None
    lemm = None
    smt = None
    jerk = "Don't waste my time. Try again wiseguy."

    print('How would you like to analyze the data?\nType "1" to Count Vectorize or "2" to implement TF-IDF.')
    model = input()
    if model not in ['1', '2']:
        return print(jerk)

    print('Would you like to lemmatize?\nType "1" for Yes or "2" for No.')
    lemm  = input()
    if lemm not in ['1', '2']:
        return print(jerk)

    print('Would you like to SMOTE?\nType "1" for Yes or "2" for No.')
    smt = input()
    if smt not in ['1','2']:
        return print(jerk)

    if lemm == '1':
        col = 'lemm'
        lemmatize()
    elif lemm == '2':
        col = 'text'

    X_train, X_test, y_train, y_test = TTS(col)
    # print(X_train)

    if model == '1':
        X_train_counts, y_test_counts = CV(X_train, X_test)
        data ['tokens'] = data['tokens'].apply(lambda x: ' '.join(x))
    elif model == '2':
        X_test_counts, X_train_counts = tf_idf(X_train, X_test)

    if smt == '1':
        X_train_counts, y_train = smote(X_train_counts, y_train)

    y_predicted_counts = multiNB(X_train_counts, y_train, X_test_counts)
    classify(y_test, y_predicted_counts)

In [19]:
user_models(data)

How would you like to analyze the data?
Type "1" to Count Vectorize or "2" to implement TF-IDF.
Would you like to lemmatize?
Type "1" for Yes or "2" for No.
Would you like to SMOTE?
Type "1" for Yes or "2" for No.
Classification Report - TEST
--------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      0.03      0.06       137
           1       0.85      1.00      0.92       748

    accuracy                           0.85       885
   macro avg       0.92      0.51      0.49       885
weighted avg       0.87      0.85      0.78       885

--------------------------------------------------------------------------
Confusion Matrix - TEST
--------------------------------------------------------------------------
Predicted  0    1  All
True                  
0          4  133  137
1          0  748  748
All        4  881  885
------------------------------------------------------------------

In [20]:
data

Unnamed: 0,text,brand,feelings,tokens
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,Apple,0,"[wesley83, 3g, 3, hrs, tweeting, rise_austin, ..."
1,@jessedee Know about @fludapp ? Awesome iPad/i...,Apple,1,"[jessedee, know, fludapp, awesome, likely, app..."
2,@swonderlin Can not wait for #iPad 2 also. The...,Apple,1,"[swonderlin, wait, 2, also, sale]"
3,@sxsw I hope this year's festival isn't as cra...,Apple,0,"[hope, year, festival, crashy, year]"
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,1,"[sxtxstate, great, stuff, fri, marissa, mayer,..."
...,...,...,...,...
9077,@mention your PR guy just convinced me to swit...,Apple,1,"[pr, guy, convinced, switch, back, great, cove..."
9079,&quot;papyrus...sort of like the ipad&quot; - ...,Apple,1,"[papyrus, sort, like, nice, lol, lavelle]"
9080,Diller says Google TV &quot;might be run over ...,Google,0,"[diller, says, tv, might, run, playstation, xb..."
9085,I've always used Camera+ for my iPhone b/c it ...,Apple,1,"[always, used, camera, b, c, image, stabilizer..."


In [21]:
X_train

NameError: name 'X_train' is not defined