In [1]:
# Imports. Nothing to see here.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import string

from imblearn.over_sampling import SMOTE
from gensim.models import Word2Vec
from collections import Counter

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import FreqDist, word_tokenize
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, plot_confusion_matrix, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

import pickle

# EDA

### Here we check out the data. We explore, look for inconsistencies, and use what we see to define our modeling plan. We start with general data cleaning practice, checking for duplicates and missing values. As we get deeper into the data, our techniques get more specific to the data.

In [2]:
# Import and inspect data

data = pd.read_csv('data.csv', encoding = "ISO-8859-1")
raw = pd.read_csv('data.csv', encoding = "ISO-8859-1")

data.head()

Unnamed: 0,tweet_text,emotion_in_tweet_is_directed_at,is_there_an_emotion_directed_at_a_brand_or_product
0,.@wesley83 I have a 3G iPhone. After 3 hrs twe...,iPhone,Negative emotion
1,@jessedee Know about @fludapp ? Awesome iPad/i...,iPad or iPhone App,Positive emotion
2,@swonderlin Can not wait for #iPad 2 also. The...,iPad,Positive emotion
3,@sxsw I hope this year's festival isn't as cra...,iPad or iPhone App,Negative emotion
4,@sxtxstate great stuff on Fri #SXSW: Marissa M...,Google,Positive emotion


In [3]:
# Read content. These are a dataset of Tweets from SXSW in Austin from 2011.

data.iloc[0,0]


'.@wesley83 I have a 3G iPhone. After 3 hrs tweeting at #RISE_Austin, it was dead!  I need to upgrade. Plugin stations at #SXSW.'

In [4]:
# Check for missing values in the primary column and remove the one we find.

print(data['tweet_text'].isna().sum())

data = data[~data['tweet_text'].isna()]

1


In [5]:
# Check for duplicated rows and preserve unique entries.

a = len(data)
data = data.drop_duplicates()
b = len(data)
print('# Number of duplicate rows dropped: {}'.format(a-b))

# Number of duplicate rows dropped: 22


In [6]:
# Explore and simplify. We're defining our project goals more closely here.

print(data['emotion_in_tweet_is_directed_at'].value_counts())

company = {'iPad': 'Apple',
            'Apple': 'Apple',
            'iPad or iPhone App': 'Apple',
            'Google': 'Google',
            'iPhone': 'Apple',
            'Other Google product or service': 'Google',
            'Android App': 'Google',
            'Android': 'Google',
            'Other Apple product or service': 'Apple'}
            
data['emotion_in_tweet_is_directed_at'] = data['emotion_in_tweet_is_directed_at'].map(company)

iPad                               945
Apple                              659
iPad or iPhone App                 469
Google                             428
iPhone                             296
Other Google product or service    293
Android App                         80
Android                             77
Other Apple product or service      35
Name: emotion_in_tweet_is_directed_at, dtype: int64


In [7]:
# Simplify column names for convenience.

data.rename(columns={'tweet_text': 'text', 'emotion_in_tweet_is_directed_at': 'brand', 'is_there_an_emotion_directed_at_a_brand_or_product': 'feelings'}, inplace=True)

In [8]:
# Here we inspect and encode the labels for our target column. We also define the boundaries of our task, narrowing our dataset to entries with clear positive or negative expressions.

print('Original dataset values:\n', data['feelings'].value_counts(), '\n')

feels = {'Negative emotion': 0,
        'Positive emotion': 1,
        'No emotion toward brand or product': 2,
        "I can't tell": 3}

data['feelings'] = data['feelings'].map(feels)

data = data[data['feelings'] <= 1]

print('Encoded and chosen dataset values:\n', data['feelings'].value_counts(), '\n')

print('Total entries:', len(data))

Original dataset values:
 No emotion toward brand or product    5375
Positive emotion                      2970
Negative emotion                       569
I can't tell                           156
Name: feelings, dtype: int64 

Encoded and chosen dataset values:
 1    2970
0     569
Name: feelings, dtype: int64 

Total entries: 3539


# Now the NLP Begins

### Our data is clean and ready to be processed. Now we process. We start by creating tools, then use the tools on the Tweet text to reshape the data into a manageable and meaningful form.

In [9]:
# Create tools to process the Tweets. We use Regex to narrow our data to words, and stopwords to return only the words that are significant.

tokenizer = RegexpTokenizer(r'\w+')

stops = stopwords.words('english')
stops += list(string.punctuation)
stops.extend(['sxsw', 'sxswi', 'quot', 'mention', 'link', 'rt', 'amp', 'http', 'sxswrt', 'google', 'googles', 'app', 'apps', 'android', 'austin', 'quotgoogle', 'new', 'today', 'one', 'apple', 'ipad', 'iphone', 'ipad2', 'apples', 'quotapple','store'])

In [10]:
# Use our new tools!

data['tokens'] = data['text'].apply(tokenizer.tokenize)

# This is inefficient and can be worked to be better. First pass eliminates stopwords that are already lowercase, and turns the rest of the words lowercase. Second pass removes the remaining stopwords. It also leaves our data in list form, which can be an issue later. Our lemmatizing function is built to work with a list. We fix the issue before we Count Vectorize or TD-IDF in the main function below.
data['tokens'] = data['tokens'].apply(lambda x: [word.lower() for word in x if word not in stops])
data['tokens'] = data['tokens'].apply(lambda x: [word.lower() for word in x if word not in stops])

data
copy = data

# Creating Functions for User Interface

### It is important that we create functions for each of the modeling steps. We will be selecting which ones to use individually later and they all need to be defined before we know which ones will be used. It is extra important that they all work well individually and also do not interfere with each other.

In [11]:
# Count Vectorize or TF-IDF. Our first and most vital choice.
def CV(X_train, X_test):
    count_vectorizer = CountVectorizer()
    X_train_counts = count_vectorizer.fit_transform(X_train)
    X_test_counts = count_vectorizer.transform(X_test)
    return X_train_counts, X_test_counts

def tf_idf(X_train, X_test):
    tfidf = TfidfVectorizer()
    X_train_counts = tfidf.fit_transform(X_train)
    X_test_counts = tfidf.transform(X_test)
    return X_test_counts, X_train_counts


In [12]:
# Lemmatizing will or won't happen. Two functions, one nested inside the other.
def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(w) for w in text]

def lemmatize():
    data['lemm'] = data['tokens'].apply(lemmatize_text)
    data['lemm'] = data['lemm'].apply(lambda x: ' '.join(x))

In [13]:
# SMOTE either will or will not run.
def smote(X_train_counts, y_train):
    smote = SMOTE()
    X_train_counts, y_train = smote.fit_sample(X_train_counts, y_train)
    return X_train_counts, y_train

# Train Test Split. The col variable is important and will be different depending on whether we lemmatize.
def TTS(col):
    X_train, X_test, y_train, y_test = train_test_split(data[col], data['feelings'])
    return X_train, X_test, y_train, y_test

In [14]:
# Logistic Regression
def logreg(X_train_counts, y_train, X_test_counts):
    clf = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', 
                         multi_class='multinomial', n_jobs=-1, random_state=40)
    clf.fit(X_train_counts, y_train)
    y_predicted_counts = clf.predict(X_test_counts)
    return y_predicted_counts

In [15]:
# Random Forest
def rf(X_train_counts, y_train, X_test_counts):
    rf = RandomForestClassifier()
    rf.fit(X_train_counts, y_train)
    y_predicted_counts = rf.predict(X_test_counts)
    return y_predicted_counts

In [16]:
# Multinomial Naive Bayes
def multiNB(X_train_counts, y_train, X_test_counts):
    nb = MultinomialNB()
    nb.fit(X_train_counts, y_train)
    y_predicted_counts = nb.predict(X_test_counts)
    return y_predicted_counts

In [17]:
# Metrics that will work with any model selected, along with a confusion matrix.
def classify(y_test, y_predicted_counts):
    print('\n\nClassification Report - TEST')
    print('--------------------------------------------------------------------------')
    print(classification_report(y_test, y_predicted_counts))
    print('--------------------------------------------------------------------------')
    print('Confusion Matrix - TEST')
    print('--------------------------------------------------------------------------')
    print(pd.crosstab(y_test, y_predicted_counts, rownames=['True'], colnames=['Predicted'], margins=True))
    print('--------------------------------------------------------------------------')

# Our Modeling System!

### We made the choice to create an interactive modeling system. The user will be asked to supply inputs and choose his own path to model the data. First, we choose to Count Vectorize or TF-IDF, whether or not to Lemmatize or to SMOTE. There are 8 possible combinations of choices, each accounted for. We've even processed every combination beforehand to have recommended modeling depending on the user's choices. Modeling is fun again!

In [18]:
autoclean = True

In [19]:
def user_models(data):
    # Canned response for every response that isn't '1' or '2.'
    jerk = "Don't waste my time. Try again wiseguy."

    # Prepare some variables.
    col = None
    model = None
    lemm = None
    smt = None
    reco = None
    picks = []
    choices = [logreg, rf, multiNB]
    t = None
    proceed = 1

    # User choices will influence recommended model selection.
    log = [[1, 2, 2], [2,2,2]]
    forest = [[1,1,2],[2,1,2]]
    NB = [[1,2,1],[2,2,1],[1,1,1,],[2,1,1]]
    
    ################ For Advanced Interface #####################
    print('THE DATA SCIENCE PROCESS! \nLET\S MODEL!\n')
    print("Let's model our data! Where do you want to begin?\n\nType '1' start modeling our data.\nType '2' if you want to be lazy.\n")
    lazy = input()
    if lazy not in ['1','2']:
        print(jerk)
    elif lazy == '2':
        print("\nYeah, I get it. I'll do all the work.\n")
        lemmatize()
        t = 2
        col = 'lemm'
        X_train, X_test, y_train, y_test = TTS(col)
        X_test_counts, X_train_counts = tf_idf(X_train, X_test)
        X_train_counts, y_train = smote(X_train_counts, y_train)
        mod = choices[t]
        print('-------------------------------------------------------------------------------------------------------------')
        print('-------------------------------------------------------------------------------------------------------------')
        print('Our best model uses TF-IDF, SMOTE, Lemmatization, and is created using Multinomial Naive Bayes, you lazy bum.')
        print('We chose this model because of its high recall combined with a high overall accuracy.')
        print('-------------------------------------------------------------------------------------------------------------')
        print('-------------------------------------------------------------------------------------------------------------')
        y_predicted_counts = mod(X_train_counts, y_train, X_test_counts)
        classify(y_test, y_predicted_counts)
        return

    # Here we ask three questions, leading to 8 possible combinations. The answers affect what functions are run, and also are saved in a list and used to compare for model selection.
    while proceed == 1:
        print('How would you like to analyze the data?\n\nType "1" to Count Vectorize or "2" to implement TF-IDF.\nIf you don\'t know the difference, type "3."\n\n')
        model = input()
        if int(model) < 3:
            picks.append(int(model))
        if model not in ['1', '2', '3']:
            return print(jerk)
        if model == '3':
            print('\nCount Vectorizing creates a matrix consisting of a count of individual tokens across a single data entry. \nTerm Frequency-Inverse Document Frequency creates a matrix but assigns each token a weight according to its importance weighted by its frequency across documents.\n')
        else:
            proceed = 0

    while proceed == 0:
        print('Would you like to lemmatize?\n\nType "1" for Yes or "2" for No.\nIf you don\'t know the difference, type "3"\n\n')
        lemm  = input()
        if int(lemm) < 3:
            picks.append(int(lemm))
        if lemm not in ['1', '2','3']:
            return print(jerk)
        elif lemm == '3':
            print('\nLemmatization transforms a word into its root form: i.e. running, ran, run all become the word run.\n')
        else:
            proceed = 1

    while proceed == 1:
        print('Would you like to SMOTE?\n\nType "1" for Yes or "2" for No.\nIf you don\'t know the difference, type "3"\n\n')
        smt = input()
        if int(smt) < 3:
            picks.append(int(smt))
        if smt not in ['1','2','3']:
            return print(jerk)
        if smt == '3':
            print('\nSMOTE is used to address class imbalance. It creates artificial data in the test dataset similar to the qualitites of the minority class.\n')
        else:
            proceed = 0

    # Lemmatize function runs or not. The function requires the data to be in list form. The function will remove the list form.
    # If not lemmatized, data['tokens'] column is taken out of list form.
    if lemm == '1':
        col = 'lemm'
        lemmatize()
        print('Lemmatized changed this many rows:', (len(data) - (sum(data['tokens'] == data['lemm']))),'\n')
    elif lemm == '2':
        col = 'tokens'
        data['tokens'] = data['tokens'].apply(lambda x: ' '.join(x))

    # Here we compare the user's choices to our previously established lists. They are organized and hand picked from personal results to produce the best results.
    if picks in log:
        t = 0
    elif picks in forest:
        t = 1
    elif picks in NB:
        t = 2
    print(picks)

    # Train Test Split. Our column choice is based on lemmatization choice.
    X_train, X_test, y_train, y_test = TTS(col)

    # Count Vectorize or TF-IDF.
    if model == '1':
        X_train_counts, X_test_counts = CV(X_train, X_test)
    elif model == '2':
        X_test_counts, X_train_counts = tf_idf(X_train, X_test)

    # SMOTE or not.
    if smt == '1':
        X_train_counts, y_train = smote(X_train_counts, y_train)
 
    # Now that choices are made and we've picked our recommendation, we give the user a choice. They have control of the entire modeling process.
    print('Would you like to pick your model or use our recomendation?\nType "1" to choose or "2" to let us.\n')
    reco = input()
    if reco not in ['1','2']:
        return print(jerk)
    elif reco == '1':
        print('\n\n"1" for Logistic Regression\n"2" for Random Forest\n"3" for Multinomial Naive Bayes')
        t = (int(input()) - 1)

    # t variable is either recommended by us or chosen by user. It picks an option from a list of models.
    mod = choices[t]
    if mod == logreg:
        print('\n\nLogistic Regression Report')
    elif mod == rf:
        print('\n\nRandom Forest Report')
    elif mod == multiNB:
        print('\n\nMultinomial Naive Bayes')

    # We run the chosen model and print out metrics for evaluation, along with a confusion matrix.
    y_predicted_counts = mod(X_train_counts, y_train, X_test_counts)
    classify(y_test, y_predicted_counts)


In [20]:
user_models(data)

THE DATA SCIENCE PROCESS! 
LET\S MODEL!

Let's model our data! Where do you want to begin?

Type '1' start modeling our data.
Type '2' if you want to be lazy.

Don't waste my time. Try again wiseguy.
How would you like to analyze the data?

Type "1" to Count Vectorize or "2" to implement TF-IDF.
If you don't know the difference, type "3."




ValueError: invalid literal for int() with base 10: ''

# Our Favorite Models

### We took an extremely scientific approach to analyzing our models: Predict, Experiment, Observe, Record the Results. With 3 possible models, a choice of processing the data, a choice to lemmatize and a choice to SMOTE, we end up with 24 (3 x 2 x 2 x 2) unique models. Observing the dataset to be imbalanced in favor of positive sentiments, we decided to focus primarily on the recall of the minority class. In practical terms, we are attempting to identify as many unhappy users of Google and Apple products as we can and prefer to err on the side of classifying extra unhappy users as oppoosed to missing them. 

### We run all 24 models individually and record their most important metrics: Accuracy, Precision, Recall, and F1 Score. Looking at the spreadsheet of results, a few things jump out at us. Excepting several outliers of lower recall, most of our recall scores are in the 0.40 - 0.60 range. Thus, we decided to factor in an overall metric, Accuracy when making our decisions. In each combination we use these metrics to choose our best model. In many categories it was a close decision and could have easily been justified to go in another direction.

![spreadsheet](spreadsheetTop.png)

### Our favorite model overall used Term Frequency–Inverse Document Frequency to transform our data, Lemmatization, Smoting, and used Multinomial Naive Bayes to model it. We used all the bells and whistles for our best result. It produced our third highest recall score, and a higher overall accuracy than the two models with higher recall. Again, it was a tough call and other model choices could have been justified just as well.

![spreadsheet](spreadBottom.png)

### As highlighted above, recall was high in this model. The confusion matrix shows 86 correctly identified unhappy users, 51 unhappy users missed, and 126 happy users identified as unhappy. 51 missed users is tied with one other model as the lowest result we found. 126 happy users misidentified as unhappy is high, higher than we would like, but with over 600 correctly identified and a low number of unhappy users missed, we have hit our project target. The goal was to find unhappy users and overall accuracy and we are willing to accept a higher number of incorrectly identified users as long as they are happy. In this scenario, we would use these results to make unhappy users' experience better and the happy users may become even more positive.

In [None]:
# data.to_pickle("./exp")

In [None]:
print(data)