# Sentiment Analysis - Emotion Detection

In [3]:
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib as plt
import graphviz
from pprint import pprint
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer, TweetTokenizer
from nltk import pos_tag

from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.svm import SVC, LinearSVC
from sklearn.naive_bayes import MultinomialNB, BernoulliNB

%matplotlib inline

## Import data

**Source:** Crowdflower https://www.crowdflower.com/data-for-everyone (Sentiment Analysis: Emotion in Text)

**Download:** https://www.crowdflower.com/wp-content/uploads/2016/07/text_emotion.csv

In [4]:
data_path = 'data/text_emotion.csv'

In [5]:
df_init = pd.read_csv(data_path)

## Explore data

In [None]:
df_init

In [782]:
df_init.groupby('sentiment')['content'].count().sort_values()

sentiment
anger          110
boredom        179
enthusiasm     759
empty          827
hate          1323
relief        1526
fun           1776
surprise      2187
love          3842
sadness       5165
happiness     5209
worry         8459
neutral       8638
Name: content, dtype: int64

In [783]:
emotions_set = set(df_init['sentiment'])

In [784]:
emotions_set

{'anger',
 'boredom',
 'empty',
 'enthusiasm',
 'fun',
 'happiness',
 'hate',
 'love',
 'neutral',
 'relief',
 'sadness',
 'surprise',
 'worry'}

In [785]:
keep_emotions_list = [
    'anger',
#     'boredom',
#     'empty',
#     'enthusiasm',
#     'fun',
    'happiness',
#     'hate',
#     'love',
#     'neutral',
#     'relief',
    'sadness',
    'surprise',
#     'worry'
]

In [786]:
df = df_init[df_init['sentiment'].isin(keep_emotions_list)]

In [None]:
df 

## Get list of texts

In [788]:
text_list = df.content.tolist()

In [None]:
text_list

## Get list of labels / emotions

In [790]:
labels_list = df.sentiment.tolist()

In [None]:
labels_list

## Model of Emotion (Plutchik)

In [792]:
emotions_list_plutchik = ['anger', 'anticipation', 'digust', 'fear', 'joy', 'sadness', 'surprise', 'trust']

## Handle negations

### Define negations

In [793]:
negations_list = ['never','no','nothing','nowhere','noone','none','not','havent','hasnt','hadnt','cant',
                  'couldnt','shouldnt','wont','wouldnt','dont','doesnt','didnt','isnt','arent','aint',
                 'wasnt']

negations_list.extend(["haven't","hasn't","hadn't","can't","couldn't","shouldn't","won't","wouldn't",
                       "don't","doesn't","didn't","isn't","aren't",
                        "wasn't"])

### Append _NEG to every word immediately after a negation

In [794]:
def handle_negations(tokens):
    new_tokens = []
    prev_neg = False
    for token in tokens:
        current_token = token

        if prev_neg == True:
            current_token += "_NEG"
            prev_neg = False

        if token in negations_list:
            prev_neg = True

        new_tokens.append(current_token)
    return new_tokens

### Part-Of Speech tagging

In [835]:
# universal_tags_keep = ['VERB', 'NOUN', 'ADJ']
# VERB (verbs), NOUN (nouns), PRON (pronouns), ADJ (adjectives), ADV (adverbs), ADP (adpositions ), CONJ (conjunctions), DET (determiners), NUM (cardinal numbers), PRT (particles ), . (punctuation) and X (other).

## Tokenization

In [836]:
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
def custom_tokenizer(text):
    # Tokenize text
    tokens = tokenizer.tokenize(text)
    
    # Handle negations
    tokens_neg = handle_negations(tokens)
    
    # POS tagging
    tokens_tagged = pos_tag(tokens, tagset='universal')
    
    tokens = [ (tokens_neg[i],tagged) for token in enumerate(tokens)]
    return tokens

## Preprocessing

In [834]:
def custom_preprocessor(text):
    # Remove URLs
    text = re.sub(r'http\S+|https\S+', '', text)
    return text

## Feature Extraction

### Options

In [814]:
vectorizer_options = dict(
#     min_df = 0.1,
#     max_df = 0.7,
#     stop_words = 'english',
    
    lowercase = False,
    tokenizer = custom_tokenizer,
    preprocessor = custom_preprocessor,
    ngram_range = (1, 2),
#     binary = True
)

# vectorizer = CountVectorizer(**vectorizer_options)
vectorizer = TfidfVectorizer(**vectorizer_options)

### Create feature vector

In [812]:
def create_featurevector(documents):
    tfidf_matrix = vectorizer.fit_transform(documents)
    return tfidf_matrix, vectorizer

In [798]:
train, vectorizer = create_featurevector(text_list)

In [799]:
train.shape

(12671, 104229)

In [None]:
vectorizer.get_feature_names()

## Encode Labels / Emotions

In [801]:
lbl = LabelEncoder()

In [802]:
train_target = lbl.fit_transform(labels_list)

In [803]:
target_names = lbl.classes_.tolist()

## Supervised learning - Classification

## Naive Bayes

### Multinomial Naive Bayes

In [804]:
clf_mnb = MultinomialNB()

### Bernoulli Naive Bayes

In [805]:
clf_bnb = BernoulliNB()

## Support Vector Machine (SVM)

### Linear kernel

In [806]:
clf_lsvc = LinearSVC()

## Model evaluation

### Cross-validation

#### Best Results
- 0.54 SVM bigrams TFIDF ('anger', 'happiness', 'sadness', 'surprise', 'neutral')
- 0.66 SVM bigrams TFIDF ('anger', 'happiness', 'sadness', 'surprise')
- 0.67 SVM bigrams TFIDF ('anger', 'happiness', 'sadness', 'surprise') + _NEG

####  Mean score and 95% confidence interval

In [807]:
scores = cross_val_score(clf_lsvc, train, train_target, cv=10, scoring='accuracy')
print scores
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

[ 0.58596215  0.60015773  0.69873817  0.70268139  0.71135647  0.71428571
  0.71507498  0.70537125  0.65323855  0.5715415 ]
Accuracy: 0.67 (+/- 0.11)


#### Classification report

In [808]:
y_pred = cross_val_predict(clf_lsvc, train, train_target, cv=10)

In [829]:
print(classification_report(train_target, y_pred, target_names=target_names))

             precision    recall  f1-score   support

      anger       0.00      0.00      0.00       110
  happiness       0.68      0.78      0.73      5209
    sadness       0.68      0.78      0.73      5165
   surprise       0.44      0.15      0.23      2187

avg / total       0.63      0.67      0.63     12671

