# Project Part 5.1: Data Preparation - Sentiment Analysis and Understanding Scores

In [9]:
import json
import os
import pandas as pd
import numpy as np
import os
import re
import math
import string
import requests
import json
from itertools import product
from inspect import getsourcefile
from io import open

# Switch to the project directory so that the relative CSV file names used in
# the cells below resolve against it. The path is machine-specific.
os.chdir('C:\\Users\\wmj51\\Desktop\\python')
# Ask pandas not to truncate long text columns when displaying frames.
# NOTE(review): -1 is the historical "no limit" value; newer pandas releases
# expect None here instead - confirm against the installed pandas version.
pd.set_option('display.max_colwidth', -1)

## VaderSentiment (updated)

In [None]:
##Constants##

# (empirically derived mean sentiment intensity rating increase for booster words)
# B_INCR is the value stored for intensifiers and B_DECR the value stored for
# dampeners in BOOSTER_DICT below.
B_INCR = 0.293
B_DECR = -0.293

# booster/dampener 'intensifiers' or 'degree adverbs'
# http://en.wiktionary.org/wiki/Category:English_degree_adverbs
# Keys are lower-case modifiers; scalar_inc_dec() looks single tokens up here
# and _special_idioms_check() looks up the multi-word keys.

BOOSTER_DICT = \
    {"absolutely": B_INCR, "amazingly": B_INCR, "awfully": B_INCR, "completely": B_INCR, "considerably": B_INCR,
     "decidedly": B_INCR, "deeply": B_INCR, "effing": B_INCR, "enormously": B_INCR,
     "entirely": B_INCR, "especially": B_INCR, "exceptionally": B_INCR, "extremely": B_INCR,
     "fabulously": B_INCR, "flipping": B_INCR, "flippin": B_INCR,
     "fricking": B_INCR, "frickin": B_INCR, "frigging": B_INCR, "friggin": B_INCR, "fully": B_INCR, "fucking": B_INCR,
     "greatly": B_INCR, "hella": B_INCR, "highly": B_INCR, "hugely": B_INCR, "incredibly": B_INCR,
     "intensely": B_INCR, "majorly": B_INCR, "more": B_INCR, "most": B_INCR, "particularly": B_INCR,
     "purely": B_INCR, "quite": B_INCR, "really": B_INCR, "remarkably": B_INCR,
     "so": B_INCR, "substantially": B_INCR,
     "thoroughly": B_INCR, "totally": B_INCR, "tremendously": B_INCR,
     "uber": B_INCR, "unbelievably": B_INCR, "unusually": B_INCR, "utterly": B_INCR,
     "very": B_INCR,
     "almost": B_DECR, "barely": B_DECR, "hardly": B_DECR, "just enough": B_DECR,
     "kinda": B_DECR, "kindof": B_DECR, 
     "less": B_DECR, "little": B_DECR, "marginally": B_DECR, "occasionally": B_DECR, "partly": B_DECR,
     "scarcely": B_DECR, "slightly": B_DECR, "somewhat": B_DECR,
     "sorta": B_DECR, "sortof": B_DECR, "sort-of": B_DECR}


# (empirically derived mean sentiment intensity rating increase for using ALLCAPs to emphasize a word)
C_INCR = 0.733
# Factor a valence is multiplied by when a negation is detected near the word
# (see _negation_check and _least_check).
N_SCALAR = -0.74

# for removing punctuation
# Character class built from every character in string.punctuation.
REGEX_REMOVE_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation))

# Punctuation strings that SentiText strips when they are attached to the
# start or the end of a word.
PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"",
             "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"]

# Lower-case tokens that negated() treats as negations. Contractions appear
# both with and without the apostrophe.
NEGATE = \
    ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt",
     "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't",
     "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither",
     "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't",
     "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere", 
     "no","nobody", "nomore",
     "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent", 
     "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't",
     "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite",
     # additional negations: stacked contractions, misspellings and chat
     # abbreviations
     "can't've", "couldn't've", "hadn't've", "mayn't", "maynot", "mightn't've", "mustn't've",
     "needn't've", "sha'n't", "shan't've", "shouldn't've", "won't've", "wouldn't've", 
     "idk", "must'nt", "need'nt", "noes", "nobod", "np", "ought'nt", "should'nt"]
    
# check for sentiment laden idioms that do not contain lexicon words (future work, not yet implemented)
# Maps an idiom to a valence; only _sentiment_laden_idioms_check reads it.
SENTIMENT_LADEN_IDIOMS = {"cut the mustard": 2, "hand to mouth": -2,
                          "back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                          "upper hand": 1, "break a leg": 2, "no problem": 2,
                          "cooking with gas": 2, "in the black": 2, "in the red": -2,
                          "on the ball": 2, "under the weather": -2}

# check for special case idioms containing lexicon words
# Maps a lower-case two- or three-word phrase to the valence that
# _special_idioms_check substitutes when the phrase is found around a word.
SPECIAL_CASE_IDIOMS = {"the shit": 3, "the bomb": 3, "bad ass": 1.5, "yeah right": -2, 
                       "kiss of death": -1.5}


# #Static methods# #

def negated(input_words, include_nt=True):
    """
    Tell whether any of the given words signals a negation.

    :param input_words: iterable of tokens; each one is converted with
        ``str()`` and lower-cased before it is inspected
    :param bool include_nt: also treat any token containing ``n't`` as a
        negation
    :returns: ``True`` when a token is listed in ``NEGATE``, when
        ``include_nt`` is set and a token contains ``n't``, or when the first
        ``least`` is preceded by a word other than ``at``; ``False`` otherwise
    """
    lowered = [str(token).lower() for token in input_words]

    # explicit negation words
    if any(neg in lowered for neg in NEGATE):
        return True

    # contractions that are not listed explicitly
    if include_nt and any("n't" in token for token in lowered):
        return True

    # "least" negates unless it is part of "at least"
    if "least" in lowered:
        pos = lowered.index("least")
        return pos > 0 and lowered[pos - 1] != "at"

    return False


def normalize(score, alpha=15):
    """
    Squash a raw score into the closed interval [-1, 1].

    :param score: the unbounded score to normalize
    :param alpha: constant added under the square root; it approximates the
        maximum expected value
    :returns: ``score / sqrt(score**2 + alpha)``, clamped to [-1.0, 1.0]
    """
    squashed = score / math.sqrt(score * score + alpha)
    if squashed > 1.0:
        return 1.0
    if squashed < -1.0:
        return -1.0
    return squashed
    
    
def allcap_differential(words):
    """
    Report whether the input mixes ALL CAPS words with other words.

    :param list words: The words to inspect
    :returns: `True` if some but not all items in `words` are ALL CAPS
    """
    total = len(words)
    upper_count = sum(1 for token in words if token.isupper())
    # number of words that are NOT written in all caps
    non_upper = total - upper_count
    return 0 < non_upper < total


def scalar_inc_dec(word, valence, is_cap_diff):
    """
    Compute how much a preceding booster/dampener word shifts a valence.

    :param str word: the candidate modifier, in its original casing
    :param valence: valence of the sentiment word being modified; only its
        sign is used
    :param bool is_cap_diff: whether the text mixes ALL CAPS and other words
    :returns: ``0.0`` when ``word`` is not in ``BOOSTER_DICT``; otherwise the
        dictionary value, sign-flipped for a negative valence and pushed
        further by ``C_INCR`` when the modifier itself is emphasised in caps
    """
    key = word.lower()
    if key not in BOOSTER_DICT:
        return 0.0

    adjustment = BOOSTER_DICT[key]
    if valence < 0:
        adjustment *= -1

    # check if booster/dampener word is in ALLCAPS (while others aren't)
    if word.isupper() and is_cap_diff:
        adjustment += C_INCR if valence > 0 else -C_INCR
    return adjustment


class SentiText(object):
    """
    Identify sentiment-relevant string-level properties of input text.

    Attributes:
        text: the input as a ``str``
        words_and_emoticons: whitespace tokens longer than one character,
            with leading/trailing punctuation from ``PUNC_LIST`` removed
        is_cap_diff: ``True`` when some but not all tokens are ALL CAPS
    """

    def __init__(self, text):
        """
        :param text: text to analyse; a value that is not a ``str`` is
            converted with ``str()``
        """
        if not isinstance(text, str):
            # Keep the value as text. Encoding it to bytes makes the
            # str-pattern regex and the str punctuation table used below
            # raise TypeError on Python 3.
            text = str(text)
        self.text = text
        self.words_and_emoticons = self._words_and_emoticons()
        # doesn't separate words from
        # adjacent punctuation (keeps emoticons & contractions)
        self.is_cap_diff = allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        """
        no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
        # removes punctuation (but loses emoticons & contractions)
        # remove singletons
        words_only = {w for w in no_punc_text.split() if len(w) > 1}
        # the product gives ('cat', ',') and (',', 'cat')
        words_punc_dict = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
        # punctuation-after entries override punctuation-before ones on a clash
        words_punc_dict.update({''.join(p): p[0] for p in product(words_only, PUNC_LIST)})
        return words_punc_dict

    def _words_and_emoticons(self):
        """
        Removes leading and trailing punctuation
        Leaves contractions and most emoticons
            Does not preserve punc-plus-letter emoticons (e.g. :D)
        """
        words_punc_dict = self._words_plus_punc()
        # single-character tokens are dropped; a token that is a known
        # word-plus-punctuation form is replaced by the bare word
        return [words_punc_dict.get(we, we) for we in self.text.split() if len(we) > 1]


class SentimentIntensityAnalyzer(object):
    """
    Give a sentiment intensity score to sentences.

    Attributes:
        lexicon_full_filepath: raw *contents* of the word lexicon file
            (the attribute name is historical)
        lexicon: dict mapping a word to its float valence
        emoji_full_filepath: raw *contents* of the emoji lexicon file
        emojis: dict mapping an emoji to its textual description
    """

    def __init__(self, lexicon_file="vader_lexicon.txt", emoji_lexicon="emoji_utf8_lexicon.txt"):
        """
        Read both lexicon files and build the lookup dictionaries.

        :param str lexicon_file: file name of the tab-separated word lexicon
        :param str emoji_lexicon: file name of the tab-separated emoji lexicon
        """
        # Both files are looked up next to the source file that defines this class.
        _this_module_file_path_ = os.path.abspath(getsourcefile(lambda: 0))
        module_dir = os.path.dirname(_this_module_file_path_)

        lexicon_full_filepath = os.path.join(module_dir, lexicon_file)
        with open(lexicon_full_filepath, encoding='utf-8') as f:
            self.lexicon_full_filepath = f.read()
        self.lexicon = self.make_lex_dict()

        emoji_full_filepath = os.path.join(module_dir, emoji_lexicon)
        with open(emoji_full_filepath, encoding='utf-8') as f:
            self.emoji_full_filepath = f.read()
        self.emojis = self.make_emoji_dict()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary

        Each non-blank line is expected to hold at least two tab-separated
        fields: the word and its valence. Blank lines (for example the one
        produced by a trailing newline) are skipped.

        :returns: dict mapping word -> float valence
        """
        lex_dict = {}
        for line in self.lexicon_full_filepath.split('\n'):
            line = line.strip()
            if not line:
                continue
            (word, measure) = line.split('\t')[0:2]
            lex_dict[word] = float(measure)
        return lex_dict

    def make_emoji_dict(self):
        """
        Convert emoji lexicon file to a dictionary

        Each non-blank line is expected to hold at least two tab-separated
        fields: the emoji and its description. Blank lines are skipped.

        :returns: dict mapping emoji -> description
        """
        emoji_dict = {}
        for line in self.emoji_full_filepath.split('\n'):
            line = line.strip()
            if not line:
                continue
            (emoji, description) = line.split('\t')[0:2]
            emoji_dict[emoji] = description
        return emoji_dict

    def polarity_scores(self, text):
        """
        Score the sentiment of ``text``.

        :param str text: the text to score
        :returns: dict with the keys ``neg``, ``neu``, ``pos`` (proportions,
            rounded to 3 places) and ``compound`` (normalized sum in
            [-1, 1], rounded to 4 places)
        """
        # convert emojis to their textual descriptions
        text_no_emoji_lst = [self.emojis.get(token, token) for token in text.split()]
        text = " ".join(text_no_emoji_lst)

        sentitext = SentiText(text)

        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons
        # enumerate() gives the true position of every token; list.index()
        # would return the first occurrence for a repeated word and apply the
        # wrong context to it.
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            # check for vader_lexicon words that may be used as modifiers or negations
            if item.lower() in BOOSTER_DICT:
                sentiments.append(valence)
                continue
            if (i < len(words_and_emoticons) - 1 and item.lower() == "kind" and
                    words_and_emoticons[i + 1].lower() == "of"):
                sentiments.append(valence)
                continue

            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)

        valence_dict = self.score_valence(sentiments, text)

        return valence_dict

    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        """
        Compute the context-adjusted valence of one token and append it.

        :param valence: starting valence, kept when ``item`` is not in the lexicon
        :param SentiText sentitext: the analysed text
        :param str item: the token, in its original casing
        :param int i: position of ``item`` in ``sentitext.words_and_emoticons``
        :param list sentiments: valences collected so far (mutated)
        :returns: ``sentiments`` with the new valence appended
        """
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            valence = self.lexicon[item_lowercase]
            # check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += C_INCR
                else:
                    valence -= C_INCR

            for start_i in range(0, 3):
                # dampen the scalar modifier of preceding words and emoticons
                # (excluding the ones that immediately precede the item) based
                # on their distance from the current item.
                if i > start_i and words_and_emoticons[i - (start_i + 1)].lower() not in self.lexicon:
                    s = scalar_inc_dec(words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff)
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._negation_check(valence, words_and_emoticons, start_i, i)
                    if start_i == 2:
                        valence = self._special_idioms_check(valence, words_and_emoticons, i)

            valence = self._least_check(valence, words_and_emoticons, i)
        sentiments.append(valence)
        return sentiments

    def _least_check(self, valence, words_and_emoticons, i):
        """
        Negate ``valence`` when the token is preceded by "least", except for
        "at least" and "very least".
        """
        # check for negation case using "least"
        if i > 1 and words_and_emoticons[i - 1].lower() not in self.lexicon \
                and words_and_emoticons[i - 1].lower() == "least":
            if words_and_emoticons[i - 2].lower() != "at" and words_and_emoticons[i - 2].lower() != "very":
                valence = valence * N_SCALAR
        elif i > 0 and words_and_emoticons[i - 1].lower() not in self.lexicon \
                and words_and_emoticons[i - 1].lower() == "least":
            valence = valence * N_SCALAR
        return valence

    @staticmethod
    def _but_check(words_and_emoticons, sentiments):
        """
        Re-weight valences around the first "but": halve those before it and
        multiply those after it by 1.5.

        :param list words_and_emoticons: the tokens of the text
        :param list sentiments: one valence per token (mutated in place)
        :returns: ``sentiments``
        """
        # check for modification in sentiment due to contrastive conjunction 'but'
        words_and_emoticons_lower = [str(w).lower() for w in words_and_emoticons]
        if 'but' in words_and_emoticons_lower:
            bi = words_and_emoticons_lower.index('but')
            # Walk by position: looking a value up with list.index() hits the
            # first equal score, so repeated values were scaled in the wrong slot.
            for si, sentiment in enumerate(sentiments):
                if si < bi:
                    sentiments[si] = sentiment * 0.5
                elif si > bi:
                    sentiments[si] = sentiment * 1.5
        return sentiments

    @staticmethod
    def _special_idioms_check(valence, words_and_emoticons, i):
        """
        Replace ``valence`` when the token at ``i`` is part of a phrase in
        ``SPECIAL_CASE_IDIOMS``, then add the value of any preceding
        multi-word booster found in ``BOOSTER_DICT``.

        The caller only invokes this with ``i >= 3``, so the ``i - 3`` look-back
        does not wrap around.
        """
        words_and_emoticons_lower = [str(w).lower() for w in words_and_emoticons]
        onezero = "{0} {1}".format(words_and_emoticons_lower[i - 1], words_and_emoticons_lower[i])

        twoonezero = "{0} {1} {2}".format(words_and_emoticons_lower[i - 2],
                                          words_and_emoticons_lower[i - 1], words_and_emoticons_lower[i])

        twoone = "{0} {1}".format(words_and_emoticons_lower[i - 2], words_and_emoticons_lower[i - 1])

        threetwoone = "{0} {1} {2}".format(words_and_emoticons_lower[i - 3],
                                           words_and_emoticons_lower[i - 2], words_and_emoticons_lower[i - 1])

        threetwo = "{0} {1}".format(words_and_emoticons_lower[i - 3], words_and_emoticons_lower[i - 2])

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        # the first matching look-back phrase wins
        for seq in sequences:
            if seq in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[seq]
                break

        # look-ahead phrases starting at the current token override the above
        if len(words_and_emoticons_lower) - 1 > i:
            zeroone = "{0} {1}".format(words_and_emoticons_lower[i], words_and_emoticons_lower[i + 1])
            if zeroone in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons_lower) - 1 > i + 1:
            zeroonetwo = "{0} {1} {2}".format(words_and_emoticons_lower[i], words_and_emoticons_lower[i + 1],
                                              words_and_emoticons_lower[i + 2])
            if zeroonetwo in SPECIAL_CASE_IDIOMS:
                valence = SPECIAL_CASE_IDIOMS[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        n_grams = [threetwoone, threetwo, twoone]
        for n_gram in n_grams:
            if n_gram in BOOSTER_DICT:
                valence = valence + BOOSTER_DICT[n_gram]
        return valence

    @staticmethod
    def _sentiment_laden_idioms_check(valence, senti_text_lower):
        """
        Future work: average the valences of all ``SENTIMENT_LADEN_IDIOMS``
        found in ``senti_text_lower``; returns ``valence`` unchanged when none
        is found. Prints every idiom it finds.
        """
        # Future Work
        # check for sentiment laden idioms that don't contain a lexicon word
        idioms_valences = []
        for idiom in SENTIMENT_LADEN_IDIOMS:
            if idiom in senti_text_lower:
                print(idiom, senti_text_lower)
                valence = SENTIMENT_LADEN_IDIOMS[idiom]
                idioms_valences.append(valence)
        if len(idioms_valences) > 0:
            valence = sum(idioms_valences) / float(len(idioms_valences))
        return valence

    @staticmethod
    def _negation_check(valence, words_and_emoticons, start_i, i):
        """
        Apply negation handling for the word ``start_i + 1`` positions before
        the token at ``i``.

        :param valence: current valence of the token at ``i``
        :param list words_and_emoticons: the tokens of the text
        :param int start_i: 0, 1 or 2 - how far back to look (minus one)
        :param int i: position of the sentiment word; the caller guarantees
            ``i > start_i``
        :returns: the adjusted valence
        """
        words_and_emoticons_lower = [str(w).lower() for w in words_and_emoticons]
        emphasis = ("so", "this")
        if start_i == 0:
            if negated([words_and_emoticons_lower[i - (start_i + 1)]]):  # 1 word preceding lexicon word (w/o stopwords)
                valence = valence * N_SCALAR
        if start_i == 1:
            if words_and_emoticons_lower[i - 2] == "never" and \
                    words_and_emoticons_lower[i - 1] in emphasis:
                # "never so/this <word>" intensifies instead of negating
                valence = valence * 1.25
            elif words_and_emoticons_lower[i - 2] == "without" and \
                    words_and_emoticons_lower[i - 1] == "doubt":
                # "without doubt" is not a negation; leave the valence alone
                pass
            elif negated([words_and_emoticons_lower[i - (start_i + 1)]]):  # 2 words preceding the lexicon word position
                valence = valence * N_SCALAR
        if start_i == 2:
            # NOTE(review): the grouping below mirrors how the original
            # and/or chain evaluates - a "so"/"this" directly before the word
            # intensifies even without "never". That looks unintended, but it
            # is kept so the scores stay unchanged.
            if (words_and_emoticons_lower[i - 3] == "never" and
                    words_and_emoticons_lower[i - 2] in emphasis) or \
                    words_and_emoticons_lower[i - 1] in emphasis:
                valence = valence * 1.25
            elif words_and_emoticons_lower[i - 3] == "without" and \
                    (words_and_emoticons_lower[i - 2] == "doubt" or words_and_emoticons_lower[i - 1] == "doubt"):
                pass
            elif negated([words_and_emoticons_lower[i - (start_i + 1)]]):  # 3 words preceding the lexicon word position
                valence = valence * N_SCALAR
        return valence

    def _punctuation_emphasis(self, text):
        """Return the combined emphasis from exclamation points and question marks."""
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    @staticmethod
    def _amplify_ep(text):
        """Return 0.292 per exclamation point in ``text``, counting at most 4."""
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count * 0.292
        return ep_amplifier

    @staticmethod
    def _amplify_qm(text):
        """
        Return the emphasis from question marks: 0 for fewer than two,
        0.18 each for two or three, and a flat 0.96 for more.
        """
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier

    @staticmethod
    def _sift_sentiment_scores(sentiments):
        """
        Split the valences into a positive sum, a negative sum and a count of
        neutral (zero) entries.

        :returns: tuple ``(pos_sum, neg_sum, neu_count)``
        """
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                pos_sum += (float(sentiment_score) + 1)  # compensates for neutral words that are counted as 1
            if sentiment_score < 0:
                neg_sum += (float(sentiment_score) - 1)  # when used with math.fabs(), compensates for neutrals
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count

    def score_valence(self, sentiments, text):
        """
        Turn the per-token valences into the final score dictionary.

        :param list sentiments: one valence per token
        :param str text: the scored text, used for punctuation emphasis
        :returns: dict with ``neg``, ``neu``, ``pos`` and ``compound``; all
            zero when ``sentiments`` is empty
        """
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            # the punctuation emphasis goes to whichever side dominates
            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = \
            {"neg": round(neg, 3),
             "neu": round(neu, 3),
             "pos": round(pos, 3),
             "compound": round(compound, 4)}

        return sentiment_dict

# Build the analyzer with the default lexicon file names; its constructor
# reads both lexicon files from disk.
analyzer = SentimentIntensityAnalyzer()

# Extra lexicon entries (word -> valence) added on top of the loaded lexicon.
# NOTE(review): several keys look like word stems ('kindn', 'benefitt',
# 'enemie', 'petrif', 'traged', ...). The analyzer looks up whole lower-cased
# tokens, so confirm these are meant to match as written.
new_words = {
    # positive
    'fabulously': 2.4,
    'amazingly': 2.8,
    'fairer': 1.3,
    'fairest': 1.3,
    'fantastically': 2.0,
    'fantasy': 2.6,
    'kindn': 2.2,
    'yum': 2.4,
    'benefitt': 1.6,
    'bestest': 3.2,
    'bestie': 3.2,
    'besties': 3.2,
    'finer': 0.8,
    'finest': 0.8,
    'fortunately': 1.9,
    'free-think': 1.0,
    'freethink': 1.0,
    'lovelier': 2.8,
    'loveliest': 2.8,
    'grande': 1.0,
    'greatness': 3.2,
    'okey': 0.9,
    'peacekeep': 1.6,
    'peacemak': 2.0,
    'sweeter': 2.0,
    'sweetest': 2.0,
    'tender': 0.5,
    'thanking': 1.8,
    'thanx': 1.9,
    'thnx': 1.9,
    'heartwarm': 2.1,
    "hero's": 2.6,
    # negative
    'enemie': -2.2,
    'ineffect': -1.3,
    'poorly': -2.1,
    'poorness': -2.1,
    'uncontrol': -1.5,
    'asshole': -2.8,
    'fuckh': -2.5,
    'fuckin': -2.5,
    'fucktwat': -3.1,
    'fuckwad': -3.1,
    'messier': -1.5,
    'scarier': -2.2,
    'scariest': -2.2,
    'deceptive': -1.9,
    'defend': -0.2,
    'harmful': -2.6,
    'messiest': -1.5,
    'strangest': -0.8,
    'wrongdoing': -1.9,
    'wrongful': -1.9,
    'wrongly': -1.9,
    # was 'wrongness;' - the stray semicolon made the key unmatchable
    'wrongness': -1.9,
    'wrongs': -1.9,
    'heartbroke': -2.7,
    'petrif': -0.3,
    'traged': -2.7,
}

# Merge the custom entries into the loaded lexicon; a word that is already
# present gets the valence given in new_words.
analyzer.lexicon.update(new_words)

## Text Cleaning (updated)

In [None]:
import string
try:
    import HTMLParser  # Python 2 module name
except ImportError:
    # Python 3 moved the module to html.parser; alias it so that the
    # HTMLParser name keeps resolving to a module either way.
    import html.parser as HTMLParser
html_parser = HTMLParser.HTMLParser()
import re
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup

# Phrase rewrites applied to tweets before scoring. Two groups share one table:
#   * "split":  contractions are expanded ("he's" -> "he is")
#   * "adhere": negations and some fixed phrases are fused into one token
#               ("do not" -> "don't", "kind of" -> "kindof") so that the
#               analyzer sees them as a single word
# Matching is case-sensitive.
# NOTE(review): "no prblme", "noproblmes" and "good bey"/"goodbey" look like
# typos (for "no problem", "noproblems", "good bye"/"goodbye"); they are kept
# as written - confirm against the lexicon in use before changing them.
split_adhere_dic = {
    # split
    "cause": "because", "could've": "could have",
    "he'd": "he would", "he'd've": "he would have", "he'll": "he will",
    "he'll've": "he will have", "he's": "he is", "how'd": "how did",
    "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
    "I'd": "I would", "I'd've": "I would have", "I'll": "I will",
    "I'll've": "I will have", "I'm": "I am", "I've": "I have",
    "i'd": "i would", "i'd've": "i would have", "i'll": "i will",
    "i'll've": "i will have", "i'm": "i am", "i've": "i have",
    "it'd": "it would", "it'd've": "it would have", "it'll": "it will",
    "it'll've": "it will have", "it's": "it is",
    "let's": "let us", "ma'am": "madam", "might've": "might have",
    "o'clock": "of the clock",
    "she'd": "she would", "she'd've": "she would have", "she'll": "she will",
    "she'll've": "she will have", "she's": "she is", "should've": "should have",
    "so've": "so have", "so's": "so as",
    "this's": "this is",
    "that'd": "that would", "that'd've": "that would have", "that's": "that is",
    "there'd": "there would", "there'd've": "there would have", "there's": "there is",
    "here's": "here is",
    "they'd": "they would", "they'd've": "they would have", "they'll": "they will",
    "they'll've": "they will have", "they're": "they are", "they've": "they have",
    "to've": "to have", "we'd": "we would", "we'd've": "we would have", "we'll": "we will",
    "we'll've": "we will have", "we're": "we are", "we've": "we have",
    "what'll": "what will", "what'll've": "what will have", "what're": "what are",
    "what's": "what is", "what've": "what have", "when's": "when is",
    "when've": "when have", "where'd": "where did", "where's": "where is",
    "where've": "where have", "who'll": "who will", "who'll've": "who will have",
    "who's": "who is", "who've": "who have", "why's": "why is",
    "why've": "why have", "will've": "will have", "would've": "would have",
    "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
    "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would",
    "you'd've": "you would have", "you'll": "you will",
    "you'll've": "you will have", "you're": "you are", "you've": "you have",
    # adhere
    # ("is not" and "shall not" were each listed twice; only the last value of
    # a repeated key survives in a dict literal, so that value is kept here)
    "are not": "aren't", "cannot": "can't", "can not": "can't",
    "cannot have": "can't've", "could not": "couldn't", "could not have": "couldn't've", "did not": "didn't",
    "does not": "doesn't", "do not": "don't", "had not": "hadn't",
    "had not have": "hadn't've", "has not": "hasn't", "have not": "haven't", "is not": "isn't",
    "may not": "mayn't", "might not": "mightn't", "might not have": "mightn't've",
    "must have": "must've", "must not": "mustn't", "must not have": "mustn't've",
    "need not": "needn't", "need not have": "needn't've",
    "ought not": "oughtn't", "ought not have": "oughtn't've",
    "shall not": "sha'n't", "shall not have": "shan't've",
    "should not": "shouldn't", "should not have": "shouldn't've", "was not": "wasn't", "were not": "weren't",
    "will not": "won't", "will not have": "won't've", "would not": "wouldn't",
    "would not have": "wouldn't've",
    "no prblme": "noproblem", "no prob": "noprob", "no problems": "noproblmes", "no probs": "noprobs",
    "no worry": "noworry", "no worries": "noworries", "good bey": "goodbey", "long time no see": "longtimenosee",
    "nothing but": "nothingbut", "no more": "nomore",
    "kind of": "kindof", "sort of": "sortof",
}

pat1 = r'@[\w_]+'  # @-mention
# URLs ('&' replaces an '&amp;' left over from HTML escaping; every character
# of '&amp;' was already covered by the class, so the match set is unchanged)
pat2 = r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+'
# additions to URLs, texts with 'www.'; the dot is escaped so that ordinary
# text such as "awww so" is no longer deleted as if it were a URL
pat5 = r'www\.[^ ]+'
combined_pat = r'|'.join((pat1, pat2, pat5))

# Longest keys first: regex alternation takes the first alternative that
# matches, so "he'd" must not be tried before "he'd've", nor "could not"
# before "could not have". Keys are escaped so they are matched literally.
split_pattern = re.compile(
    r'\b(' + '|'.join(re.escape(key) for key in sorted(split_adhere_dic, key=len, reverse=True)) + r')\b')

def tweet_cleaner(demo):
    """Return the cleaned text of a single tweet.

    The input is parsed as HTML with BeautifulSoup's 'lxml' parser and reduced
    to its text content, every match of ``combined_pat`` is deleted, and each
    phrase matched by ``split_pattern`` is replaced by its entry in
    ``split_adhere_dic``.
    """
    text = BeautifulSoup(demo, 'lxml').get_text()
    text = re.sub(combined_pat, '', text)

    def _replacement(match):
        # The matched phrase is used verbatim as the dictionary key.
        return split_adhere_dic[match.group()]

    return split_pattern.sub(_replacement, text)

## Sentiment Analysis of RealMadrid and Liverpool Datasets

In [None]:
# Load the raw Real Madrid tweets.
mdf = pd.read_csv('madrid_df.csv', encoding='utf8', engine='python')

# Reduce the timestamps to calendar dates, then store them as datetimes again.
# errors='coerce' is applied to the first parse as well: without it a
# malformed timestamp raised there, so the coercion on the second call could
# never take effect. Unparseable values now become NaT.
mdf['created_at'] = pd.to_datetime(mdf['created_at'], errors='coerce').dt.date
mdf['created_at'] = pd.to_datetime(mdf['created_at'], errors='coerce')
# NOTE(review): presumably the index column saved by an earlier to_csv call.
del mdf['Unnamed: 0']

# Cleaning the twitter text
# Rows with a missing value in any column are removed, which includes rows
# whose timestamp was coerced to NaT above.
mdf.dropna(inplace=True)
mdf['clean_text'] = mdf['txt'].apply(tweet_cleaner)
mdf.drop(columns=['txt'], inplace=True)
mdf.to_csv('RMadrid_clean.csv', encoding='utf-8')

In [None]:
# Reload the cleaned Real Madrid tweets, using the first CSV column as index.
mdf = pd.read_csv('RMadrid_clean.csv', index_col=0)
# NOTE(review): presumably removes tweets whose cleaned text came back as NaN
# (e.g. empty after cleaning) from the CSV round trip -- confirm.
mdf.dropna(inplace=True)

# Applying vaderSentiment analyzer
vader = mdf['clean_text'].apply(analyzer.polarity_scores)
# Expand each score result into its own columns and append them to the tweets.
# The axis is passed by keyword: pandas 2.0 no longer accepts it positionally.
mdf = pd.concat([mdf, vader.apply(pd.Series)], axis=1)
mdf.to_csv('RMadrid_vader.csv')

In [None]:
# Load the raw Liverpool tweets.
ldf = pd.read_csv('liverp_df.csv')

# Reduce the timestamps to calendar dates, then store them as datetimes again.
# errors='coerce' is applied to the first parse as well: without it a
# malformed timestamp raised there, so the coercion on the second call could
# never take effect. Unparseable values now become NaT.
ldf['created_at'] = pd.to_datetime(ldf['created_at'], errors='coerce').dt.date
ldf['created_at'] = pd.to_datetime(ldf['created_at'], errors='coerce')

# Cleaning the twitter text
# Rows with a missing value in any column are removed, which includes rows
# whose timestamp was coerced to NaT above.
ldf.dropna(inplace=True)
ldf['clean_text'] = ldf['txt'].apply(tweet_cleaner)
ldf.drop(columns=['txt'], inplace=True)
ldf.to_csv('Liverp_clean.csv', encoding='utf-8')

In [None]:
# Reload the cleaned Liverpool tweets, using the first CSV column as index.
ldf = pd.read_csv('Liverp_clean.csv', index_col=0)
# NOTE(review): presumably removes tweets whose cleaned text came back as NaN
# (e.g. empty after cleaning) from the CSV round trip -- confirm.
ldf.dropna(inplace=True)

# Applying vaderSentiment analyzer
vader = ldf['clean_text'].apply(analyzer.polarity_scores)
# Expand each score result into its own columns and append them to the tweets.
# The axis is passed by keyword: pandas 2.0 no longer accepts it positionally.
ldf = pd.concat([ldf, vader.apply(pd.Series)], axis=1)
ldf.to_csv('Liverp_vader.csv')

In [3]:
# Splitting dataframes over periods (pre and post), from 21/05/2018 - 16/06/2018
mdf2 = pd.read_csv('RMadrid_vader.csv', index_col=0)
mdf2['period'] = ' '  # placeholder kept by rows that no rule below matches

# created_at is read back from the CSV without date parsing, so the range
# checks below are plain (inclusive) comparisons against the given bounds.
_created = mdf2['created_at']
index1 = mdf2[_created.between('2018-05-21', '2018-05-25')].index
index3 = mdf2[_created.between('2018-05-27', '2018-06-03')].index
index4 = mdf2[_created.between('2018-06-04', '2018-06-10')].index
index5 = mdf2[_created.between('2018-06-11', '2018-06-16')].index

for _rows, _label in (
    (index1, 'pre-event'),
    (index3, 'post-event-week1'),
    (index4, 'post-event-week2'),
    (index5, 'post-event-week3'),
):
    mdf2.loc[_rows, 'period'] = _label

# 2018-05-26 is not covered by the date ranges above; these rows are selected
# by position instead and overwrite any label set earlier.
# NOTE(review): the offsets 7012 / 11146 / 40326 are hard-coded for this
# particular CSV -- presumably they separate the tweets before and after the
# event on that day; confirm and re-derive them if the input file changes.
index6 = mdf2.index[7012:11146]
index7 = mdf2.index[11146:40326]
mdf2.loc[index6, 'period'] = 'pre-event'
mdf2.loc[index7, 'period'] = 'post-event-week1'

# Same period labelling for the Liverpool tweets.
ldf2 = pd.read_csv('Liverp_vader.csv', index_col=0)
ldf2['period'] = ' '  # placeholder kept by rows that no rule below matches

# created_at is read back from the CSV without date parsing, so the range
# checks below are plain (inclusive) comparisons against the given bounds.
_created = ldf2['created_at']
index1 = ldf2[_created.between('2018-05-21', '2018-05-25')].index
index3 = ldf2[_created.between('2018-05-27', '2018-06-03')].index
index4 = ldf2[_created.between('2018-06-04', '2018-06-10')].index
index5 = ldf2[_created.between('2018-06-11', '2018-06-16')].index

for _rows, _label in (
    (index1, 'pre-event'),
    (index3, 'post-event-week1'),
    (index4, 'post-event-week2'),
    (index5, 'post-event-week3'),
):
    ldf2.loc[_rows, 'period'] = _label

# 2018-05-26 is not covered by the date ranges above; these rows are selected
# by position instead and overwrite any label set earlier.
# NOTE(review): the offsets 12119 / 36560 / 170327 are hard-coded for this
# particular CSV -- presumably they separate the tweets before and after the
# event on that day; confirm and re-derive them if the input file changes.
index6 = ldf2.index[12119:36560]
index7 = ldf2.index[36560:170327]
ldf2.loc[index6, 'period'] = 'pre-event'
ldf2.loc[index7, 'period'] = 'post-event-week1'

In [4]:
# Analyzing positive/negative/neutral/compound value (mean) and save with independent dataframes

# Chronological order in which the periods are reported.
PERIOD_ORDER = ['pre-event', 'post-event-week1', 'post-event-week2', 'post-event-week3']


def _mean_by_period(df, column):
    """Return the mean of ``column`` per period, in chronological order.

    The result is a DataFrame with the columns ``period`` and ``column`` and
    one row per entry of ``PERIOD_ORDER``. Rows are selected by period label
    rather than by group position, so the order stays correct even when the
    data contains other period values (for example the ' ' placeholder),
    which are left out. A period without any rows gets NaN as its mean.
    """
    means = df.groupby('period')[column].mean()
    ordered = means.reindex(pd.Index(PERIOD_ORDER, name='period'))
    return ordered.reset_index()


com_mdf2 = _mean_by_period(mdf2, 'compound')
pos_mdf2 = _mean_by_period(mdf2, 'pos')
neg_mdf2 = _mean_by_period(mdf2, 'neg')
neu_mdf2 = _mean_by_period(mdf2, 'neu')

com_ldf2 = _mean_by_period(ldf2, 'compound')
pos_ldf2 = _mean_by_period(ldf2, 'pos')
neg_ldf2 = _mean_by_period(ldf2, 'neg')
neu_ldf2 = _mean_by_period(ldf2, 'neu')

In [None]:
# Tag every tweet with its team.
mdf2['team'] = 'RealMadrid'
ldf2['team'] = 'Liverpool'

# Collapse the three post-event weeks into a single 'post-event' label.
# Period values that are not keys of this mapping become NaN.
_period_to_phase = {
    'pre-event': 'pre-event',
    'post-event-week1': 'post-event',
    'post-event-week2': 'post-event',
    'post-event-week3': 'post-event',
}
mdf2['period'] = mdf2['period'].map(_period_to_phase)
ldf2['period'] = ldf2['period'].map(_period_to_phase)

mdf2_pre = mdf2[mdf2['period'] == 'pre-event']
mdf2_pos = mdf2[mdf2['period'] == 'post-event']
ldf2_pre = ldf2[ldf2['period'] == 'pre-event']
ldf2_pos = ldf2[ldf2['period'] == 'post-event']

# Merge dataframes (one set per team): pre-event rows first, then post-event.
frames = [mdf2_pre, mdf2_pos]
mdf_all = pd.concat(frames)
frames2 = [ldf2_pre, ldf2_pos]
ldf_all = pd.concat(frames2)

# Descriptive statistics per period, reshaped into one row per described column.
grouped_mdf = mdf_all.groupby(['period']).describe().T.unstack()
grouped_ldf = ldf_all.groupby(['period']).describe().T.unstack()

# Move the first eight columns to the end.
# NOTE(review): presumably these are the eight describe() statistics of the
# first period, so that 'pre-event' is shown before 'post-event' -- confirm.
cols = grouped_mdf.columns.tolist()
cols = cols[8:] + cols[0:8]
grouped_mdf = grouped_mdf[cols]
grouped_ldf = grouped_ldf[cols]

# Save the merged dataframes to .csv
mdf_all.to_csv('rmadridall.csv', encoding='utf-8')
ldf_all.to_csv('liverpall.csv', encoding='utf-8')

## Subjectivity Scores

### Checking compound scores

VADER returns three floating-point sentiment scores (negative, neutral and positive), each in the range 0 <= score <= 1, plus a compound score in the range -1 <= score <= 1 that combines all three into a single value. The compound score represents the strength of the opinion: it is computed by summing the valence scores of each word in the lexicon, adjusting them according to the rules, and normalizing the result to lie between -1 (most extreme negative) and +1 (most extreme positive). Among the tweets with a small compound score, some are classified as positive or negative although they are meant to be neutral:

- 'The number of the child is 23', with neu score of 0.822 and compound score of 0.0772, classified as positive  
- 'pls take pictures with players', with neu score of 0.755 and compound score of 0.0772, classified as positive 
- '31 Teams each year battle against each other to see who will play final against in UEFA', with neu score of 0.762 and compound score of -0.0516, classified as negative
- 'Cr7 n Neymar together can be the beginning of los blancos la liga domination', with neu score of 0.663 and compound score of -0.0772, classified as negative

Tweets with a small compound score usually have a neutral score close to 1, meaning the text is strongly neutral (neither positive nor negative). The study 'VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text' (Hutto & Gilbert, 2014) applied standardized thresholds of -0.05 and +0.05 for classifying sentences. Some texts are incorrectly classified when their compound score is close to 0 but slightly beyond these thresholds. If the thresholds are widened to -0.1 and +0.1, the texts shown above are classified as neutral, as they should be, which gives better accuracy:

- 'The number of the child is 23', with a compound score of 0.0772 will be classified as neutral
- 'pls take pictures with players', with a compound score of 0.0772 will be classified as neutral
- '31 Teams each year battle against each other to see who will play final against in UEFA', with a compound score of -0.0516 will be classified as neutral
- 'Cr7 n Neymar together can be the beginning of los blancos la liga domination', with a compound score of -0.0772 will be classified as neutral

In [69]:
# Reload the period-labelled scores of both teams.
mdf2 = pd.read_csv('RMadrid_period.csv', index_col=0)
ldf2 = pd.read_csv('Liverp_period.csv', index_col=0)
# Display the Real Madrid tweets whose compound score lies in [-0.10, -0.05].
mdf2[mdf2['compound'].between(-0.10, -0.05)] # examples of false negative/positive involved

Unnamed: 0,created_at,clean_text,compound,neg,neu,pos,period
34,2018-05-21,How could do that atrocity to i feel sorry for him 😠😠😢😢,-0.0772,0.115,0.885,0.000,pre-event
329,2018-05-21,Yea pretty dog shit season....... BUT STILL MORE PEOPLE WATCH US THAN YOU.............. DAHHHHHHHHH,-0.0516,0.140,0.732,0.128,pre-event
607,2018-05-21,Sorry are already with him,-0.0772,0.245,0.755,0.000,pre-event
724,2018-05-21,"My apprehension about Unai Emery emanates from his bad handling of the Neymar/Cavani incident. Secondly, his decisions when lost to and . He seemed bereft of ideas to inspire his charges. They appeared lame duck. But he has good records.",-0.0516,0.197,0.672,0.130,pre-event
728,2018-05-21,Sorry Liverpool after beats on sat our boy is going to break your hearts again on Sunday.,-0.0772,0.075,0.925,0.000,pre-event
793,2018-05-22,sorry I’m not sneak arabic,-0.0772,0.245,0.755,0.000,pre-event
819,2018-05-22,I can't wait until shuts down this pipe dream of yours. I look forward to you swinging your gavel in despair at an unwitting backbencher at the following parliamentary sitting.,-0.0772,0.076,0.858,0.066,pre-event
1013,2018-05-22,simple.\r\r\r\r\nHe is a legend so he will rather have to loose.\r\r\r\r\n is his player and plays for,-0.0772,0.119,0.777,0.104,pre-event
1115,2018-05-22,Hey leave some subtitles below....Everyone isn't a spanish😑,-0.0516,0.167,0.833,0.000,pre-event
1176,2018-05-22,You wouldn’t mess with this fan,-0.0516,0.284,0.455,0.261,pre-event


In [71]:
def _drop_ambiguous_and_zero_neutral(df, inner=0.05, outer=0.10):
    """Remove weakly polar tweets and set near-zero compound scores to 0.

    Rows whose absolute compound score lies in [inner, outer] are dropped;
    in the remaining rows every compound score with an absolute value below
    ``outer`` is replaced by 0. A new DataFrame is returned.

    Both teams go through this one function so that they share the same
    bounds (previously the final masking step used +/-0.10 for one team and
    +/-0.05 for the other; the results matched only because the rows in
    between had already been dropped).

    NOTE(review): the rows in the [inner, outer] band are removed from the
    data, not relabelled as neutral -- confirm that this is intended.
    """
    ambiguous = df['compound'].abs().between(inner, outer)
    kept = df.drop(df[ambiguous].index)
    kept['compound'] = kept['compound'].mask(kept['compound'].abs() < outer, 0)
    return kept


mdf2 = _drop_ambiguous_and_zero_neutral(mdf2)
ldf2 = _drop_ambiguous_and_zero_neutral(ldf2)
mdf2 # better accuracy

Unnamed: 0,created_at,clean_text,compound,neg,neu,pos,period
0,2018-05-21,forwarding this sat!,0.0000,0.000,1.000,0.000,pre-event
2,2018-05-21,happy night my life i love you always my idol 👑🐉💞💞💞💞 #GarethBale #gb11 #Legend #fawales #RealMadrid #HalaMadrid,0.8360,0.000,0.639,0.361,pre-event
3,2018-05-21,"Countdown to 2018 Final in Kyiv, Russia. vs #YNWA #6DaysToGo",0.0000,0.000,1.000,0.000,pre-event
4,2018-05-21,Lovee,0.0000,0.000,1.000,0.000,pre-event
5,2018-05-21,This time is lucky\r\r\r\r\nNext title is of \r\r\r\r\n \r\r\r\r\n#HalaMadrid,0.4215,0.000,0.741,0.259,pre-event
6,2018-05-21,Please please!!,0.6351,0.000,0.000,1.000,pre-event
7,2018-05-21,we need you brother #10,0.0000,0.000,1.000,0.000,pre-event
8,2018-05-21,Santos champions next is Madrid 💚💙,0.5267,0.000,0.595,0.405,pre-event
9,2018-05-21,please dm me,0.3182,0.000,0.465,0.535,pre-event
10,2018-05-21,Yes Mo 👌👍,0.4019,0.000,0.426,0.574,pre-event
