# Sentiment Analysis

In this notebook we train an SGDClassifier on the embeddings of positive and negative words. Given a sentence, sentiment analysis is then performed by classifying each word in the sentence as positive or negative; the final outcome is produced by aggregating the per-word sentiments of the sentence.

In [None]:
# Mount Google Drive at /content/drive (Colab only; prompts for an authorization code).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Work from the notebook folder on the mounted Drive; the relative data paths used below resolve from here.
cd drive/My\ Drive/Colab\ Notebooks

/content/drive/My Drive/Colab Notebooks


In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
def load_embeddings(filename):
    """Load space-separated word embeddings from a text file.

    Every data line has the form ``<token> <v1> <v2> ... <vN>``. Lines with two
    or fewer fields carry no usable vector and are skipped: this covers the
    optional header row giving the matrix shape, blank lines, and a token that
    has no values.

    Parameters
    ----------
    filename : str or path-like
        Path of the UTF-8 encoded embeddings file.

    Returns
    -------
    pandas.DataFrame
        One float32 row per token, indexed by the token string. An empty frame
        is returned when the file contains no data lines.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        # tqdm only reports progress; it yields the file's lines unchanged.
        for line in tqdm(infile):
            items = line.rstrip().split(' ')
            if len(items) <= 2:
                # Header row, blank line, or token without a vector: stacking
                # such a row would make np.vstack fail on mismatched widths.
                continue
            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))

    if not rows:
        # np.vstack raises on an empty list, so build the empty frame directly.
        return pd.DataFrame(index=labels, dtype='f')

    return pd.DataFrame(np.vstack(rows), index=labels, dtype='f')

# embeddings_big = load_embeddings('glove.42B.300d.txt') # big glove
embeddings_small = load_embeddings('glove.6B.50d.txt') # small glove
# Inspect the frame that was just loaded; `embeddings` is only assigned in a
# later cell, so referencing it here fails on a fresh kernel.
embeddings_small.shape

400000it [00:07, 54312.08it/s]


(400000, 50)

In [None]:
# Choose the embedding table used by the rest of the notebook. A copy is taken
# so the originally loaded frame is left untouched.
# embeddings = embeddings_big.copy()
embeddings = embeddings_small.copy()

In [None]:
# Both lexicons are header-less, tab-separated files.
lexicon_options = dict(header=None, sep='\t')
pos_words = pd.read_csv('moodstock_data/Positive-words.tsv', **lexicon_options)
neg_words = pd.read_csv('moodstock_data/Negative-words.tsv', **lexicon_options)

In [None]:
# Lower-case the word column (column 0) of both lexicons.
for lexicon in (pos_words, neg_words):
    lexicon[0] = lexicon[0].str.lower()

In [None]:
# Keep only the lexicon words that have an embedding. Filtering on membership
# first is required because `.loc` with a list containing missing labels raises
# KeyError on current pandas instead of returning NaN rows.
known_pos_words = [word for word in pos_words.values[:, 0] if word in embeddings.index]
known_neg_words = [word for word in neg_words.values[:, 0] if word in embeddings.index]
pos_vectors = embeddings.loc[known_pos_words].dropna()
neg_vectors = embeddings.loc[known_neg_words].dropna()

In [None]:
# Stack positive rows on top of negative rows; targets and labels follow the same order.
vectors = pd.concat([pos_vectors, neg_vectors])
targets = np.array([1] * len(pos_vectors.index) + [-1] * len(neg_vectors.index))
labels = [*pos_vectors.index, *neg_vectors.index]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the words (vectors, targets and labels are split consistently).
(train_vectors, test_vectors,
 train_targets, test_targets,
 train_labels, test_labels) = train_test_split(
    vectors, targets, labels, test_size=0.1, random_state=12)

## SGD Classifier

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.linear_model import SGDClassifier

In [None]:
# Logistic-regression classifier trained with SGD.
# NOTE(review): scikit-learn >= 1.1 names this loss 'log_loss' ('log' was later
# removed); the spelling is kept for the scikit-learn version this notebook targets.
model = SGDClassifier(loss='log', random_state=32, max_iter=100)
# Fit on the training split only: fitting on the full `vectors`/`targets` would
# include the test rows and make the accuracy computed below meaningless.
model.fit(train_vectors, train_targets)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=100,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=32, shuffle=True, tol=0.001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [None]:
# Accuracy of the classifier on the test split produced by train_test_split.
y_hat = model.predict(test_vectors)
accuracy_score(y_true=test_targets, y_pred=y_hat)

0.9507575757575758

In [None]:
import pickle

# Replace the freshly trained `model` with a previously saved one.
# WARNING: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open('sentiment_analisys.sav', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
import re
TOKEN_RE = re.compile(r"\w.*?\b")
# The regex above finds tokens that start with a word-like character (\w), and continues
# matching characters lazily (.*?) until the next word break (\b). It's a relatively simple
# expression that manages to extract something very much like words from text.

def vecs_to_sentiment(vecs):
    """Score each embedding row with the global `model`.

    The score is the log probability of the second class (positive sentiment)
    minus the log probability of the first class (negative sentiment), so a
    value above zero leans positive and a value below zero leans negative.
    """
    log_probs = model.predict_log_proba(vecs)
    negative_col, positive_col = log_probs[:, 0], log_probs[:, 1]
    return positive_col - negative_col


def words_to_sentiment(words):
    """Return the sentiment log-odds of every word that has an embedding.

    Words missing from the global `embeddings` index are skipped; order and
    repetitions of the remaining words are preserved.

    Parameters
    ----------
    words : iterable of str
        Tokens to score.

    Returns
    -------
    numpy.ndarray
        One score per known word, as produced by `vecs_to_sentiment`. Empty
        when none of the words is known.
    """
    # Filter first: `.loc` with missing labels raises KeyError on current
    # pandas instead of yielding NaN rows for `dropna` to discard.
    known = [word for word in words if word in embeddings.index]
    vecs = embeddings.loc[known].dropna()
    if vecs.empty:
        # The classifier cannot score zero rows, so short-circuit here.
        return np.array([], dtype=float)
    return vecs_to_sentiment(vecs)


def text_to_sentiment(tokens, print_sent=False):
    """Classify a token list by the mean of its per-word sentiment scores.

    Returns 1 when the mean score is non-negative and -1 otherwise. When
    `print_sent` is true the mean is printed before returning.
    """
    mean = words_to_sentiment(tokens).mean()
    if print_sent:
        print(mean)
    return 1 if mean >= 0 else -1

In [None]:
def text_to_sentiment_norm(tokens, print_sent=False):
    """Classify a token list by majority vote over its per-word sentiments.

    Every word votes positive (score >= 0) or negative (score < 0), so the
    magnitude of a single score cannot dominate the outcome.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to score via `words_to_sentiment`.
    print_sent : bool, optional
        When true, print the per-word scores.

    Returns
    -------
    int
        1 when positive votes are at least as many as negative votes
        (ties, including no votes at all, count as positive), else -1.
    """
    sentiments = words_to_sentiment(tokens)
    pos = sum(1 for s in sentiments if s >= 0)
    neg = sum(1 for s in sentiments if s < 0)

    if print_sent:
        print(sentiments)

    return 1 if pos >= neg else -1


## Preprocess


In [None]:
import re
import string
from nltk.tokenize import word_tokenize
import emoji
import datefinder
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download the NLTK 'punkt' and 'wordnet' packages (the log below shows this is
# a no-op when they are already up to date).
nltk.download('punkt')
nltk.download('wordnet')
# Shared instances used by word_stemmer / word_lemmatizer below.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def remove_emojies(word):
    """Return `word` passed through `emoji.demojize`.

    NOTE(review): despite the name, this delegates to `emoji.demojize`, which
    (per the emoji package) converts emoji to text codes rather than deleting
    them - confirm that is the intended behavior.
    """
    return emoji.demojize(word)


def decontracted(phrase):
    """Expand common English contractions in `phrase`.

    Typographic apostrophes (U+2019) are normalised to ``'`` first and matching
    is case-insensitive, so "don’t" and "Won't" are expanded just like "don't"
    and "won't". Expansions are always lower-case.

    Parameters
    ----------
    phrase : str
        Text that may contain contractions.

    Returns
    -------
    str
        The text with contractions expanded.
    """
    # Tweets often carry the curly apostrophe, which the patterns below would miss.
    phrase = phrase.replace("\u2019", "'")

    # Order matters: the irregular forms must run before the generic "n't"
    # rule, and "n't" before the bare "'t" rule.
    expansions = (
        (r"won't", "will not"),
        (r"can't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, expansion in expansions:
        phrase = re.sub(pattern, expansion, phrase, flags=re.IGNORECASE)
    return phrase

def remove_apostrophe(word):
    """Pipeline-facing alias: returns `decontracted(word)`."""
    return decontracted(word)

def substitute_company_names(phrase):
    """Replace cashtags and known company/index names with the token "ORG".

    Matching is case-insensitive and restricted to whole words, so "spying" or
    "pineapple" are left alone. Longer alternatives are listed before their
    prefixes so "spy500" / "S&P500" are replaced as a whole rather than leaving
    a stray "500" behind.

    Parameters
    ----------
    phrase : str
        Text to scan.

    Returns
    -------
    str
        The text with every match replaced by "ORG".
    """
    pattern = re.compile(
        # Cashtags such as $BABA or $tsla.
        r"\$[A-Za-z]+"
        # Known names; within a shared prefix the longest alternative comes first.
        r"|\b(?:s&p500|s&p|spy500|spy|netflix|disney|alphabet|facebook|apple|tesla|baba)\b",
        flags=re.IGNORECASE,
    )
    return pattern.sub("ORG", phrase)

def substitute_date(phrase):
    """Replace every date that `datefinder` finds in `phrase` with "  Date  ".

    Detection is best effort: if `datefinder` raises an ordinary exception
    while scanning, the phrase is returned unchanged. Interrupts such as
    KeyboardInterrupt are no longer swallowed.

    Parameters
    ----------
    phrase : str
        Text to scan for dates.

    Returns
    -------
    str
        The text with each detected date span replaced by the placeholder.
    """
    placeholder = "  Date  "
    try:
        # With source=True and index=True each match is
        # (datetime, matched_text, (start, end)); only the span is needed.
        spans = [match[2] for match in
                 datefinder.find_dates(phrase, source=True, index=True)]
    except Exception:
        return phrase

    # Replace from the right so earlier spans keep their original offsets.
    for start, end in sorted(spans, reverse=True):
        phrase = phrase[:start] + placeholder + phrase[end:]
    return phrase

def substitute_prices(word):
    """Replace dollar amounts such as "$200", "$199.10" or "$1,000.50" with "price".

    The whole amount is consumed, including thousands groups and a decimal
    part, so no numeric remainder like ".10" is left behind. Punctuation that
    merely follows the amount is kept.

    Parameters
    ----------
    word : str
        Text to scan.

    Returns
    -------
    str
        The text with every dollar amount replaced by "price".
    """
    return re.sub(r"\$\d+(?:,\d{3})*(?:\.\d+)?", "price", word)

def _load_entity_model():
    """Return the spaCy pipeline, loading it on first use and reusing it afterwards."""
    if _load_entity_model.nlp is None:
        _load_entity_model.nlp = en_core_web_sm.load()
    return _load_entity_model.nlp


# Cached pipeline instance; filled in by the first call.
_load_entity_model.nlp = None


def substitute_entities(phrase):
    """Replace each named entity in `phrase` with its lower-cased entity label.

    The spaCy model is loaded once and reused, instead of being reloaded on
    every call.

    Parameters
    ----------
    phrase : str
        Text to run entity recognition on.

    Returns
    -------
    str
        The text with every entity span replaced by its lower-cased label.
    """
    doc = _load_entity_model()(phrase)
    newString = phrase
    # Walk the entities right to left so earlier character offsets stay valid.
    for e in reversed(doc.ents):
        newString = newString[:e.start_char] + e.label_.lower() + newString[e.end_char:]
    return newString

def remove_entities(words):
    """Drop placeholder tokens that stand for substituted entities.

    The comparison is case-insensitive, so the upper-case "ORG" produced by
    `substitute_company_names` is removed along with the lower-case labels
    produced by `substitute_entities`.

    Parameters
    ----------
    words : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that are not entity placeholders, in their original order.
    """
    entities = {'cardinal', 'date', 'event', 'price', 'org', 'fac', 'gpe', 'language',
                'law', 'loc', 'money', 'norp', 'ordinal', 'percent', 'person', 'product',
                'quantity', 'time', 'work_of_art'}
    return [i for i in words if i.lower() not in entities]

def remove_single_letters(words):
    """Keep only the tokens that are longer than one character."""
    return list(filter(lambda token: len(token) > 1, words))

def remove_hyperlink(word):
    """Delete every run of non-space characters that starts with "http"."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", word)


def to_lower(word):
    """Return `word` converted to lower case."""
    return word.lower()


def remove_number(word):
    """Strip every run of digits from `word`."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', word)


def remove_punctuation(word):
    """Replace every character that is not an ASCII letter with a single space.

    This affects digits, whitespace (including newlines) and non-ASCII letters
    as well as punctuation.
    """
    return "".join(
        ch if ('A' <= ch <= 'Z' or 'a' <= ch <= 'z') else ' '
        for ch in word
    )


def remove_whitespace(word):
    """Trim leading and trailing whitespace from `word`."""
    return word.strip()


def replace_newline(word):
    """Replace each newline in `word` with a space.

    A space is used rather than the empty string so that words on adjacent
    lines are not glued together ("$BABA\\nshredddd" keeps two tokens).
    """
    return word.replace('\n', ' ')

def remove_stop_words(words):
    """Drop every token that appears in ENGLISH_STOP_WORDS."""
    return list(filter(lambda token: token not in ENGLISH_STOP_WORDS, words))

def word_stemmer(words):
    """Apply the shared `stemmer` to each token and return the results as a list."""
    return list(map(stemmer.stem, words))

def word_lemmatizer(words):
    """Apply the shared `lemmatizer` to each token and return the results as a list."""
    return list(map(lemmatizer.lemmatize, words))

def filtering_pipeline(sentence):
    """Clean a raw sentence and return it as a list of tokens.

    The string-level cleaners run first, in the order listed; the result is
    tokenised with `word_tokenize`, and the token-level cleaners are then
    applied to the token list.
    """
    text_steps = (
        substitute_date,
        to_lower,
        substitute_company_names,
        substitute_prices,
        remove_hyperlink,
        remove_number,
        remove_emojies,
        remove_apostrophe,
        remove_punctuation,
        substitute_entities,
        replace_newline,
        remove_whitespace,
    )
    token_steps = (remove_stop_words, word_lemmatizer)

    cleaned = sentence
    for step in text_steps:
        cleaned = step(cleaned)

    tokens = word_tokenize(cleaned)
    for step in token_steps:
        tokens = step(tokens)
    return tokens

def sentiment_analysis_pipeline(tokens):
    """Drop entity placeholder tokens, then drop single-character tokens."""
    return remove_single_letters(remove_entities(tokens))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load Tweets


In [None]:
tweets_df = pd.read_csv('moodstock_data/MoodstockDataset-Sentiment-9839.csv', index_col=1)

# Rows labelled -2 get their sentiment sign flipped.
flip_mask = tweets_df['Label'] == -2
tweets_df.loc[flip_mask, 'Sentiment'] *= -1

# Rows labelled -1 are excluded (the result is named "no_spam").
spam_index = tweets_df.loc[tweets_df['Label'] == -1].index
tweets_df_no_spam = tweets_df.drop(spam_index, axis=0)

# Keep only non-zero sentiments, then drop row 9959.
# NOTE(review): the reason for excluding row 9959 is not recorded here - confirm.
neutral_index = tweets_df.loc[tweets_df['Sentiment'] == 0].index
binary_tweets = (
    tweets_df_no_spam
    .drop(neutral_index, axis=0, errors='ignore')
    .drop(9959, axis=0)
)

# tweets_df_all_agree = pd.read_csv('moodstock_data/Sentences_AllAgree.txt', sep='@', encoding='latin-1', header=None)
# tweets_df_all_agree.columns = ['Text','Sentiment']
# tweets_df_all_agree.loc[tweets_df_all_agree['Sentiment'] == 'positive', 'Sentiment'] = 1
# tweets_df_all_agree.loc[tweets_df_all_agree['Sentiment'] == 'negative', 'Sentiment'] = -1
# tweets_df_all_agree.tail()

In [None]:
# Peek at one row (position 92) of the filtered tweets.
binary_tweets.iloc[92]

Label                                           0
Text         $BABA\nshredddd\nover $200 explosion
Sentiment                                       1
Name: 9961, dtype: object

In [None]:
y_tweets = binary_tweets['Sentiment'].values

# Clean and tokenise each tweet, then strip entity placeholders and single letters.
x_tweets = [
    sentiment_analysis_pipeline(filtering_pipeline(text))
    for text in binary_tweets['Text'].values
]

## Test on tweets SGD

In [None]:
# One majority-vote prediction per preprocessed tweet.
y_hat = [text_to_sentiment_norm(sample) for sample in x_tweets]

In [None]:
# Number of predictions produced.
len(y_hat)

92

In [None]:
# Accuracy of the word-level majority vote against the tweet labels.
from sklearn.metrics import accuracy_score
accuracy_score(y_true=y_tweets, y_pred=y_hat)

0.8214285714285714

In [None]:
# Precision is printed; recall is displayed as the cell's result.
from sklearn.metrics import precision_score, recall_score
print(precision_score(y_true=y_tweets, y_pred=y_hat))
recall_score(y_true=y_tweets, y_pred=y_hat)


0.925531914893617


0.87

In [None]:
# Example: run the cleaning pipeline on a single raw tweet.
filtering_pipeline('$BABA Stock Increased 2.3% to 199.10. The Largest Options Open Interest is on the 17-Jan-20 200 Call with 35,672(OI)')

['ORG', 'stock', 'increased', 'date', 'largest', 'option', 'open', 'jan', 'oi']

In [None]:
# Example: print every date that datefinder detects in a sample sentence.
# substitute_date('17/11/20 200')
matches = datefinder.find_dates('created 17-Jan-20 by ACME Inc. and associates.')
for match in matches:
    print(match)

2020-01-17 00:00:00


In [None]:
# Show every misclassified tweet: raw text, true sentiment and the tokens scored.
# Both sides are converted to arrays so the comparison is explicitly element-wise.
idx = np.where(np.asarray(y_hat) != np.asarray(y_tweets))[0]
for i in idx:
    text, sentiment = binary_tweets.iloc[i][['Text', 'Sentiment']].values
    print(text)
    print(sentiment)
    # Index the list directly: np.array(x_tweets) was rebuilt on every iteration
    # and fails on ragged token lists with recent NumPy.
    print(x_tweets[i])
    print()

In [None]:
# Example: per-word scores and the majority-vote result for three tokens.
text_to_sentiment_norm(['increased', 'far', 'nearing'], True)

           sentiment
increased   0.822870
far         0.904636
nearing     0.670365


1

In [None]:
# Example: one positive and one negative word tie, and a tie resolves to 1.
text_to_sentiment_norm(['very','angry'], True)

['very', 'angry']
[  3.87275611 -11.04012794]
[  3.87275611 -11.04012794]


1