In [103]:
# import necessary libraries
import string
import re

import numpy as np
import pandas as pd

# models
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from gensim.models import Word2Vec

# optimization
from sklearn.model_selection import GridSearchCV
from hyperopt import fmin, tpe, hp # to use with gensim models


# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/minrei/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/minrei/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/minrei/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/minrei/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [73]:
# load data
df = pd.read_csv("stock_data.csv")
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


# Preprocessing

1. Cleaning and normalizing.
2. Remove stop words
3. Stemming and lemmatization
4. Create a vocabulary

In [74]:
# Cleans & normalizes data
def preprocess_data(text):
    # Remove punctuation and special characters
    text = "".join([char for char in text if char not in string.punctuation])

    # Lowercase all the words
    text = text.lower()

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stop words
    tokens = [token for token in tokens if token not in stopwords.words("english")]
    
    # Lemmatize the words
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Return the cleaned, normalized, and pre-processed text
    return " ".join(tokens)

df["cleaned_text"] = df["Text"].apply(preprocess_data)
df.head()

Unnamed: 0,Text,Sentiment,cleaned_text
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1,kicker watchlist xide tit soq pnk cpw bpz aj t...
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1,user aap movie 55 return feageed indicator 15 ...
2,user I'd be afraid to short AMZN - they are lo...,1,user id afraid short amzn looking like nearmon...
3,MNTA Over 12.00,1,mnta 1200
4,OI Over 21.37,1,oi 2137


In [75]:
# Create vocabulary from processed words
def create_vocabulary(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    vocabulary = set(tokens)

    return vocabulary

# Build vocabulary
vocabulary = set()
for vocab in df["cleaned_text"].apply(create_vocabulary):
    vocabulary = vocabulary.union(vocab)

len(vocabulary)

10937

# Choosing a model architecture

We shall begin with a basic bag-of-words model. This involves representing each comment as a feature vector, where each element of the vector is the count of a specific word in the vocabulary. Next, we train a classification model such as logistic regression, on the feature vectors and their corresponding labels to make predictions about the sentiment of new comments.

In [76]:
# Create a bag-of-words representation of the data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df["cleaned_text"])

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, df["Sentiment"], test_size=0.2)

# Fit train data to a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate performance on test data
score = model.score(X_test, y_test)
score

(4632, 10861)
(1159, 10861)
(4632,)
(1159,)


0.7739430543572045

## Better models

Let us examine if we can achieve a higher score(accuracy) than 0.77. We consider two other model architectures: TF-IDF and word embeddings.

The former improves on the bag-of-words model by assigning higher weights to more informative and unique words, and lower weights to common words. 

The latter represents words in a low-dimensional space. These vectors capture the semantic and syntactic *relationships* between words.

Since TF-IDF is a statistical method based on the frequency of words, it is simpler and interpretable compared to the more complex black-box model of word embeddings.

However, TF-IDF ignores the word order and gramamtical structure while word embeddings capture the context and meaning of words in a sentence.

Therefore, we shall use *word embeddings* to perform our binary sentiment analysis.

In [160]:
# Create a list of sentences
sentences = df["cleaned_text"].str.split()

# Define hyperparameter space for the Word2Vec model
space = {
    # "min_count": hp.quniform("min_count", 1, 3, 1),
    "window": hp.quniform("window", 3, 7, 2),
    "sg": hp.choice("sg", [0, 1]),
    "workers": hp.quniform("workers", 4, 16, 2)
}
# Define the objective function for the Word2Vec model
def objective(params):
    # Create an instance of the Word2Vec model with the given hyperparameters
    w2v = Word2Vec(sentences=sentences, min_count=1, window=params["window"], sg=params["sg"], workers=params["workers"])

    w2v.train(sentences, epochs=w2v.epochs, total_examples=w2v.corpus_count)

    X = []
    for sentence in sentences:
        vector = w2v.wv[sentence].mean(axis=0)
        X.append(vector)

    X_train, X_test, y_train, y_test = train_test_split(X, df["Sentiment"], test_size=0.2)

    model = LogisticRegression()
    model.fit(X_train, y_train)

    # Evaluate
    score = model.score(X_test, y_test)
    return -score


# Use the fmin function to find the optimal values for the hyperparameters
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100)

100%|██████████| 100/100 [01:55<00:00,  1.16s/trial, best loss: -0.7144089732528042]


## Optimization

In spite of hyperoptimization, the best we achieve is an accuracy of 0.71 with our word embedding model. 

In [161]:
print(best)

{'sg': 1, 'window': 6.0, 'workers': 14.0}
