In [1]:
# Necessary import libs
import pandas as pd
import nltk
import pickle
import praw

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Import sample for model building
df = pd.read_csv('stock_data.csv')

In [3]:
# Stem, Remove stopwords, and then tokenize every word into a preprocessed information
ps = PorterStemmer()

preprocessed_text = []

for text in df['Text']:
    tokens = word_tokenize(text.lower())
    tokens = [ps.stem(token) for token in tokens if token.isalpha()]
    preprocessed_text.append(' '.join(tokens))

df['Preprocessed Text'] = preprocessed_text

# Apply the Sentiment Analyzer
sid = SentimentIntensityAnalyzer()

sentiment_labels = []

# Loop through all of the text and assign labels as 'Sentiment'
for text in df['Preprocessed Text']:
    scores = sid.polarity_scores(text)
    if scores['compound'] > 0:
        sentiment_labels.append('positive')
    elif scores['compound'] < 0:
        sentiment_labels.append('negative')
    else:
        sentiment_labels.append('neutral')

df['Sentiment'] = sentiment_labels

In [4]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['Preprocessed Text'], df['Sentiment'], test_size=0.2, random_state=42)

In [5]:
# We vectorize the words into numerical features
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# And train a logistic regression model with those features
clf = LogisticRegression()
clf.fit(X_train_vec, y_train)

# Save your model for later implementation
with open('model.pkl', 'wb') as f:
    pickle.dump(clf, f)

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [6]:
# Make your predictions and see how we did
y_pred = clf.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.8524590163934426
Precision: 0.8462566420735208
Recall: 0.8170836492913659
F1-score: 0.827668449236843
