## Stock Market Prediction Via NLP Sentiment Analysis

In [46]:
import pandas as pd

### Load Data

In [49]:
cnbc_df = pd.read_csv('./data/cnbc_data_truncated.csv')
cnbc_df

Unnamed: 0,Date,Summary
0,2024-06-03,All three major averages wrapped up a solid mo...
1,2024-06-04,Paramount and Skydance have agreed to terms of...
2,2024-06-05,Elon Musk ordered Nvidia to send thousands of ...
3,2024-06-06,The S&P 500 and the Nasdaq Composite hit new r...
4,2024-06-07,The indexes closed with little movement Thursd...
...,...,...
101,2024-10-25,A federal judge has blocked the proposed merge...
102,2024-10-28,"Futures are trading higher Monday morning, led..."
103,2024-10-29,JPMorgan Chase filed lawsuits against customer...
104,2024-10-30,Alphabet reported third-quarter earnings resul...


In [51]:
stock_df = pd.read_csv('./data/stock_data_truncated.csv')
stock_df

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,0,2024-10-31,5775.34,5775.34,5702.86,5705.45,5705.45,4425660000
1,1,2024-10-30,5832.65,5850.94,5811.28,5813.67,5813.67,3851120000
2,2,2024-10-29,5819.68,5847.19,5802.17,5832.92,5832.92,3879100000
3,3,2024-10-28,5833.93,5842.92,5823.08,5823.52,5823.52,3691280000
4,4,2024-10-25,5826.75,5862.82,5799.98,5808.12,5808.12,3501280000
...,...,...,...,...,...,...,...,...
101,101,2024-06-07,5343.81,5375.08,5331.33,5346.99,5346.99,3692760000
102,102,2024-06-06,5357.80,5362.35,5335.36,5352.96,5352.96,3609990000
103,103,2024-06-05,5314.48,5354.16,5297.64,5354.03,5354.03,3591460000
104,104,2024-06-04,5278.24,5298.80,5257.63,5291.34,5291.34,3707900000


In [53]:
merged_df = pd.merge(cnbc_df, stock_df, on="Date", how="inner")
merged_df.head()

Unnamed: 0.1,Date,Summary,Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
0,2024-06-03,All three major averages wrapped up a solid mo...,105,5297.15,5302.11,5234.32,5283.4,5283.4,4046920000
1,2024-06-04,Paramount and Skydance have agreed to terms of...,104,5278.24,5298.8,5257.63,5291.34,5291.34,3707900000
2,2024-06-05,Elon Musk ordered Nvidia to send thousands of ...,103,5314.48,5354.16,5297.64,5354.03,5354.03,3591460000
3,2024-06-06,The S&P 500 and the Nasdaq Composite hit new r...,102,5357.8,5362.35,5335.36,5352.96,5352.96,3609990000
4,2024-06-07,The indexes closed with little movement Thursd...,101,5343.81,5375.08,5331.33,5346.99,5346.99,3692760000


In [55]:
merged_df['PctChg'] = merged_df['Close'].pct_change() * 100
merged_df = merged_df.dropna(subset=['PctChg']).reset_index(drop=True)
merged_df.head()

Unnamed: 0.1,Date,Summary,Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,PctChg
0,2024-06-04,Paramount and Skydance have agreed to terms of...,104,5278.24,5298.8,5257.63,5291.34,5291.34,3707900000,0.150282
1,2024-06-05,Elon Musk ordered Nvidia to send thousands of ...,103,5314.48,5354.16,5297.64,5354.03,5354.03,3591460000,1.184766
2,2024-06-06,The S&P 500 and the Nasdaq Composite hit new r...,102,5357.8,5362.35,5335.36,5352.96,5352.96,3609990000,-0.019985
3,2024-06-07,The indexes closed with little movement Thursd...,101,5343.81,5375.08,5331.33,5346.99,5346.99,3692760000,-0.111527
4,2024-06-10,Stock futures slipped Monday morning after pol...,100,5341.22,5365.79,5331.52,5360.79,5360.79,3622280000,0.258089


In [57]:
merged_df['Label'] = (merged_df['PctChg'] >= 0).astype(int)
merged_df.head()

Unnamed: 0.1,Date,Summary,Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,PctChg,Label
0,2024-06-04,Paramount and Skydance have agreed to terms of...,104,5278.24,5298.8,5257.63,5291.34,5291.34,3707900000,0.150282,1
1,2024-06-05,Elon Musk ordered Nvidia to send thousands of ...,103,5314.48,5354.16,5297.64,5354.03,5354.03,3591460000,1.184766,1
2,2024-06-06,The S&P 500 and the Nasdaq Composite hit new r...,102,5357.8,5362.35,5335.36,5352.96,5352.96,3609990000,-0.019985,0
3,2024-06-07,The indexes closed with little movement Thursd...,101,5343.81,5375.08,5331.33,5346.99,5346.99,3692760000,-0.111527,0
4,2024-06-10,Stock futures slipped Monday morning after pol...,100,5341.22,5365.79,5331.52,5360.79,5360.79,3622280000,0.258089,1


### Establish Baseline

In [60]:
label_counts = merged_df['Label'].value_counts()
print(label_counts)

Label
1    61
0    44
Name: count, dtype: int64


In [62]:
#The baseline is calculated by simply guessing the majority outcome.
baseline = 61/105
baseline

0.580952380952381

### Experiment with models and hyperparameters

In [87]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

In [89]:
# Download required NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess text using lemmatizing
def preprocess_with_lemmatizing(text):
    tokens = nltk.word_tokenize(text.lower())
    return " ".join([lemmatizer.lemmatize(word) for word in tokens if word.isalnum()])

# Apply preprocessing
merged_df['Summary_Lemmatized'] = merged_df['Summary'].apply(preprocess_with_lemmatizing)

# Tokenization
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(merged_df['Summary_Lemmatized'])
sequences = tokenizer.texts_to_sequences(merged_df['Summary_Lemmatized'])
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

# Encode labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(merged_df['Label'])

[nltk_data] Downloading package punkt to /Users/odeanmaye/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/odeanmaye/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/odeanmaye/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [91]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

In [99]:
# Build the model
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32, return_sequences=True)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/20




ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None,), output.shape=(None, 100, 1)

In [95]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.4286 - loss: 0.7123
Test Accuracy: 0.4285714328289032


In [28]:
model.save('text_classification_model.h5')

In [30]:
predictions = model.predict(X_test)
predictions = (predictions > 0.5).astype(int)

Unnamed: 0,Preprocessing,Classifier,Best Parameters,Accuracy,Report
0,CountVectorizer,LogisticRegression,{},0.628205,precision recall f1-score ...
1,CountVectorizer,DecisionTreeClassifier,{},0.602564,precision recall f1-score ...
2,CountVectorizer,MultinomialNB,{},0.564103,precision recall f1-score ...
3,TfidfVectorizer,LogisticRegression,{},0.641026,precision recall f1-score ...
4,TfidfVectorizer,DecisionTreeClassifier,{},0.589744,precision recall f1-score ...
5,TfidfVectorizer,MultinomialNB,{},0.666667,precision recall f1-score ...
6,CountVectorizer,LogisticRegression,{},0.641026,precision recall f1-score ...
7,CountVectorizer,DecisionTreeClassifier,{},0.717949,precision recall f1-score ...
8,CountVectorizer,MultinomialNB,{},0.538462,precision recall f1-score ...
9,TfidfVectorizer,LogisticRegression,{},0.641026,precision recall f1-score ...
