# Sentiment Analysis of Financial News

The goal is to determine the sentiment (positive, negative, neutral) of financial news articles and understand how these sentiments affect stock prices or other financial metrics.

# 1. Data Collection

In [1]:
import pandas as pd
import requests

In [2]:
# using NewsAPI to collect financial news
api_key = 'a4565eb1a7d248a496e6c00deae69b14'
url = f"https://newsapi.org/v2/everything?q=finance&apiKey={api_key}"
response = requests.get(url)
data = response.json()

In [3]:
# Convert to DataFrame
articles = data['articles']
df = pd.DataFrame(articles)
df = df[['publishedAt', 'title', 'description', 'content']]
df['text'] = df['title'] + ' ' + df['description'] + ' ' + df['content']
df.drop(columns=['title', 'description', 'content'], inplace=True)

# 2. Data Preprocessing

In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [5]:
# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rohanmuru/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/rohanmuru/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rohanmuru/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
def preprocess(text):
    # Check if the input is a string, if not, convert it to an empty string
    if not isinstance(text, str):
        text = ""
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    words = word_tokenize(text)
    # Remove stopwords and lemmatize
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [7]:
df['cleaned_text'] = df['text'].apply(preprocess)

# 3. Feature Extraction

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['cleaned_text']).toarray()

# 4. Model Training

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [11]:
# Example sentiment labels (you will need actual sentiment labels for training)
df['sentiment'] = df['cleaned_text'].apply(lambda x: 1 if 'good' in x else 0)

## 4.1 Random Forest

In [12]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)

In [13]:
# Train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [14]:
# Evaluate the model
y_pred_rf = rf_model.predict(X_test)
print(f'Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf)}')
print(classification_report(y_test, y_pred_rf))
print(confusion_matrix(y_test, y_pred_rf))

Random Forest Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

[[20]]


## 4.2 LSTM

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2024-05-31 21:20:47.753543: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [16]:
# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['cleaned_text'])
X = tokenizer.texts_to_sequences(df['cleaned_text'])

In [17]:
# Padding sequences
maxlen = 100
X = pad_sequences(X, padding='post', maxlen=maxlen)

In [18]:
print(f'Shape of padded sequence: {X.shape}')

Shape of padded sequence: (100, 100)


In [19]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.optimizers import Adam

In [21]:
# Define the LSTM model
embedding_dim = 100
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=embedding_dim, input_length=maxlen))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

In [22]:
# Compile the model
model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])

In [23]:
# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test), verbose=2)

Epoch 1/5
2/2 - 3s - loss: 0.6988 - accuracy: 0.2125 - val_loss: 0.6294 - val_accuracy: 1.0000 - 3s/epoch - 2s/step
Epoch 2/5
2/2 - 0s - loss: 0.6207 - accuracy: 1.0000 - val_loss: 0.5488 - val_accuracy: 1.0000 - 177ms/epoch - 89ms/step
Epoch 3/5
2/2 - 0s - loss: 0.5414 - accuracy: 1.0000 - val_loss: 0.4396 - val_accuracy: 1.0000 - 178ms/epoch - 89ms/step
Epoch 4/5
2/2 - 0s - loss: 0.4290 - accuracy: 1.0000 - val_loss: 0.2602 - val_accuracy: 1.0000 - 178ms/epoch - 89ms/step
Epoch 5/5
2/2 - 0s - loss: 0.2427 - accuracy: 1.0000 - val_loss: 0.0493 - val_accuracy: 1.0000 - 167ms/epoch - 84ms/step


In [24]:
# Evaluate the model
y_pred_lstm = (model.predict(X_test) > 0.5).astype("int32")
print(f'LSTM Accuracy: {accuracy_score(y_test, y_pred_lstm)}')
print(classification_report(y_test, y_pred_lstm))
print(confusion_matrix(y_test, y_pred_lstm))

LSTM Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20

    accuracy                           1.00        20
   macro avg       1.00      1.00      1.00        20
weighted avg       1.00      1.00      1.00        20

[[20]]
