# Load Datasets into Pandas

In [None]:
import pandas as pd

df_news_sentiment = pd.read_csv("/content/Sentiment_Analysis_Financial_News.csv", encoding='latin-1')
df_yahoo_finance_2018_2023 = pd.read_csv("/content/Yahoo_Finance_2018_2023.csv", encoding='latin-1')
df_stock_news_sentiment = pd.read_csv("/content/Stock_News_Sentiment_Analysis.csv", encoding='latin-1')
df_massive_yahoo_finance = pd.read_csv("/content/Massive_Yahoo_Finance.csv", encoding='latin-1')

print("Financial News Sentiment Dataset:")
display(df_news_sentiment.head())
print("\nYahoo Finance Dataset (2018-2023):")
display(df_yahoo_finance_2018_2023.head())
print("\nStock News Sentiment Analysis Dataset:")
display(df_stock_news_sentiment.head())
print("\nMassive Yahoo Finance Dataset:")
display(df_massive_yahoo_finance.head())

Financial News Sentiment Dataset:


Unnamed: 0,neutral,"According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing ."
0,neutral,Technopolis plans to develop in stages an area...
1,negative,The international electronic industry company ...
2,positive,With the new production plant the company woul...
3,positive,According to the company 's updated strategy f...
4,positive,FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...



Yahoo Finance Dataset (2018-2023):


Unnamed: 0,ï»¿Date,Open,High,Low,Close*,Adj Close**,Volume
0,"Apr 28, 2023",33797.43,34104.56,33728.4,34098.16,34098.16,354310000
1,"Apr 27, 2023",33381.66,33859.75,33374.65,33826.16,33826.16,343240000
2,"Apr 26, 2023",33596.34,33645.83,33235.85,33301.87,33301.87,321170000
3,"Apr 25, 2023",33828.34,33875.49,33525.39,33530.83,33530.83,297880000
4,"Apr 24, 2023",33805.04,33891.15,33726.09,33875.4,33875.4,252020000



Stock News Sentiment Analysis Dataset:


Unnamed: 0.1,Unnamed: 0,Sentiment,Sentence
0,0,0,"According to Gran , the company has no plans t..."
1,1,1,"For the last quarter of 2010 , Componenta 's n..."
2,2,1,"In the third quarter of 2010 , net sales incre..."
3,3,1,Operating profit rose to EUR 13.1 mn from EUR ...
4,4,1,"Operating profit totalled EUR 21.1 mn , up fro..."



Massive Yahoo Finance Dataset:


Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Company
0,2018-11-29 00:00:00-05:00,43.829761,43.863354,42.639594,43.083508,167080000,0.0,0.0,AAPL
1,2018-11-29 00:00:00-05:00,104.769074,105.519257,103.534595,104.636131,28123200,0.0,0.0,MSFT
2,2018-11-29 00:00:00-05:00,54.176498,55.0075,54.099998,54.729,31004000,0.0,0.0,GOOGL
3,2018-11-29 00:00:00-05:00,83.749496,84.499496,82.616501,83.678497,132264000,0.0,0.0,AMZN
4,2018-11-29 00:00:00-05:00,39.692784,40.064904,38.735195,39.037853,54917200,0.04,0.0,NVDA


# Exploratory Data Analysis (EDA)

In [None]:
print("Column Names in Each Dataset:\n")
print("News Sentiment:", df_news_sentiment.columns)
print("Yahoo Finance 2018-2023:", df_yahoo_finance_2018_2023.columns)
print("Stock News Sentiment:", df_stock_news_sentiment.columns)
print("Massive Yahoo Finance:", df_massive_yahoo_finance.columns)

print("\nMissing Values in Each Dataset:\n")
print("News Sentiment:\n", df_news_sentiment.isnull().sum())
print("\nYahoo Finance 2018-2023:\n", df_yahoo_finance_2018_2023.isnull().sum())
print("\nStock News Sentiment:\n", df_stock_news_sentiment.isnull().sum())
print("\nMassive Yahoo Finance:\n", df_massive_yahoo_finance.isnull().sum())


Column Names in Each Dataset:

News Sentiment: Index(['neutral', 'According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .'], dtype='object')
Yahoo Finance 2018-2023: Index(['ï»¿Date', 'Open', 'High', 'Low', 'Close*', 'Adj Close**', 'Volume'], dtype='object')
Stock News Sentiment: Index(['Unnamed: 0', 'Sentiment', 'Sentence'], dtype='object')
Massive Yahoo Finance: Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Company'],
      dtype='object')

Missing Values in Each Dataset:

News Sentiment:
 neutral                                                                                                                            0
According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .    0
dtype: int64

Yahoo Finance 2018-2023:
 ï»¿Date        1
Open           1
High           1
Low            1
Close*   

# Data Cleaning & Preprocessing

In [None]:
df_news_sentiment.columns = ["Sentiment", "News_Headline"]

df_yahoo_finance_2018_2023.rename(columns={
    "ï»¿Date": "Date", "Close*": "Close", "Adj Close**": "Adj_Close"
}, inplace=True)

df_stock_news_sentiment.drop(columns=["Unnamed: 0"], inplace=True)

df_yahoo_finance_2018_2023["Date"] = pd.to_datetime(df_yahoo_finance_2018_2023["Date"],utc=True)
df_massive_yahoo_finance["Date"] = pd.to_datetime(df_massive_yahoo_finance["Date"],utc =True)

df_yahoo_finance_2018_2023.dropna(inplace=True)
df_stock_news_sentiment.dropna(inplace=True)

print("Cleaned Column Names:\n")
print("News Sentiment:", df_news_sentiment.columns)
print("Yahoo Finance 2018-2023:", df_yahoo_finance_2018_2023.columns)
print("Stock News Sentiment:", df_stock_news_sentiment.columns)
print("Massive Yahoo Finance:", df_massive_yahoo_finance.columns)


Cleaned Column Names:

News Sentiment: Index(['Sentiment', 'News_Headline'], dtype='object')
Yahoo Finance 2018-2023: Index(['Date', 'Open', 'High', 'Low', 'Close', 'Adj_Close', 'Volume'], dtype='object')
Stock News Sentiment: Index(['Sentiment', 'Sentence'], dtype='object')
Massive Yahoo Finance: Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Company'],
      dtype='object')


In [None]:
for df in [df_yahoo_finance_2018_2023, df_massive_yahoo_finance]:
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Day"] = df["Date"].dt.day
    df["Weekday"] = df["Date"].dt.weekday

print("News Sentiment Distribution:\n", df_news_sentiment["Sentiment"].value_counts())
print("\nStock News Sentiment Distribution:\n", df_stock_news_sentiment["Sentiment"].value_counts())

News Sentiment Distribution:
 Sentiment
neutral     2878
positive    1363
negative     604
Name: count, dtype: int64

Stock News Sentiment Distribution:
 Sentiment
1    55724
0    53026
Name: count, dtype: int64


# Text Preprocessing (NLP Pipeline)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download("stopwords")
nltk.download("punkt_tab")
nltk.download("wordnet")

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

df_news_sentiment["Cleaned_News"] = df_news_sentiment["News_Headline"].apply(clean_text)
df_stock_news_sentiment["Cleaned_Sentence"] = df_stock_news_sentiment["Sentence"].apply(clean_text)


print(df_news_sentiment[["News_Headline", "Cleaned_News"]].head())
print(df_stock_news_sentiment[["Sentence", "Cleaned_Sentence"]].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                       News_Headline  \
0  Technopolis plans to develop in stages an area...   
1  The international electronic industry company ...   
2  With the new production plant the company woul...   
3  According to the company 's updated strategy f...   
4  FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...   

                                        Cleaned_News  
0  technopolis plan develop stage area less squar...  
1  international electronic industry company elco...  
2  new production plant company would increase ca...  
3  according company updated strategy year baswar...  
4  financing aspocomp growth aspocomp aggressivel...  
                                            Sentence  \
0  According to Gran , the company has no plans t...   
1  For the last quarter of 2010 , Componenta 's n...   
2  In the third quarter of 2010 , net sales incre...   
3  Operating profit rose to EUR 13.1 mn from EUR ...   
4  Operating profit totalled EUR 21.1 mn , up fro... 

# Feature Engineering

 Task 1: Convert News Sentences into TF-IDF Vectors

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)

X_tfidf_news = tfidf_vectorizer.fit_transform(df_news_sentiment["Cleaned_News"])
X_tfidf_stock = tfidf_vectorizer.transform(df_stock_news_sentiment["Cleaned_Sentence"])

import pandas as pd
tfidf_df_news = pd.DataFrame(X_tfidf_news.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df_stock = pd.DataFrame(X_tfidf_stock.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

print(tfidf_df_news.head())
print(tfidf_df_stock.head())


    ab  abb  abc  ability  able  abloy  abn  abp  abroad   ac  ...  zero  \
0  0.0  0.0  0.0      0.0   0.0    0.0  0.0  0.0     0.0  0.0  ...   0.0   
1  0.0  0.0  0.0      0.0   0.0    0.0  0.0  0.0     0.0  0.0  ...   0.0   
2  0.0  0.0  0.0      0.0   0.0    0.0  0.0  0.0     0.0  0.0  ...   0.0   
3  0.0  0.0  0.0      0.0   0.0    0.0  0.0  0.0     0.0  0.0  ...   0.0   
4  0.0  0.0  0.0      0.0   0.0    0.0  0.0  0.0     0.0  0.0  ...   0.0   

   zgody  zinc  zinclead  zip  zloty  zoltan  zone  zoo  zte  
0    0.0   0.0       0.0  0.0    0.0     0.0   0.0  0.0  0.0  
1    0.0   0.0       0.0  0.0    0.0     0.0   0.0  0.0  0.0  
2    0.0   0.0       0.0  0.0    0.0     0.0   0.0  0.0  0.0  
3    0.0   0.0       0.0  0.0    0.0     0.0   0.0  0.0  0.0  
4    0.0   0.0       0.0  0.0    0.0     0.0   0.0  0.0  0.0  

[5 rows x 5000 columns]
    ab  abb  abc  ability  able  abloy  abn  abp  abroad   ac  ...      zero  \
0  0.0  0.0  0.0      0.0   0.0    0.0  0.0  0.0     0.0  0.

 Task 2: Convert News Sentences into Word2Vec Embeddings

In [None]:
from gensim.models import Word2Vec

tokenized_news = [sentence.split() for sentence in df_news_sentiment["Cleaned_News"]]
tokenized_stock = [sentence.split() for sentence in df_stock_news_sentiment["Cleaned_Sentence"]]

w2v_model = Word2Vec(sentences=tokenized_news + tokenized_stock, vector_size=100, window=5, min_count=2, workers=4)

def get_average_w2v(sentence):
    words = sentence.split()
    vector = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    return sum(vector) / len(vector) if vector else [0] * 100

df_news_sentiment["Word2Vec_Embedding"] = df_news_sentiment["Cleaned_News"].apply(get_average_w2v)
df_stock_news_sentiment["Word2Vec_Embedding"] = df_stock_news_sentiment["Cleaned_Sentence"].apply(get_average_w2v)

print(df_news_sentiment[["Word2Vec_Embedding"]].head())
print(df_stock_news_sentiment[["Word2Vec_Embedding"]].head())


                                  Word2Vec_Embedding
0  [-0.8845273, -0.3170042, -0.14528582, -0.04821...
1  [-0.8032433, -0.34212357, -0.28852963, -0.0695...
2  [-1.3656639, -0.3422218, -0.35618526, -0.45349...
3  [-1.6733301, -1.0615385, -0.07469585, -0.20160...
4  [-0.6506081, -0.20488225, -0.10921697, -0.0948...
                                  Word2Vec_Embedding
0  [-1.0681955, -0.58142173, -0.010430786, -0.363...
1  [-1.7127973, -0.94873494, -0.48745197, -0.0166...
2  [-2.728956, -1.9643903, -0.64130604, 0.0505127...
3  [-2.506668, -1.8205723, -0.64443004, 0.2000257...
4  [-2.5212386, -1.9828606, -0.5259349, 0.1741440...


#  Train Sentiment Classification Models

Task 1: Train Logistic Regression with TF-IDF

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

X_news = X_tfidf_news
y_news = df_news_sentiment["Sentiment"]

X_train, X_test, y_train, y_test = train_test_split(X_news, y_news, test_size=0.2, random_state=42)

log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train, y_train)

y_pred = log_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.7285861713106295
Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.40      0.53       115
     neutral       0.72      0.94      0.81       567
    positive       0.77      0.44      0.56       287

    accuracy                           0.73       969
   macro avg       0.75      0.59      0.63       969
weighted avg       0.74      0.73      0.70       969



Task 2: Train Deep Learning Model with Word2Vec

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

X_news_w2v = np.vstack(df_news_sentiment["Word2Vec_Embedding"])
y_news = df_news_sentiment["Sentiment"].replace({'neutral': 0, 'positive': 1, 'negative': 2})  # Convert to numerical labels

X_train, X_test, y_train, y_test = train_test_split(X_news_w2v, y_news, test_size=0.2, random_state=42)

model = Sequential([
    Dense(128, activation="relu", input_shape=(100,)),
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dropout(0.3),
    Dense(3, activation="softmax")
])


model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

loss, accuracy = model.evaluate(X_test, y_test)
print("Neural Network Accuracy:", accuracy)


  y_news = df_news_sentiment["Sentiment"].replace({'neutral': 0, 'positive': 1, 'negative': 2})  # Convert to numerical labels
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - accuracy: 0.6310 - loss: 0.9109 - val_accuracy: 0.6202 - val_loss: 0.8401
Epoch 2/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.6258 - loss: 0.8635 - val_accuracy: 0.6192 - val_loss: 0.8308
Epoch 3/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.6393 - loss: 0.8226 - val_accuracy: 0.6254 - val_loss: 0.8237
Epoch 4/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6239 - loss: 0.8464 - val_accuracy: 0.6264 - val_loss: 0.8167
Epoch 5/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6387 - loss: 0.8353 - val_accuracy: 0.6316 - val_loss: 0.8175
Epoch 6/10
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.6421 - loss: 0.8117 - val_accuracy: 0.6326 - val_loss: 0.8092
Epoch 7/10
[1m122/122[0m

# Sentiment Model Results Summary
We have trained two models for financial news sentiment analysis:

Logistic Regression with TF-IDF
*  Accuracy: 72.86%


* Best Performance on Neutral Sentiment (Precision: 72%, Recall: 94%)
* Struggles with Negative and Positive Sentiments

Neural Network with Word2Vec
* Final Accuracy: 64.50% (Validation), 63.38% (Test)
* Gradual Improvement Across Epochs but still underperforming

**Key Observations**
* Logistic Regression with TF-IDF performs better than the Neural Network on this dataset.
*  Neural Network struggles to generalize (likely due to limited training data or suboptimal hyperparameters).
* Imbalanced class distribution – the model is biased toward the neutral class (due to its high support count).