## Importing the libraries needed for the different machine learning models

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

Loading the dataset

In [2]:
# Location of the raw CSV. Set the FINANCIAL_NEWS_CSV environment variable to
# point at your own copy of the file; when it is unset, the original
# hard-coded path is used so existing setups keep working unchanged.
file_path = os.environ.get(
    "FINANCIAL_NEWS_CSV",
    r"D:\Zishan\Projects\SentiNewzzz\FinancialMarketNews.csv",
)
# The file is decoded as ISO-8859-1 instead of pandas' default UTF-8.
df = pd.read_csv(file_path, encoding="ISO-8859-1")


Data Preprocessing

In [3]:
# Collapse every headline column (any column whose name contains "News")
# into a single space-separated text field per row.
news_columns = [column for column in df.columns if "News" in column]
# Missing headlines become empty strings so the join below never sees NaN.
df[news_columns] = df[news_columns].fillna("")
df["Combined_News"] = df[news_columns].apply(" ".join, axis=1)
# From here on only the merged text and the target column are kept.
df = df[["Combined_News", "Label"]]

# Tokenizer configuration: vocabulary cap and fixed sequence length.
MAX_VOCAB_SIZE = 5000
MAX_SEQUENCE_LENGTH = 250

# NOTE(review): the tokenizer is fitted on the full dataset, while the
# train/test split only happens in a later cell, so the learned vocabulary
# also reflects text from the test rows.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df["Combined_News"])

# Turn each text into a list of token ids, then pad/truncate at the end so
# every row has exactly MAX_SEQUENCE_LENGTH entries.
sequences = tokenizer.texts_to_sequences(df["Combined_News"])
padded_sequences = pad_sequences(
    sequences,
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="post",
    truncating="post",
)

# Target vector aligned row-for-row with padded_sequences.
labels = np.array(df["Label"])

Train-Test Split and Standardization

In [4]:
# Hold out 20% of the rows for evaluation. The split is seeded and stratified
# on the labels so both partitions keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Learn the scaling statistics from the training partition only, then apply
# that same transformation to both partitions.
# NOTE(review): the columns being standardized are padded token-id
# positions, not continuous measurements -- worth confirming this is the
# intended feature representation.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Creating Different Models

In [7]:

# Candidate classifiers, keyed by the display name used when reporting scores.
# The tree-based estimators are randomized, so they are given a fixed
# random_state (the same seed value the train/test split uses); without it
# their accuracy changes from one run of the notebook to the next.
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(),
}


Training all the models and printing each model's accuracy score

In [8]:

# Fit each candidate on the scaled training data, score it on the scaled
# test data, and remember the accuracy under the model's display name.
accuracy_scores = {}
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")

# Printing all accuracies
# The loop variable is deliberately NOT called `model`: reusing that name
# would rebind `model` from a fitted estimator to a plain string, leaving
# misleading state behind for any later cell that refers to `model`.
print("\nFinal Accuracy Scores:")
for model_name, acc in accuracy_scores.items():
    print(f"{model_name}: {acc:.4f}")


Logistic Regression Accuracy: 0.4892
Decision Tree Accuracy: 0.5072
Random Forest Accuracy: 0.5179
SVM Accuracy: 0.5084

Final Accuracy Scores:
Logistic Regression: 0.4892
Decision Tree: 0.5072
Random Forest: 0.5179
SVM: 0.5084
