<a href="https://colab.research.google.com/github/nurhadfina/Nurhadfina-Github/blob/main/CPC353_ASSIGNMENT2_G32.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ASSIGNMENT 2: Natural Language Processing Semester I 2025/2026
Title: Stock trend prediction with news sentiment

Objectives

O1: Construct natural language processing and deep learning components in problems
involving prediction, classification and sequence modeling in text and speech.

O2: Design solutions using natural language processing and deep learning techniques for
problems in text and speech analytics.

  TASK 1 : Data Loading, Labeling, and Splitting

In [None]:
import pandas as pd

# 1. Read the raw headline / price table from the working directory
df = pd.read_csv('stock_trend.csv')

print("Step 1: Dataset Loaded ")
print(df.head(10))
print("\n")

# 2. Labeling the Data
# Percentage move of the price around each headline:
#   Relative Change (%) = ((After - Before) / Before) * 100
df['Relative_Change'] = 100 * ((df['After'] - df['Before']) / df['Before'])

# Function to define trend based on relative change
def define_trend(change, threshold=10):
    """Classify a percentage price change as a trend label.

    Args:
        change: Relative price change in percent (e.g. 12.5 for +12.5%).
        threshold: Positive cut-off in percent. Moves strictly larger than
            ``threshold`` are uptrends, moves strictly smaller than
            ``-threshold`` are downtrends. Defaults to 10.

    Returns:
        'uptrend', 'downtrend' or 'flat'. A change exactly equal to
        +/-threshold is 'flat'. NaN compares False on both sides and is
        therefore also labelled 'flat'.
    """
    if change > threshold:
        return 'uptrend'
    if change < -threshold:
        return 'downtrend'
    return 'flat'

# Derive the categorical target: one trend label per row, computed from
# that row's relative change
df['Label'] = df['Relative_Change'].apply(define_trend)

print("Step 2: Data Labeled (First 10 lines)")
# Show the prices next to the derived label so the rule can be checked by eye
preview_columns = ['Name', 'Before', 'After', 'Relative_Change', 'Label']
print(df[preview_columns].head(10))
print("\n")

# 3. Splitting the Data
# Sequential (unshuffled) split: 70% Training, 20% Validation, 10% Test.
# The test set takes whatever rows remain after the two int() truncations.
train_size = int(0.7 * len(df))
val_size = int(0.2 * len(df))
val_end = train_size + val_size

# .copy() gives every split its own data. Without it the splits are slices
# of `df`, and assigning a column on them later (e.g. a cleaned 'Title')
# raises pandas' SettingWithCopyWarning.
train_data = df.iloc[:train_size].copy()
val_data = df.iloc[train_size:val_end].copy()
test_data = df.iloc[val_end:].copy()

# Every row must land in exactly one split
assert len(train_data) + len(val_data) + len(test_data) == len(df)

print("Step 3: Data Split")
print(f"Total Records: {len(df)}")
print(f"Training Set: {len(train_data)} records (70%)")
print(f"Validation Set: {len(val_data)} records (20%)")
print(f"Test Set: {len(test_data)} records (10%)")

print("Training Data Sample (First 10 lines)")
print(train_data[['Name', 'Label']].head(10))

Step 1: Dataset Loaded 
                                               Title  \
0  100 startups participate in Maxis' Market Acce...   
1      16.89% stake in Subur Tiasa traded off-market   
2  Najib wanted 1MDB's Genting Sanyen deal sped u...   
3        25bps OPR cut likely in 2H20, says Manulife   
4  A 25-month extension on concession pushes Phar...   
5             3.7% of Yong Tai transacted off-market   
6              3A, Ruberex, Thriven, Kanger, UniWall   
7      40% stake in IWH-CREC may cost Ekovest RM1.5b   
8      4.41% Kronologi Asia shares traded off-market   
9                    4.96% of MMAG traded off market   

                        Time     Name  Quote  Before  After  
0  2019-12-12T23:50:12+08:00    MAXIS   6012   5.160  5.110  
1  2020-02-20T22:41:12+08:00    SUBUR   6904   0.610  0.610  
2  2020-07-16T17:42:30+08:00  GENTING   3182   4.080  4.060  
3  2020-01-16T17:03:43+08:00  MANULFE   1058   2.420  2.420  
4  2019-11-11T10:49:58+08:00   PHARMA   7081   2.

TASK 2

Loading GloVe Word Embeddings

In [None]:
import os
import numpy as np

# Download the GloVe embedding file if not already present
glove_path = 'glove.6B.100d.txt'
if not os.path.exists(glove_path):
   print("Downloading GloVe embeddings...")
   !wget http://nlp.stanford.edu/data/glove.6B.zip
   !unzip glove.6B.zip
   print("GloVe embeddings downloaded.")

embeddings_index = {}
with open(glove_path, encoding='utf8') as f:
  for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs

print(f"Found {len(embeddings_index)} word vectors.")

Found 73261 word vectors.


Text Preprocessing & Tokenization

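1.   Lowercase each headline and strip every character that is not a letter, digit or space.
2.   Convert the headlines to integer sequences with a Keras `Tokenizer` (unknown words map to `<OOV>`) and pad each sequence to a fixed length of 100 tokens.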



LSTM Model Architecture

In [None]:
import numpy as np
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Flatten, Embedding
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.utils.class_weight import compute_class_weight

sent_length = 100   # Fixed length every headline is padded/truncated to
n_features = 100    # Embedding width; must match glove.6B.100d.txt
n_output = 3        # Three classes: uptrend, flat, downtrend

# Cap on the vocabulary: the Tokenizer only emits word indices below this value
max_vocab_size = 10000


def clean_titles(titles):
    """Normalise a Series of headlines for tokenisation.

    Missing values become empty strings, text is lowercased and every
    character other than letters, digits and spaces is removed.

    Args:
        titles: pandas Series of raw headline strings.

    Returns:
        A new Series of cleaned strings; the input is not modified.
    """
    # regex=True is required: pandas >= 2.0 treats the pattern as a literal
    # string by default, which would leave all punctuation in place.
    return (
        titles.fillna('')
        .astype(str)
        .str.lower()
        .str.replace(r'[^A-Za-z0-9 ]', '', regex=True)
    )


# Clean every split the same way so validation/test text matches what the
# tokenizer was fitted on. The raw 'Title' columns are left untouched so the
# original headlines can still be displayed later.
train_titles = clean_titles(train_data['Title'])
val_titles = clean_titles(val_data['Title'])
test_titles = clean_titles(test_data['Title'])

# --- GLOVE EMBEDDING & OOV HANDLING ---
# Fit on the training split only, so no vocabulary leaks from validation/test
tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(train_titles)

# Index 0 is reserved for padding, hence the +1. The min() keeps the matrix
# in line with num_words: indices >= max_vocab_size never appear in sequences.
vocab_size = min(max_vocab_size, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((vocab_size, n_features))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words without a GloVe vector (or with a vector of the wrong width)
    # keep an all-zero row.
    if embedding_vector is not None and len(embedding_vector) == n_features:
        embedding_matrix[i] = embedding_vector

# --- Preprocessing Data for Model Training ---
# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(train_titles)
val_sequences = tokenizer.texts_to_sequences(val_titles)
test_sequences = tokenizer.texts_to_sequences(test_titles)

# Pad sequences
train_padded = sequence.pad_sequences(train_sequences, maxlen=sent_length)
val_padded = sequence.pad_sequences(val_sequences, maxlen=sent_length)
test_padded = sequence.pad_sequences(test_sequences, maxlen=sent_length)

# Map labels to numerical values
label_mapping = {'uptrend': 0, 'flat': 1, 'downtrend': 2}
train_labels = train_data['Label'].map(label_mapping).values
val_labels = val_data['Label'].map(label_mapping).values
test_labels = test_data['Label'].map(label_mapping).values

# Calculate class weights for imbalanced datasets.
# Key each weight by its actual class id: enumerate() would shift the keys
# if one of the classes were missing from the training split.
present_classes = np.unique(train_labels)
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=present_classes,
                                     y=train_labels)
dict_weights = {int(c): float(w) for c, w in zip(present_classes, class_weights)}

print("--- Step 4: Data Preprocessed and Labels Encoded ---")
print(f"Shape of train_padded: {train_padded.shape}")
print(f"Shape of val_padded: {val_padded.shape}")
print(f"Shape of test_padded: {test_padded.shape}")
print(f"Sample train_labels (first 10): {train_labels[:10]}")
print(f"Class weights: {dict_weights}")
print("\n")

# --- MODEL ARCHITECTURE ---
# Input: one row of `sent_length` word indices per headline
inputs = Input(shape=(sent_length,))

# Look the indices up in the pretrained matrix; trainable=False keeps the
# GloVe vectors fixed during training.
embed = Embedding(vocab_size, n_features, weights=[embedding_matrix],
                  trainable=False)(inputs)

# return_sequences=False: the LSTM emits only its final state, shape (batch, 64)
lstm = LSTM(64, return_sequences=False)
outputs_seq = lstm(embed)

# That output is already 2-D, so it feeds the classifier directly; a Flatten
# layer here would be a no-op.
outputs = Dense(n_output, activation='softmax')(outputs_seq)

model = Model(inputs=inputs, outputs=outputs)
# categorical_crossentropy expects one-hot encoded targets
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

Model Training

In [None]:
# Train on one-hot targets. num_classes pins the one-hot width to the model's
# output size; without it to_categorical infers the width from the largest
# label present, which breaks if a split has no example of the highest class id.
# class_weight counteracts the label imbalance (the original note on this
# call: critical for fixing the 0.00 recall).
history = model.fit(
    train_padded, to_categorical(train_labels, num_classes=n_output),
    validation_data=(val_padded, to_categorical(val_labels, num_classes=n_output)),
    epochs=20,
    batch_size=4,
    class_weight=dict_weights,
)

Confusion Matrix Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# 1. Get predictions (indices 0, 1, or 2)
y_pred_probs = model.predict(test_padded)
y_pred = np.argmax(y_pred_probs, axis=1)

# 2. Create Confusion Matrix
# `classes` is listed in label-id order (uptrend=0, flat=1, downtrend=2).
# Passing labels= forces a full 3x3 matrix even when a class never occurs in
# the test labels or predictions, so the tick labels always line up.
classes = ['uptrend', 'flat', 'downtrend']
class_ids = list(range(len(classes)))
cm = confusion_matrix(test_labels, y_pred, labels=class_ids)

# 3. Plot
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=classes, yticklabels=classes)
plt.title('Stock Trend Prediction: Actual vs Predicted')
plt.ylabel('Actual Trend')
plt.xlabel('Predicted Trend')
plt.show()

Performance Evaluation

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# 1. Get the predicted probabilities
y_pred_probs = model.predict(test_padded)

# 2. Convert probabilities to class indices (0, 1, or 2)
y_pred = np.argmax(y_pred_probs, axis=1)

# 3. Calculate and Print Metrics (Requirement 2.6)
# test_labels must be the integer-encoded versions of 'uptrend', 'flat', 'downtrend'.
# labels= keeps the report at three rows even if a class is absent from the
# test labels and predictions (otherwise target_names no longer matches and
# sklearn raises); zero_division=0 reports 0 for such a class without warning.
target_names = ['uptrend', 'flat', 'downtrend']
print("--- Evaluation Metrics ---")
print(classification_report(test_labels, y_pred,
                            labels=list(range(len(target_names))),
                            target_names=target_names,
                            zero_division=0))

# Print 10 lines of predictions as required by substep instructions
# (slicing copes with a test set that has fewer than 10 rows)
print("\n--- Top 10 Test Predictions ---")
for actual, predicted in zip(test_labels[:10], y_pred[:10]):
    print(f"Actual: {actual}, Predicted: {predicted}")

In [None]:
# Readable top-10 table for the report: headline text next to the actual and
# predicted class names (label ids are translated through `classes`)
sample_headlines = test_data['Title'].iloc[:10].values
sample_actual = [classes[label_id] for label_id in test_labels[:10]]
sample_predicted = [classes[label_id] for label_id in y_pred[:10]]

results_df = pd.DataFrame({
    'News Headline': sample_headlines,
    'Actual Label': sample_actual,
    'Predicted Label': sample_predicted,
})

print("--- Sample Predictions ---")
print(results_df)

TASK 3

In [None]:
# Install the third-party packages imported below. The line is active (not
# commented out), so it runs on every execution of this cell; comment it out
# when the packages are already installed in the environment.
!pip -q install transformers datasets accelerate scikit-learn pandas

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed
)

# Fix the random seed (transformers.set_seed) before any Task 3 model is
# created or trained.
set_seed(42)

Tokenize with a pretrained Transformer tokenizer

In [None]:
from datasets import Dataset

# Checkpoint name used for the tokenizer here (and referenced again when the
# classifier is loaded)
model_ckpt = "distilbert-base-uncased"  # lightweight, good baseline
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    """Encode a batch of headlines, truncating each one to at most 128 tokens."""
    return tokenizer(batch["text"], truncation=True, max_length = 128)


# Per-class example counts in the training split, ordered by label id
counts = train_df["label"].value_counts().sort_index()
print("Train label counts:", counts.to_dict())

# Inverse-frequency weights (common baseline), rescaled so that they sum to
# the number of classes
inverse_frequency = 1.0 / counts.values
weights = inverse_frequency / inverse_frequency.sum() * len(counts)
class_weights = torch.tensor(weights, dtype=torch.float)

# NOTE(review): the class order named in this message assumes label ids
# 0, 1, 2 correspond to downtrend, flat, uptrend - confirm against label2id.
print("Class weights (downtrend, flat, uptrend):", class_weights.tolist())

# Keep only the text and label columns, wrap each split as a Dataset and
# tokenize it in batches
model_columns = ["text", "label"]
train_ds = Dataset.from_pandas(train_df[model_columns]).map(tokenize, batched=True)
val_ds   = Dataset.from_pandas(val_df[model_columns]).map(tokenize, batched=True)
test_ds  = Dataset.from_pandas(test_df[model_columns]).map(tokenize, batched=True)

# Padding is applied by the collator when batches are assembled
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_ds


Define model + metrics

In [None]:
# Load the pretrained checkpoint as a 3-label sequence classifier, passing the
# id <-> name label mappings through to from_pretrained
label_config = {
    "num_labels": 3,
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, **label_config)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class of each row
            is the argmax of its logits over the last axis.

    Returns:
        dict with keys "accuracy", "precision_macro", "recall_macro" and
        "f1_macro". Undefined precision/recall values are reported as 0
        (zero_division=0).
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)

    precision, recall, f1_score, _support = precision_recall_fscore_support(
        labels, predicted_ids, average="macro", zero_division=0
    )

    return dict(
        accuracy=accuracy_score(labels, predicted_ids),
        precision_macro=precision,
        recall_macro=recall,
        f1_macro=f1_score,
    )


 Fine-tune (training + validation)

In [None]:
training_args = TrainingArguments(
    # Where checkpoints are written
    output_dir="transformer_stock_trend",
    report_to="none",
    logging_steps=200,

    # Optimisation settings
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,

    # Evaluate and save once per epoch, then reload the checkpoint with the
    # best "f1_macro" (a key returned by compute_metrics)
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
)

import torch.nn as nn
from transformers import Trainer

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the notebook-level `class_weights` tensor."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on `inputs` and return the class-weighted cross-entropy loss.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch mapping; read for "labels" and forwarded unchanged
                to the model.
            return_outputs: When true, return ``(loss, outputs)``, not only
                the loss.
            **kwargs: Accepted and ignored.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple.
        """
        targets = inputs.get("labels")
        model_outputs = model(**inputs)
        scores = model_outputs.get("logits")

        # Move the weights to the logits' device before building the criterion
        criterion = nn.CrossEntropyLoss(weight=class_weights.to(scores.device))
        weighted_loss = criterion(scores, targets)

        if return_outputs:
            return (weighted_loss, model_outputs)
        return weighted_loss



import inspect

# Trainer's `tokenizer=` argument is deprecated in favour of
# `processing_class=`. Use whichever name the installed transformers version
# declares, so the cell runs both on older releases and on ones that no longer
# accept `tokenizer=`.
tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)


trainer.train()

Evaluate on validation + test

In [None]:
# Score the trainer's current model on the validation split. With
# load_best_model_at_end=True in the training arguments this is expected to be
# the checkpoint with the best f1_macro. The metrics dict is the cell output.
print("Best checkpoint metrics on VALIDATION:")
val_metrics = trainer.evaluate(eval_dataset=val_ds)
val_metrics

In [None]:
# Predict on TEST
test_pred = trainer.predict(test_ds)
test_logits = test_pred.predictions
test_labels = test_pred.label_ids
test_preds = np.argmax(test_logits, axis=-1)

print("TEST metrics (macro):")
acc = accuracy_score(test_labels, test_preds)
p, r, f1, _ = precision_recall_fscore_support(test_labels, test_preds, average="macro", zero_division=0)
print({"accuracy": acc, "precision_macro": p, "recall_macro": r, "f1_macro": f1})

# Fix the set and order of classes explicitly. Without labels=, a class that
# never occurs in the test labels or predictions makes target_names mismatch
# (classification_report raises) and shrinks the confusion matrix.
label_ids = sorted(id2label.keys())
label_names = [id2label[i] for i in label_ids]

print("\nClassification report:")
print(classification_report(test_labels, test_preds, labels=label_ids,
                            target_names=label_names, zero_division=0))

print("Confusion matrix (rows=true, cols=pred):")
confusion_matrix(test_labels, test_preds, labels=label_ids)