## Project: NLP Pipeline - Group 48

In [2]:
# Project
#Necessary Imports & Libraries
#!pip install simpletransformers
#!pip install tensorflow
#!pip install tensorflow-addons

import os
import torch
import spacy
import joblib
import numpy as np
import pandas as pd
import tensorflow as tf
from torch.utils.data import Dataset
tf.config.run_functions_eagerly(True)
from sklearn.utils import resample
from tensorflow.keras.models import Model
from scipy.sparse import hstack, csr_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report, accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Layer
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense, Activation




## A.1) Data Pre-processing (NERC)
- Sentence Grouping: Converts flat token-tag data into grouped sentences.
- Index Mapping: Builds vocab and tag dictionaries with PAD and UNK handling.
- Padding & One-Hot: Pads sequences to uniform length and one-hot encodes tags for training.

In [3]:
# Load the token-level NER training data (tab-separated).
df = pd.read_csv("NER_train_generated.tsv", sep="\t")
# Fill missing sentence IDs (if any) by carrying the last seen ID forward
df["sentence_id"] = df["sentence_id"].ffill()

# Group into sentences: each sentence is a list of (token, tag) pairs
sentences = []
for _, group in df.groupby("sentence_id"):
    sentence = list(zip(group["token"].values, group["BIO_NER_tag"].values))
    sentences.append(sentence)

# Build vocab and tag mappings.
# The unique values are sorted so that every kernel run assigns the same id to
# the same word/tag; iterating a bare set of strings gives an order that varies
# between Python processes. key=str keeps the sort total even if pandas parsed
# some token as NaN (a float) next to ordinary strings.
words = sorted(set(w for s in sentences for w, _ in s), key=str)
tags = sorted(set(t for s in sentences for _, t in s), key=str)

# Word ids start at 2: id 0 is reserved for padding, id 1 for unknown words
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1

# Tag ids start at 1: id 0 is reserved for padding
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0

idx2tag = {i: t for t, i in tag2idx.items()}

# Convert sentences and tags to index format
X = [[word2idx.get(w, word2idx["UNK"]) for w, _ in s] for s in sentences]
y = [[tag2idx[t] for _, t in s] for s in sentences]

# Pad sequences to the same length (zeros are appended after the real tokens)
max_len = 100
X = pad_sequences(X, maxlen=max_len, padding='post')
y = pad_sequences(y, maxlen=max_len, padding='post')

# One-hot encode tag sequences (one array per sentence)
y_cat = [to_categorical(i, num_classes=len(tag2idx)) for i in y]


### A.2) Named Entity Recognition (NER) with BiLSTM + Softmax
- BiLSTM + Softmax: Predicts NER tags per token using bidirectional LSTM and TimeDistributed softmax.

- One-hot Tagging: Pads sequences and uses one-hot encoding for BIO labels.

- No CRF: Outputs tags independently without sequence constraints.

In [4]:

# Load the NER TSV data
df = pd.read_csv("NER_train_generated.tsv", sep="\t")
df["sentence_id"] = df["sentence_id"].ffill()

# Group into sentences: one list of (token, tag) pairs per sentence_id
sentences = []
for _, group in df.groupby("sentence_id"):
    sentence = list(zip(group["token"].values, group["BIO_NER_tag"].values))
    sentences.append(sentence)

# Create vocabulary and tag mappings.
# Sorting makes the word/tag -> id assignment identical on every run, so the
# pickled dictionaries saved below always match the saved model weights.
# key=str keeps the sort total even if a token was parsed as NaN.
words = sorted(set(w for s in sentences for w, _ in s), key=str)
tags = sorted(set(t for s in sentences for _, t in s), key=str)

# ids 0 and 1 are reserved for padding and unknown words
word2idx = {w: i + 2 for i, w in enumerate(words)}
word2idx["PAD"] = 0
word2idx["UNK"] = 1

# id 0 is reserved for padding
tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
idx2tag = {i: t for t, i in tag2idx.items()}

# Convert tokens/tags to IDs
X = [[word2idx.get(w, word2idx["UNK"]) for w, _ in s] for s in sentences]
y = [[tag2idx[t] for _, t in s] for s in sentences]

# Pad sequences
max_len = 100
X = pad_sequences(X, maxlen=max_len, padding='post')
y = pad_sequences(y, maxlen=max_len, padding='post')

# Replace OOV tokens: any id outside the embedding table is mapped to UNK
X = np.where(X >= len(word2idx), word2idx["UNK"], X)

# One-hot encode tags
y = np.array([to_categorical(seq, num_classes=len(tag2idx)) for seq in y])

# Define model architecture: embedding -> BiLSTM -> per-token dense + softmax.
# `input_length` is omitted: the sequence length is already fixed by the Input
# layer, and Keras 3 deprecates that argument.
input_layer = Input(shape=(max_len,))
x = Embedding(input_dim=len(word2idx), output_dim=64, mask_zero=True)(input_layer)
x = Bidirectional(LSTM(units=64, return_sequences=True))(x)
x = TimeDistributed(Dense(len(tag2idx)))(x)
output_layer = Activation('softmax')(x)

model = Model(input_layer, output_layer)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

# Train model (the last 10% of the samples are held out for validation)
history = model.fit(X, y, batch_size=32, epochs=5, validation_split=0.1)

# Optional: Save model
model.save("ner_bilstm_softmax.h5")

# Save vocab/tag dicts if needed (joblib is imported in the notebook's import cell)
joblib.dump(word2idx, "word2idx.pkl")
joblib.dump(tag2idx, "tag2idx.pkl")




Epoch 1/5




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 3s/step - accuracy: 0.4601 - loss: 2.1848 - val_accuracy: 0.0400 - val_loss: 2.1378
Epoch 2/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4s/step - accuracy: 0.0449 - loss: 2.1169 - val_accuracy: 0.0400 - val_loss: 2.0549
Epoch 3/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.0449 - loss: 2.0172 - val_accuracy: 0.0400 - val_loss: 1.9245
Epoch 4/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 2s/step - accuracy: 0.0435 - loss: 1.8659 - val_accuracy: 0.0400 - val_loss: 1.7241
Epoch 5/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3s/step - accuracy: 0.0440 - loss: 1.6281 - val_accuracy: 0.0400 - val_loss: 1.4665




['tag2idx.pkl']

### A.3) NER Test Set Evaluation
- Prediction & Padding: Converts test data into indexed, padded format and runs model predictions.

- Label Filtering: Removes PAD tokens and maps predictions/labels back to tag names.

- Evaluation: Uses classification_report and shows sample correct/misclassified tokens.

In [5]:

# Load test data
df_test = pd.read_csv("NER-test.tsv", sep="\t")
df_test["sentence_id"] = df_test["sentence_id"].ffill()

# Group into sentences: one list of (token, tag) pairs per sentence_id
test_sentences = []
for _, group in df_test.groupby("sentence_id"):
    sent = list(zip(group["token"].values, group["BIO_NER_tag"].values))
    test_sentences.append(sent)

# Convert test data to indexed format.
# Words missing from the training vocabulary become UNK; tags missing from
# tag2idx become 0 (PAD) and are therefore excluded from the evaluation below.
X_test = [[word2idx.get(w, word2idx["UNK"]) for w, _ in s] for s in test_sentences]
y_true = [[tag2idx.get(t, 0) for _, t in s] for s in test_sentences]

# Pad sequences to the model's input length.
# truncating='post' keeps the FIRST max_len tokens of an over-long sentence so
# that position j of the padded arrays still corresponds to token j of the
# sentence (the default 'pre' would drop the leading tokens and shift them).
X_test = pad_sequences(X_test, maxlen=max_len, padding='post', truncating='post')
y_true = pad_sequences(y_true, maxlen=max_len, padding='post', truncating='post')

# Predict: take the most probable tag id at every position
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=-1)

# Flatten tokens, gold tags and predicted tags in a single pass, skipping PAD.
# Iterating over at most max_len tokens keeps the index inside the padded arrays.
tokens_flat = []
y_true_flat = []
y_pred_flat = []

for i, sent in enumerate(test_sentences):
    for j, (token, _) in enumerate(sent[:max_len]):
        true_tag_idx = y_true[i][j]
        if true_tag_idx != 0:  # skip PAD
            tokens_flat.append(token)
            y_true_flat.append(idx2tag[true_tag_idx])
            y_pred_flat.append(idx2tag[y_pred[i][j]])

# Evaluate. zero_division=0 reports 0.0 for tags that were never predicted
# instead of emitting an UndefinedMetricWarning for each of them.
print("\n NER Evaluation Report:")
print(classification_report(y_true_flat, y_pred_flat, digits=3, zero_division=0))

# Optional: Show some correct and incorrect examples
comparison_df = pd.DataFrame({
    "Token": tokens_flat,
    "Expected": y_true_flat,
    "Predicted": y_pred_flat
})

print("\n Correct Predictions:")
print(comparison_df[comparison_df["Expected"] == comparison_df["Predicted"]].head(10).to_string(index=False))

print("\n Misclassified Predictions:")
print(comparison_df[comparison_df["Expected"] != comparison_df["Predicted"]].head(10).to_string(index=False))




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step

 NER Evaluation Report:
               precision    recall  f1-score   support

   B-LOCATION      0.000     0.000     0.000         3
        B-ORG      0.000     0.000     0.000         8
     B-PERSON      0.000     0.000     0.000        12
B-WORK_OF_ART      0.000     0.000     0.000         6
   I-LOCATION      0.000     0.000     0.000         2
     I-PERSON      0.000     0.000     0.000        13
I-WORK_OF_ART      0.000     0.000     0.000         8
            O      0.754     1.000     0.859       159

     accuracy                          0.754       211
    macro avg      0.094     0.125     0.107       211
 weighted avg      0.568     0.754     0.648       211


 Correct Predictions:
   Token Expected Predicted
      If        O         O
  you're        O         O
visiting        O         O
       ,        O         O
    make        O         O
    sure        O         O
      to        O      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Expected vs. Predicted: NERC Model

In [6]:
# Assemble the token-level comparison table from the flattened evaluation lists
comparison_columns = {
    "Token": tokens_flat,
    "Expected": y_true_flat,
    "Predicted": y_pred_flat,
}
comparison_df = pd.DataFrame(comparison_columns)

# Preview only the first 20 rows to keep the cell output short
preview_text = comparison_df.head(20).to_string(index=False)
print("\n Expected vs Predicted:")
print(preview_text)

# Persist the complete table as a tab-separated file
comparison_df.to_csv("ner_predictions_comparison.tsv", sep="\t", index=False)



 Expected vs Predicted:
   Token      Expected Predicted
      If             O         O
  you're             O         O
visiting             O         O
   Paris    B-LOCATION         O
       ,             O         O
    make             O         O
    sure             O         O
      to             O         O
     see             O         O
     the             O         O
  Louvre         B-ORG         O
       ,             O         O
      as             O         O
    they             O         O
 exhibit             O         O
     the             O         O
    Mona B-WORK_OF_ART         O
    Lisa I-WORK_OF_ART         O
       !             O         O
  Amazon         B-ORG         O


## B.1.1) Sentiment Analysis: VADER - Training Set
- Topic-Specific Models: Trains one logistic regression model per topic (e.g., sports, movie).

- Feature Fusion: Combines TF-IDF with VADER scores and spaCy syntactic features.

- Saved Components: Exports model, vectorizer, and label encoder for each topic separately.

In [None]:
# Load NLP tools
nlp = spacy.load("en_core_web_sm")
analyzer = SentimentIntensityAnalyzer()

# Load and prepare dataset: drop incomplete rows, lower-case text and labels
df = pd.read_csv("sentiment_training_set.csv")
df.dropna(subset=["sentence", "sentiment", "topic"], inplace=True)
df["sentence"] = df["sentence"].str.lower()
df["sentiment"] = df["sentiment"].str.lower()
df["topic"] = df["topic"].str.lower()

# The output directory does not depend on the topic, so create it once
os.makedirs("models", exist_ok=True)

# NOTE(review): the B.2.1 cell trains the same TF-IDF + VADER + spaCy
# logistic-regression pipeline and only writes to "models_sklearn/" instead;
# confirm whether two genuinely different models were intended.

# Training per topic
for topic in df["topic"].unique():
    subset = df[df["topic"] == topic]
    texts = subset["sentence"].tolist()
    labels = subset["sentiment"].tolist()

    # Label encode sentiments
    le = LabelEncoder()
    y = le.fit_transform(labels)

    # TF-IDF over word unigrams and bigrams
    vectorizer = TfidfVectorizer(min_df=1, max_features=5000, ngram_range=(1, 2), analyzer='word')
    X_tfidf = vectorizer.fit_transform(texts)

    # VADER compound score per sentence
    vader_scores = [analyzer.polarity_scores(text)["compound"] for text in texts]

    # spaCy counts per sentence: adjectives, adverbs, entities, nominal subjects.
    # nlp.pipe processes the texts as a stream instead of one call per sentence.
    adjs, advs, ents, nsubs = [], [], [], []
    for doc in nlp.pipe(texts):
        adjs.append(sum(1 for t in doc if t.pos_ == "ADJ"))
        advs.append(sum(1 for t in doc if t.pos_ == "ADV"))
        ents.append(len(doc.ents))
        nsubs.append(sum(1 for t in doc if t.dep_ == "nsubj"))

    # Combine features: TF-IDF columns followed by the five hand-crafted columns.
    # The test cell must stack them in exactly this order.
    X_all = hstack([
        X_tfidf,
        csr_matrix(np.array(vader_scores).reshape(-1, 1)),
        csr_matrix(np.array(adjs).reshape(-1, 1)),
        csr_matrix(np.array(advs).reshape(-1, 1)),
        csr_matrix(np.array(ents).reshape(-1, 1)),
        csr_matrix(np.array(nsubs).reshape(-1, 1))
    ])

    # Train model
    clf = LogisticRegression(max_iter=2000, class_weight='balanced')
    clf.fit(X_all, y)

    # Save model, vectorizer, and label encoder
    joblib.dump(clf, f"models/{topic}_model.pkl")
    joblib.dump(vectorizer, f"models/{topic}_vectorizer.pkl")
    joblib.dump(le, f"models/{topic}_label_encoder.pkl")

    print(f" Model trained and saved for topic: {topic}")


 Model trained and saved for topic: sports
 Model trained and saved for topic: book
 Model trained and saved for topic: movie


## B.1.2) Sentiment Analysis: VADER - Test Set
- Topic-Based Inference: Loads and evaluates a separate model for each topic.

- Rich Features: Uses TF-IDF + VADER + syntactic features for predictions.

- Detailed Evaluation: Outputs per-topic reports, accuracy, and lists correct/misclassified sentences.

In [None]:
# Load NLP tools
nlp = spacy.load("en_core_web_sm")
analyzer = SentimentIntensityAnalyzer()

# Load test data: drop incomplete rows, lower-case text and labels
df_test = pd.read_csv("sentiment-topic-test.tsv", sep="\t")
df_test.dropna(subset=["sentence", "sentiment", "topic"], inplace=True)
df_test["sentence"] = df_test["sentence"].str.lower()
df_test["sentiment"] = df_test["sentiment"].str.lower()
df_test["topic"] = df_test["topic"].str.lower()

# Storage for predictions and results
all_true = []
all_pred = []
all_results = []

# Topic-wise prediction loop
for topic in df_test["topic"].unique():
    subset = df_test[df_test["topic"] == topic]
    texts = subset["sentence"].tolist()
    true_labels = subset["sentiment"].tolist()

    # Load the classifier, vectorizer and label encoder saved for this topic
    clf = joblib.load(f"models/{topic}_model.pkl")
    vectorizer = joblib.load(f"models/{topic}_vectorizer.pkl")
    le = joblib.load(f"models/{topic}_label_encoder.pkl")

    # Vectorize with the vocabulary fitted at training time
    X_tfidf = vectorizer.transform(texts)

    # VADER compound score per sentence
    vader_scores = [analyzer.polarity_scores(text)["compound"] for text in texts]

    # spaCy counts per sentence; nlp.pipe streams the texts through the pipeline
    adjs, advs, ents, nsubs = [], [], [], []
    for doc in nlp.pipe(texts):
        adjs.append(sum(1 for t in doc if t.pos_ == "ADJ"))
        advs.append(sum(1 for t in doc if t.pos_ == "ADV"))
        ents.append(len(doc.ents))
        nsubs.append(sum(1 for t in doc if t.dep_ == "nsubj"))

    # Same column order as in training: TF-IDF, VADER, ADJ, ADV, entities, nsubj
    X_all = hstack([
        X_tfidf,
        csr_matrix(np.array(vader_scores).reshape(-1, 1)),
        csr_matrix(np.array(adjs).reshape(-1, 1)),
        csr_matrix(np.array(advs).reshape(-1, 1)),
        csr_matrix(np.array(ents).reshape(-1, 1)),
        csr_matrix(np.array(nsubs).reshape(-1, 1))
    ])

    # Predict
    y_true = le.transform(true_labels)
    y_pred = clf.predict(X_all)
    y_pred_labels = le.inverse_transform(y_pred)

    # Per-topic report.
    # `labels` pins the report to every class the encoder knows, so it still
    # works when a class is absent from both y_true and y_pred for this topic
    # (otherwise target_names and the detected classes differ in length).
    # zero_division=0 reports 0.0 for never-predicted classes without warnings.
    print(f"\n Topic: {topic}")
    print(classification_report(
        y_true,
        y_pred,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_,
        zero_division=0,
    ))

    # Store for aggregate report
    all_true.extend(y_true)
    all_pred.extend(y_pred)

    # Store individual results
    result = pd.DataFrame({
        "Topic": topic,
        "Sentence": texts,
        "Expected": true_labels,
        "Predicted": y_pred_labels
    })
    all_results.append(result)

# Combine all topic results
full_df = pd.concat(all_results, ignore_index=True)

# Calculate overall accuracy
print("\n Overall Accuracy:", accuracy_score(all_true, all_pred))

# Show correctly predicted samples only
correct_preds = full_df[full_df["Expected"] == full_df["Predicted"]]
print("\n Correctly Predicted Sentences:")
print(correct_preds.to_string(index=False))

# Show misclassified predictions
errors_df = full_df[full_df["Expected"] != full_df["Predicted"]]
print("\n Misclassified Sentences:")
print(errors_df.to_string(index=False))



 Topic: sports
              precision    recall  f1-score   support

    negative       0.50      1.00      0.67         2
     neutral       0.00      0.00      0.00         2
    positive       1.00      0.50      0.67         2

    accuracy                           0.50         6
   macro avg       0.50      0.50      0.44         6
weighted avg       0.50      0.50      0.44         6


 Topic: book
              precision    recall  f1-score   support

    negative       0.50      0.50      0.50         2
     neutral       0.00      0.00      0.00         2
    positive       0.50      0.50      0.50         2

    accuracy                           0.33         6
   macro avg       0.33      0.33      0.33         6
weighted avg       0.33      0.33      0.33         6


 Topic: movie
              precision    recall  f1-score   support

    negative       1.00      0.50      0.67         2
     neutral       0.50      1.00      0.67         2
    positive       1.00      0

### B.2.1) Sentiment Analysis - Model 2: Scikit-Learn - Training Set
- Per-Topic Models: Trains a separate logistic regression model for each topic.

- Hybrid Features: Combines TF-IDF with VADER scores and spaCy linguistic features.

- Exportable Pipeline: Saves model, vectorizer, and label encoder for easy reuse.

In [7]:
# Load training data: drop incomplete rows, lower-case text and labels
df = pd.read_csv("sentiment_training_set.csv")
df.dropna(subset=["sentence", "sentiment", "topic"], inplace=True)
df["sentence"] = df["sentence"].str.lower()
df["sentiment"] = df["sentiment"].str.lower()
df["topic"] = df["topic"].str.lower()

# Load NLP tools
nlp = spacy.load("en_core_web_sm")
analyzer = SentimentIntensityAnalyzer()

# Ensure model directory exists (done once; it does not depend on the topic)
os.makedirs("models_sklearn", exist_ok=True)

# Train one model per topic
for topic in df["topic"].unique():
    topic_df = df[df["topic"] == topic]
    texts = topic_df["sentence"].tolist()
    labels = topic_df["sentiment"].tolist()

    # Encode labels
    le = LabelEncoder()
    y = le.fit_transform(labels)

    # TF-IDF over word unigrams and bigrams
    vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
    X_tfidf = vectorizer.fit_transform(texts)

    # VADER compound score per sentence
    vader_scores = [analyzer.polarity_scores(text)["compound"] for text in texts]

    # spaCy counts per sentence: adjectives, adverbs, entities, nominal subjects.
    # nlp.pipe processes the texts as a stream instead of one call per sentence.
    adjs, advs, ents, nsubs = [], [], [], []
    for doc in nlp.pipe(texts):
        adjs.append(sum(1 for t in doc if t.pos_ == "ADJ"))
        advs.append(sum(1 for t in doc if t.pos_ == "ADV"))
        ents.append(len(doc.ents))
        nsubs.append(sum(1 for t in doc if t.dep_ == "nsubj"))

    # Combine features: TF-IDF columns followed by the five hand-crafted columns.
    # The test cell must stack them in exactly this order.
    X_all = hstack([
        X_tfidf,
        csr_matrix(np.array(vader_scores).reshape(-1, 1)),
        csr_matrix(np.array(adjs).reshape(-1, 1)),
        csr_matrix(np.array(advs).reshape(-1, 1)),
        csr_matrix(np.array(ents).reshape(-1, 1)),
        csr_matrix(np.array(nsubs).reshape(-1, 1)),
    ])

    # Train classifier
    clf = LogisticRegression(max_iter=2000, class_weight='balanced')
    clf.fit(X_all, y)

    # Save model and preprocessing tools
    joblib.dump(clf, f"models_sklearn/{topic}_model.pkl")
    joblib.dump(vectorizer, f"models_sklearn/{topic}_vectorizer.pkl")
    joblib.dump(le, f"models_sklearn/{topic}_label_encoder.pkl")


    print(f" Trained and saved Scikit-Learn model for topic: {topic}")


 Trained and saved Scikit-Learn model for topic: sports
 Trained and saved Scikit-Learn model for topic: book
 Trained and saved Scikit-Learn model for topic: movie


### B.2.2) Sentiment Analysis - Model 2: Scikit-Learn - Test Set
- Topic-Based Evaluation: Loads topic-specific models to classify test sentences.

- Feature Consistency: Reuses TF-IDF and VADER + spaCy features for prediction.

- Detailed Output: Prints per-topic metrics and lists correct/misclassified examples.

In [8]:
# Load NLP tools
nlp = spacy.load("en_core_web_sm")
analyzer = SentimentIntensityAnalyzer()

# Load test data: drop incomplete rows, lower-case text and labels
df_test = pd.read_csv("sentiment-topic-test.tsv", sep="\t")
df_test.dropna(subset=["sentence", "sentiment", "topic"], inplace=True)
df_test["sentence"] = df_test["sentence"].str.lower()
df_test["sentiment"] = df_test["sentiment"].str.lower()
df_test["topic"] = df_test["topic"].str.lower()

all_true, all_pred, all_results = [], [], []

for topic in df_test["topic"].unique():
    subset = df_test[df_test["topic"] == topic]
    texts = subset["sentence"].tolist()
    true_labels = subset["sentiment"].tolist()

    # Load the classifier, vectorizer and label encoder saved for this topic
    clf = joblib.load(f"models_sklearn/{topic}_model.pkl")
    vectorizer = joblib.load(f"models_sklearn/{topic}_vectorizer.pkl")
    le = joblib.load(f"models_sklearn/{topic}_label_encoder.pkl")

    # TF-IDF with the vocabulary fitted at training time
    X_tfidf = vectorizer.transform(texts)

    # VADER compound score per sentence
    vader_scores = [analyzer.polarity_scores(text)["compound"] for text in texts]

    # spaCy counts per sentence; nlp.pipe streams the texts through the pipeline
    adjs, advs, ents, nsubs = [], [], [], []
    for doc in nlp.pipe(texts):
        adjs.append(sum(1 for t in doc if t.pos_ == "ADJ"))
        advs.append(sum(1 for t in doc if t.pos_ == "ADV"))
        ents.append(len(doc.ents))
        nsubs.append(sum(1 for t in doc if t.dep_ == "nsubj"))

    # Same column order as in training: TF-IDF, VADER, ADJ, ADV, entities, nsubj
    X_all = hstack([
        X_tfidf,
        csr_matrix(np.array(vader_scores).reshape(-1, 1)),
        csr_matrix(np.array(adjs).reshape(-1, 1)),
        csr_matrix(np.array(advs).reshape(-1, 1)),
        csr_matrix(np.array(ents).reshape(-1, 1)),
        csr_matrix(np.array(nsubs).reshape(-1, 1)),
    ])

    y_true = le.transform(true_labels)
    y_pred = clf.predict(X_all)
    y_pred_labels = le.inverse_transform(y_pred)

    # `labels` pins the report to every class the encoder knows, so it still
    # works when a class is absent from both y_true and y_pred for this topic
    # (otherwise target_names and the detected classes differ in length).
    # zero_division=0 reports 0.0 for never-predicted classes without warnings.
    print(f"\n Topic: {topic}")
    print(classification_report(
        y_true,
        y_pred,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_,
        zero_division=0,
    ))

    all_true.extend(y_true)
    all_pred.extend(y_pred)

    result = pd.DataFrame({
        "Topic": topic,
        "Sentence": texts,
        "Expected": true_labels,
        "Predicted": y_pred_labels
    })
    all_results.append(result)

# Merge all predictions
full_df = pd.concat(all_results, ignore_index=True)

# Accuracy
print("\n Overall Accuracy:", accuracy_score(all_true, all_pred))

#  Correct predictions
correct_df = full_df[full_df["Expected"] == full_df["Predicted"]]
print("\n Correctly Predicted Sentences:")
print(correct_df.to_string(index=False))

#  Misclassified predictions
errors_df = full_df[full_df["Expected"] != full_df["Predicted"]]
print("\n Misclassified Sentences:")
print(errors_df.to_string(index=False))



 Topic: sports
              precision    recall  f1-score   support

    negative       0.50      1.00      0.67         2
     neutral       0.00      0.00      0.00         2
    positive       1.00      0.50      0.67         2

    accuracy                           0.50         6
   macro avg       0.50      0.50      0.44         6
weighted avg       0.50      0.50      0.44         6


 Topic: book
              precision    recall  f1-score   support

    negative       0.50      0.50      0.50         2
     neutral       0.00      0.00      0.00         2
    positive       0.50      0.50      0.50         2

    accuracy                           0.33         6
   macro avg       0.33      0.33      0.33         6
weighted avg       0.33      0.33      0.33         6


 Topic: movie
              precision    recall  f1-score   support

    negative       1.00      0.50      0.67         2
     neutral       0.50      1.00      0.67         2
    positive       1.00      0

### C.1) Topic Analysis Setup
- Dataset Source: Loads a subset of the 20 Newsgroups dataset (4 categories).

- Text Cleaning: Removes headers, footers, and quotes to keep raw text only.

- Structured Format: Stores data in pandas DataFrames for easier preprocessing and modeling.

In [None]:
# Restrict the 20 Newsgroups corpus to four categories
categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'sci.space']

# Options shared by both splits: strip headers/footers/quotes and fix the seed
newsgroups_options = dict(
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
    random_state=42,
)
newsgroups_train = fetch_20newsgroups(subset='train', **newsgroups_options)
newsgroups_test = fetch_20newsgroups(subset='test', **newsgroups_options)

# Wrap each split in a two-column DataFrame: document text and integer label
df_train = pd.DataFrame(dict(text=newsgroups_train.data, labels=newsgroups_train.target))
df_test = pd.DataFrame(dict(text=newsgroups_test.data, labels=newsgroups_test.target))

### C.2) Topic Analysis: RoBERTa Model
- Rule-Based Labeling: Assigns topics to news headlines using keyword heuristics.

- Balanced Training: Creates a uniform dataset across six topic classes via sampling.

- RoBERTa Fine-Tuning: Trains and evaluates a transformer model with early stopping and stratified validation.

In [None]:
# Load the ABC news headlines CSV; the labelling step below reads its
# "headline_text" column. This rebinds `df`, replacing any earlier frame.
df = pd.read_csv("abcnews-date-text.csv")

# Expanded label rules (now includes 'book', 'movie')
def label_topic(text):
    """Assign a coarse topic label to a headline using keyword rules.

    The input is converted with ``str()`` and lower-cased, then checked against
    the rule groups below in order; the first group with any keyword occurring
    as a *substring* of the text wins. Because matching is by substring, a word
    such as "transport" triggers the "sport" keyword.

    Args:
        text: Headline text; any object is accepted and stringified.

    Returns:
        One of "sports", "politics", "environment", "book", "movie", or
        "other" when no keyword matches.
    """
    lowered = str(text).lower()

    # Order matters: earlier topics take precedence over later ones.
    keyword_rules = (
        ("sports", ("sport", "match", "olympic")),
        ("politics", ("election", "parliament", "minister", "vote")),
        ("environment", ("climate", "weather", "fire", "storm")),
        ("book", ("novel", "author", "book")),
        ("movie", ("film", "screenplay", "movie", "director")),
    )

    for topic_name, keywords in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return topic_name
    return "other"

# Label every headline with the keyword heuristic
df["label"] = df["headline_text"].apply(label_topic)

# Label encoding
label_list = ['environment', 'other', 'politics', 'sports', 'book', 'movie']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}
df["labels"] = df["label"].map(label2id)

# Balance dataset: cap every class at `min_count` rows, sampled WITHOUT
# replacement. Classes with `min_count` rows or fewer are kept unchanged here
# and are only oversampled after the split (see below).
dfs = []
min_count = 500  # number per class (adjust if needed)
eval_fraction = 0.2  # share of the balanced data held out for evaluation

for label in label_list:
    class_df = df[df["label"] == label]
    if len(class_df) > min_count:
        class_df = class_df.sample(n=min_count, random_state=42)
    dfs.append(class_df)

df_balanced = pd.concat(dfs)
df_model = df_balanced[["headline_text", "labels"]].rename(columns={"headline_text": "text"})

#  Train-test split (stratified so each class keeps its share in both parts)
train_df, eval_df = train_test_split(
    df_model, test_size=eval_fraction, stratify=df_model["labels"], random_state=42
)

# Oversample under-represented classes in the TRAINING part only.
# Resampling with replacement before the split would place copies of the same
# headline in both train and eval and inflate the evaluation scores.
train_target = round(min_count * (1 - eval_fraction))
train_counts = train_df["labels"].value_counts()
if (train_counts < train_target).any():
    train_df = pd.concat([
        class_train
        if len(class_train) >= train_target
        else resample(class_train, replace=True, n_samples=train_target, random_state=42)
        for _, class_train in train_df.groupby("labels")
    ])

#  Configure RoBERTa
model_args = ClassificationArgs()
model_args.num_train_epochs = 5
model_args.train_batch_size = 32
model_args.learning_rate = 4e-6
model_args.max_seq_length = 256
model_args.evaluate_during_training = True
model_args.use_early_stopping = True
model_args.early_stopping_patience = 3
model_args.overwrite_output_dir = True
model_args.output_dir = "roberta_expanded_model/"
model_args.save_model_every_epoch = False
model_args.manual_seed = 42  # fixed seed so repeated runs are comparable

#  Build and train model
model = ClassificationModel(
    model_type="roberta",
    model_name="roberta-base",
    num_labels=len(label2id),
    args=model_args,
    use_cuda=torch.cuda.is_available()
)

print(" Training RoBERTa on expanded topics...")
model.train_model(train_df, eval_df=eval_df)
print(" Training complete!")

# Evaluate on the held-out split
eval_texts = eval_df["text"].tolist()
true_labels = eval_df["labels"].tolist()
predictions, _ = model.predict(eval_texts)

# `labels` pins the report to all six class ids so that target_names always
# lines up, even if a class is missing from the evaluation split;
# zero_division=0 reports 0.0 for such classes without warnings.
print(" Classification Report:\n")
print(classification_report(
    true_labels,
    predictions,
    labels=list(range(len(label_list))),
    target_names=label_list,
    zero_division=0,
))


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🚀 Training RoBERTa on expanded topics...


  0%|          | 0/4 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 5:   0%|          | 0/75 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 5:   0%|          | 0/75 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 3 of 5:   0%|          | 0/75 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 4 of 5:   0%|          | 0/75 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 5 of 5:   0%|          | 0/75 [00:00<?, ?it/s]

  with amp.autocast():


  0%|          | 0/1 [00:00<?, ?it/s]

  with amp.autocast():


✅ Training complete!


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  with amp.autocast():


📊 Classification Report:

              precision    recall  f1-score   support

 environment       0.88      0.95      0.91       100
       other       0.62      0.46      0.53       100
    politics       0.86      0.95      0.90       100
      sports       0.85      0.81      0.83       100
        book       0.70      0.73      0.72       100
       movie       0.92      0.99      0.95       100

    accuracy                           0.81       600
   macro avg       0.80      0.81      0.81       600
weighted avg       0.80      0.81      0.81       600



### C.3) RoBERTa Model: Test Set Analysis
- Topic Prediction: Applies a fine-tuned RoBERTa model to classify topics from sentence text.

- Label Mapping: Uses consistent label-to-ID mappings for both prediction and evaluation.

- Evaluation Report: Prints detailed classification metrics and saves predictions to file.

In [None]:

#  Load test file
df = pd.read_csv("sentiment-topic-test.tsv", sep="\t")

#  Normalize and rename columns
df = df.rename(columns={"sentence": "text", "topic": "true_topic"})
# Rows without sentence text cannot be passed to the model, so drop them first
df = df.dropna(subset=["text"])
df["true_topic"] = df["true_topic"].str.lower().str.strip()

#  Define expanded label list (must match the order used during training)
label_list = ['environment', 'other', 'politics', 'sports', 'book', 'movie']
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

#  Load fine-tuned RoBERTa model from the training output directory
model = ClassificationModel(
    "roberta",
    "roberta_expanded_model/",
    use_cuda=torch.cuda.is_available()
)

#  Predict topics
texts = df["text"].tolist()
preds, _ = model.predict(texts)
df["predicted_id"] = preds
df["predicted_topic"] = df["predicted_id"].map(id2label)

#  Evaluate only on rows whose true topic is one of the trained labels
df_eval = df[df["true_topic"].isin(label_list)].copy()
df_eval["true_id"] = df_eval["true_topic"].map(label2id)

#  Safely determine all classes present in either ground truth or prediction
present_ids = sorted(list(set(df_eval["true_id"].unique()) | set(df_eval["predicted_id"].unique())))
present_labels = [id2label[i] for i in present_ids]

#  Print classification report.
#  zero_division=0 reports 0.0 for classes that occur only in the predictions
#  (or only in the ground truth) instead of emitting UndefinedMetricWarning.
print(" Classification Report (Topics present in test set or predictions):\n")
print(classification_report(
    df_eval["true_id"],
    df_eval["predicted_id"],
    labels=present_ids,
    target_names=present_labels,
    zero_division=0
))

#  Save all predictions (including rows excluded from the report)
df.to_csv("sentiment_topic_test_with_predictions.csv", index=False)
print(" Saved to: sentiment_topic_test_with_predictions.csv")


0it [00:00, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

📊 Classification Report (Topics present in test set or predictions):

              precision    recall  f1-score   support

       other       0.00      0.00      0.00         0
      sports       1.00      1.00      1.00         6
        book       1.00      0.67      0.80         6
       movie       0.71      0.83      0.77         6

    accuracy                           0.83        18
   macro avg       0.68      0.62      0.64        18
weighted avg       0.90      0.83      0.86        18

✅ Saved to: sentiment_topic_test_with_predictions.csv


  with amp.autocast():
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
