In [2]:
pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [3]:
#  Step 1: Import Libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
import optuna  # For hyperparameter tuning
import warnings
warnings.filterwarnings("ignore")

#  Step 2: Load Dataset
df = pd.read_csv("output_chunk_4.csv")
df["text"] = df["text"].astype(str)  # Ensure text is string

#  Step 3: Data Preprocessing
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download('punkt_tab') # Download the punkt_tab resource

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-z\s]", "", text)  # Remove special characters
    tokens = word_tokenize(text)  # Tokenization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Lemmatization & Stopword removal
    return " ".join(tokens)

df["clean_text"] = df["text"].apply(preprocess_text)

#  Step 4: Train-Test Split
X = df["clean_text"]
y = df["category"]

# Split into Training (80%) and Temporary set (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Split Temporary set into Validation (18%) and Test (2%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.1, stratify=y_temp, random_state=42)

#  Step 5: Feature Extraction (TF-IDF)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features if needed
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

#  Step 6: Train and Evaluate Best Model

# Train SVM Model
svm = LinearSVC(random_state=42)
svm.fit(X_train_tfidf, y_train)

# Predict on Validation Set
y_val_pred = svm.predict(X_val_tfidf)

# Compute Accuracy and Full Classification Report for Each Category
validation_accuracy = accuracy_score(y_val, y_val_pred)
classification_results = classification_report(y_val, y_val_pred, digits=3)

print(f"\nValidation Accuracy: {validation_accuracy:.3f}")
print("\nValidation Classification Report:")
print(classification_results)

#  Step 7: Final Testing

# Predict on Test Set
y_test_pred = svm.predict(X_test_tfidf)

# Compute Accuracy and Full Classification Report for Each Category
test_accuracy = accuracy_score(y_test, y_test_pred)
test_results = classification_report(y_test, y_test_pred, digits=3)

print(f"\nTest Accuracy: {test_accuracy:.3f}")
print("\nFinal Test Classification Report:")
print(test_results)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



Validation Accuracy: 0.810

Validation Classification Report:
               precision    recall  f1-score   support

         arts      0.795     0.861     0.827        72
        crime      0.810     0.889     0.848        72
     disaster      0.785     0.861     0.821        72
      economy      0.760     0.792     0.776        72
    education      0.931     0.931     0.931        72
environmental      0.942     0.903     0.922        72
       health      0.867     0.903     0.884        72
humanInterest      0.767     0.778     0.772        72
       labour      0.845     0.833     0.839        72
    lifestyle      0.792     0.847     0.819        72
        other      0.295     0.181     0.224        72
     politics      0.955     0.889     0.921        72
     religion      0.678     0.556     0.611        72
      science      0.835     0.917     0.874        72
       social      0.919     0.792     0.851        72
        sport      0.776     0.917     0.841        72
 

In [4]:
from collections import Counter

# Function to extract top N words per category
def get_top_words_by_category(df, category_column, text_column, top_n=10):
    category_word_freq = {}

    for category in df[category_column].unique():
        text = " ".join(df[df[category_column] == category][text_column])
        words = text.split()
        word_counts = Counter(words)
        top_words = [word for word, _ in word_counts.most_common(top_n)]
        category_word_freq[category] = top_words

    return category_word_freq

# Extract top words per category
top_words_by_category = get_top_words_by_category(df, "category", "clean_text")

# Print the most frequent words per category
for category, words in top_words_by_category.items():
    print(f"{category}: {words}")


arts: ['music', 'said', 'year', 'group', 'new', 'one', 'time', 'people', 'june', 'like']
crime: ['said', 'court', 'crime', 'war', 'fraud', 'year', 'corruption', 'state', 'international', 'also']
disaster: ['said', 'police', 'year', 'car', 'vehicle', 'road', 'driver', 'crash', 'man', 'june']
economy: ['market', 'year', 'said', 'price', 'rate', 'also', 'new', 'company', 'inflation', 'food']
education: ['university', 'student', 'degree', 'said', 'year', 'college', 'education', 'state', 'program', 'also']
environmental: ['waste', 'recycling', 'plastic', 'said', 'energy', 'market', 'year', 'management', 'new', 'company']
health: ['said', 'health', 'patient', 'care', 'covid', 'therapy', 'case', 'new', 'infection', 'people']
humanInterest: ['size', 'win', 'acc', 'ownership', 'statement', 'kb', 'act', 'beneficial', 'ceremony', 'said']
labour: ['health', 'work', 'security', 'job', 'said', 'employee', 'people', 'year', 'also', 'service']
lifestyle: ['home', 'life', 'said', 'time', 'one', 'year',

In [5]:
import numpy as np

# Function to create rule-based features
def create_rule_based_features(text, top_words_by_category):
    feature_vector = np.zeros(len(top_words_by_category))  # One feature per category
    words = set(text.split())  # Unique words in the text

    for idx, (category, top_words) in enumerate(top_words_by_category.items()):
        if any(word in words for word in top_words):  # If a top word is present
            feature_vector[idx] = 1  # Mark feature as present

    return feature_vector

# Apply the function to create features
rule_based_features = np.array([create_rule_based_features(text, top_words_by_category) for text in df["clean_text"]])


In [13]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# 1. Standardize Rule-Based Features (Important for SVM)
# Apply the function to create features for the training set
rule_based_features_train = np.array([create_rule_based_features(text, top_words_by_category) for text in X_train])
rule_based_features_val = np.array([create_rule_based_features(text, top_words_by_category) for text in X_val])
rule_based_features_test = np.array([create_rule_based_features(text, top_words_by_category) for text in X_test])

scaler = StandardScaler()
rule_based_features_scaled_train = scaler.fit_transform(rule_based_features_train)
rule_based_features_scaled_val = scaler.transform(rule_based_features_val) # Use transform, not fit_transform for val and test
rule_based_features_scaled_test = scaler.transform(rule_based_features_test)

# 2. Combine Features (TF-IDF + Rule-Based)
X_train_combined = np.hstack([X_train_tfidf.toarray(), rule_based_features_scaled_train])
X_val_combined = np.hstack([X_val_tfidf.toarray(), rule_based_features_scaled_val])
X_test_combined = np.hstack([X_test_tfidf.toarray(), rule_based_features_scaled_test])


# 3. Train SVM Model
svm_model = SVC(kernel='linear', random_state=42)  # You can try other kernels like 'rbf'
svm_model.fit(X_train_combined, y_train)

# 4. Predict and Evaluate on Validation Set
y_val_pred = svm_model.predict(X_val_combined)
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.3f}")
print(classification_report(y_val, y_val_pred))

# 5. Predict and Evaluate on Test Set
y_test_pred = svm_model.predict(X_test_combined)
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {accuracy:.3f}")
print(classification_report(y_test, y_test_pred))

Validation Accuracy: 0.757
               precision    recall  f1-score   support

         arts       0.74      0.86      0.79        72
        crime       0.78      0.81      0.79        72
     disaster       0.71      0.83      0.77        72
      economy       0.70      0.76      0.73        72
    education       0.85      0.85      0.85        72
environmental       0.95      0.83      0.89        72
       health       0.89      0.81      0.85        72
humanInterest       0.71      0.69      0.70        72
       labour       0.83      0.82      0.83        72
    lifestyle       0.80      0.79      0.80        72
        other       0.25      0.29      0.27        72
     politics       0.87      0.82      0.84        72
     religion       0.65      0.56      0.60        72
      science       0.75      0.75      0.75        72
       social       0.91      0.74      0.82        72
        sport       0.72      0.83      0.77        72
       unrest       0.78      0.68   

In [15]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# 1. Create and standardize rule-based features for train, val, and test sets:

# a. Apply the create_rule_based_features function to each set:
rule_based_features_train = np.array([create_rule_based_features(text, top_words_by_category) for text in X_train])
rule_based_features_val = np.array([create_rule_based_features(text, top_words_by_category) for text in X_val])
rule_based_features_test = np.array([create_rule_based_features(text, top_words_by_category) for text in X_test])

# b. Standardize the rule-based features:
scaler = StandardScaler()
rule_based_features_scaled_train = scaler.fit_transform(rule_based_features_train)
rule_based_features_scaled_val = scaler.transform(rule_based_features_val)  # Use transform, not fit_transform for val and test
rule_based_features_scaled_test = scaler.transform(rule_based_features_test)

# 2. Combine Features (TF-IDF + Rule-Based):
X_train_combined = np.hstack([X_train_tfidf.toarray(), rule_based_features_scaled_train])
X_val_combined = np.hstack([X_val_tfidf.toarray(), rule_based_features_scaled_val])
X_test_combined = np.hstack([X_test_tfidf.toarray(), rule_based_features_scaled_test])

# 3. Train SVM Model:
svm_model = SVC(kernel='linear', random_state=42)  # You can try other kernels like 'rbf'
svm_model.fit(X_train_combined, y_train)

# 4. Predict and Evaluate on Validation Set:
y_val_pred = svm_model.predict(X_val_combined)
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.3f}")
print(classification_report(y_val, y_val_pred))

# 5. Predict and Evaluate on Test Set:
y_test_pred = svm_model.predict(X_test_combined)
accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy: {accuracy:.3f}")
print(classification_report(y_test, y_test_pred))

Validation Accuracy: 0.757
               precision    recall  f1-score   support

         arts       0.74      0.86      0.79        72
        crime       0.78      0.81      0.79        72
     disaster       0.71      0.83      0.77        72
      economy       0.70      0.76      0.73        72
    education       0.85      0.85      0.85        72
environmental       0.95      0.83      0.89        72
       health       0.89      0.81      0.85        72
humanInterest       0.71      0.69      0.70        72
       labour       0.83      0.82      0.83        72
    lifestyle       0.80      0.79      0.80        72
        other       0.25      0.29      0.27        72
     politics       0.87      0.82      0.84        72
     religion       0.65      0.56      0.60        72
      science       0.75      0.75      0.75        72
       social       0.91      0.74      0.82        72
        sport       0.72      0.83      0.77        72
       unrest       0.78      0.68   

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import torch
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Download NLTK dependencies
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
df = pd.read_csv("output_chunk_4.csv")  # Update path

# Split data
X = df["text"]
y = df["category"]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.1, stratify=y_temp, random_state=42)

# -----------------------------------------------
# Function to preprocess text
# -----------------------------------------------
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return text

X_train = X_train.apply(preprocess_text)
X_val = X_val.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

# -----------------------------------------------
# 1. TF-IDF Embeddings
# -----------------------------------------------
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# -----------------------------------------------
# 2. RoBERTa Embeddings
# -----------------------------------------------
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
roberta_model = AutoModel.from_pretrained("roberta-base")

def get_roberta_embedding(text):
    inputs = roberta_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = roberta_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

X_train_roberta = np.array([get_roberta_embedding(text) for text in tqdm(X_train.tolist())])
X_val_roberta = np.array([get_roberta_embedding(text) for text in tqdm(X_val.tolist())])
X_test_roberta = np.array([get_roberta_embedding(text) for text in tqdm(X_test.tolist())])

# -----------------------------------------------
# 3. DistilBERT Embeddings
# -----------------------------------------------
distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
distilbert_model = AutoModel.from_pretrained("distilbert-base-uncased")

def get_distilbert_embedding(text):
    inputs = distilbert_tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = distilbert_model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

X_train_distilbert = np.array([get_distilbert_embedding(text) for text in tqdm(X_train.tolist())])
X_val_distilbert = np.array([get_distilbert_embedding(text) for text in tqdm(X_val.tolist())])
X_test_distilbert = np.array([get_distilbert_embedding(text) for text in tqdm(X_test.tolist())])






[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


FileNotFoundError: [Errno 2] No such file or directory: 'output_chunk_4.csv'

In [None]:
# -----------------------------------------------
# Stacked Classifier (SVM + XGBoost + Logistic Regression)
# -----------------------------------------------
base_models = [
    ("svm", SVC(probability=True, random_state=42)),
    ("xgb", XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.1, random_state=42)),
    ("lr", LogisticRegression(max_iter=500, random_state=42))
]

# Final Stacking Model
stacked_model = StackingClassifier(estimators=base_models, final_estimator=XGBClassifier())

# -----------------------------------------------
# GridSearchCV for Hyperparameter Optimization
# -----------------------------------------------
param_grid = {
    "final_estimator__n_estimators": [100, 200, 300],
    "final_estimator__max_depth": [6, 10, 14],
    "final_estimator__learning_rate": [0.01, 0.05, 0.1]
}

def train_evaluate_model(X_train_emb, X_val_emb, name):
    print(f"\nTraining Stacked Model with {name} embeddings...")

    grid_search = GridSearchCV(stacked_model, param_grid, cv=3, scoring="accuracy", verbose=2, n_jobs=-1)
    grid_search.fit(X_train_emb, y_train)

    best_model = grid_search.best_estimator_

    y_val_pred = best_model.predict(X_val_emb)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    print(f"\nBest Parameters for {name}: {grid_search.best_params_}")
    print(f"Validation Accuracy ({name}): {val_accuracy:.3f}")
    print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred))

# Train and Evaluate for each embedding type
train_evaluate_model(X_train_tfidf, X_val_tfidf, "TF-IDF")
train_evaluate_model(X_train_roberta, X_val_roberta, "RoBERTa")
train_evaluate_model(X_train_distilbert, X_val_distilbert, "DistilBERT")

In [None]:
# -----------------------------------------------
# Final Evaluation on Test Set
# -----------------------------------------------
def test_final_model(X_train_emb, X_val_emb, X_test_emb, name):
    print(f"\nFinal Testing with {name} embeddings...")

    stacked_model.fit(X_train_emb, y_train)
    y_test_pred = stacked_model.predict(X_test_emb)

    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"\nFinal Test Accuracy ({name}): {test_accuracy:.3f}")
    print("\nFinal Test Classification Report:\n", classification_report(y_test, y_test_pred))

# Test best model on test set for all embeddings
test_final_model(X_train_tfidf, X_val_tfidf, X_test_tfidf, "TF-IDF")
test_final_model(X_train_roberta, X_val_roberta, X_test_roberta, "RoBERTa")
test_final_model(X_train_distilbert, X_val_distilbert, X_test_distilbert, "DistilBERT")

NameError: name 'X_train_tfidf' is not defined

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import torch
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import classification_report, accuracy_score
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

# Download NLTK dependencies
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
df = pd.read_csv("output_chunk_4.csv")  # Update path

# Split data
X = df["text"]
y = df["category"]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.1, stratify=y_temp, random_state=42)

print(f"Dataset Loaded. Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Dataset Loaded. Train: 5760, Val: 1296, Test: 144


In [None]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    return text

X_train = X_train.apply(preprocess_text)
X_val = X_val.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

print("Text Preprocessing Complete ")


Text Preprocessing Complete 


In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 3))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

print(f"TF-IDF Embeddings Shape: {X_train_tfidf.shape}")


TF-IDF Embeddings Shape: (5760, 5000)


In [17]:
# save TF-IDF Embedding sepratelsy to use later
import joblib

joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")

# show path of saved file
print(f"TF-IDF Vectorizer saved to: tfidf_vectorizer.joblib")

TF-IDF Vectorizer saved to: tfidf_vectorizer.joblib


In [20]:
pip install AutoTokenizer

[31mERROR: Could not find a version that satisfies the requirement AutoTokenizer (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for AutoTokenizer[0m[31m
[0m

In [21]:

import AutoTokenizer


roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
roberta_model = AutoModel.from_pretrained("roberta-base")

def batch_roberta_embeddings(text_list, batch_size=32):
    all_embeddings = []
    for i in tqdm(range(0, len(text_list), batch_size), desc="Extracting RoBERTa Embeddings"):
        batch_texts = text_list[i:i+batch_size]
        inputs = roberta_tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = roberta_model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(batch_embeddings)
    return np.vstack(all_embeddings)

X_train_roberta = batch_roberta_embeddings(X_train.tolist())
X_val_roberta = batch_roberta_embeddings(X_val.tolist())
X_test_roberta = batch_roberta_embeddings(X_test.tolist())

print(f"RoBERTa Embeddings Shape: {X_train_roberta.shape}")


ModuleNotFoundError: No module named 'AutoTokenizer'

In [22]:
distilbert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
distilbert_model = AutoModel.from_pretrained("distilbert-base-uncased")

def batch_distilbert_embeddings(text_list, batch_size=32):
    all_embeddings = []
    for i in tqdm(range(0, len(text_list), batch_size), desc="Extracting DistilBERT Embeddings"):
        batch_texts = text_list[i:i+batch_size]
        inputs = distilbert_tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = distilbert_model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(batch_embeddings)
    return np.vstack(all_embeddings)

X_train_distilbert = batch_distilbert_embeddings(X_train.tolist())
X_val_distilbert = batch_distilbert_embeddings(X_val.tolist())
X_test_distilbert = batch_distilbert_embeddings(X_test.tolist())

print(f"DistilBERT Embeddings Shape: {X_train_distilbert.shape}")


NameError: name 'AutoTokenizer' is not defined

In [None]:
svm_model = SVC(kernel="linear", probability=True, random_state=42)
svm_model.fit(X_train_tfidf, y_train)

y_val_pred = svm_model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"SVM Validation Accuracy: {val_accuracy:.3f}")
print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred))


SVM Validation Accuracy: 0.762

Validation Classification Report:
                precision    recall  f1-score   support

         arts       0.74      0.88      0.80        72
        crime       0.82      0.75      0.78        72
     disaster       0.76      0.81      0.78        72
      economy       0.70      0.74      0.72        72
    education       0.91      0.89      0.90        72
environmental       0.94      0.82      0.87        72
       health       0.87      0.82      0.84        72
humanInterest       0.73      0.67      0.70        72
       labour       0.85      0.83      0.84        72
    lifestyle       0.72      0.85      0.78        72
        other       0.25      0.31      0.28        72
     politics       0.92      0.83      0.88        72
     religion       0.58      0.56      0.57        72
      science       0.80      0.82      0.81        72
       social       0.92      0.67      0.77        72
        sport       0.75      0.88      0.81        

In [None]:
base_models = [
    ("svm", SVC(kernel="linear", probability=True, random_state=42)),
    ("xgb", XGBClassifier(n_estimators=200, max_depth=10, learning_rate=0.1, random_state=42)),
    ("lr", LogisticRegression(max_iter=500, random_state=42))
]

stacked_model = StackingClassifier(estimators=base_models, final_estimator=SVC(kernel="linear", probability=True))

stacked_model.fit(X_train_tfidf, y_train)

y_val_pred = stacked_model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)

print(f"Stacked Model Validation Accuracy: {val_accuracy:.3f}")
print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred))


KeyboardInterrupt: 

In [None]:
y_test_pred = stacked_model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Final Test Accuracy: {test_accuracy:.3f}")
print("\nFinal Test Classification Report:\n", classification_report(y_test, y_test_pred))
