In [1]:
import os
import pickle
import re
import string

import joblib
import numpy as np
import pandas as pd

import nltk
# Fetch the NLTK resources this notebook relies on: tokenizer models for
# word_tokenize and the English stopword list. Newer NLTK releases load the
# tokenizer from "punkt_tab" rather than the pickled "punkt" models, so both
# are requested to keep a fresh environment working across NLTK versions.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from gensim.models import Word2Vec

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


[nltk_data] Downloading package punkt to C:\Users\Malathi
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Malathi
[nltk_data]     M\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Location of the zipped, tab-separated news dataset. Set the NEWS_DATA_PATH
# environment variable to run the notebook on another machine; the fallback is
# the location the notebook was originally written against.
DATA_PATH = os.environ.get(
    "NEWS_DATA_PATH",
    "C:/Users/Malathi M/OneDrive/Documents/MDTE25/guvi final project/Main project/news.tsv.zip",
)

df = pd.read_csv(DATA_PATH, sep="\t")

# Drop rows missing any of the three fields used below.
df = df.dropna(subset=["Headline", "News body", "Category"])

# "text" is headline + body; the headline alone is kept as "summary".
df["text"] = df["Headline"].astype(str) + " " + df["News body"].astype(str)
df["summary"] = df["Headline"].astype(str)
df = df.rename(columns={"Category": "category"})

# Use only 50% (low compute)
df = df.sample(frac=0.5, random_state=42).reset_index(drop=True)

df.head()


Unnamed: 0,News ID,category,Topic,Headline,News body,Title entity,Entity content,text,summary
0,N95333,news,newsus,This dog's smile will melt your heart,"Mocca lives in Yokohama, Japan, and is a Shiba...",{},{},This dog's smile will melt your heart Mocca li...,This dog's smile will melt your heart
1,N41910,lifestyle,shop-all,The Most Popular Walmart Item in Every State,What are the most oft-ordered Walmart products...,{'Walmart': 'Walmart'},"{'Walmart': {'type': 'item', 'id': 'Q18615334'...",The Most Popular Walmart Item in Every State W...,The Most Popular Walmart Item in Every State
2,N88506,finance,finance-real-estate,Photos: Look Glenn Close's 'Beanfield' estate ...,Emmy winning actress Glen Close listed her Bed...,"{""Glenn Close's"": 'Glenn Close', 'Bedford': 'B...","{'Glenn Close': {'type': 'item', 'id': 'Q37231...",Photos: Look Glenn Close's 'Beanfield' estate ...,Photos: Look Glenn Close's 'Beanfield' estate ...
3,N114168,news,newscrime,Hillsborough Sheriff's Office sweep results in...,TAMPA More than 80 people have been arrested...,{'human trafficking': 'Human trafficking'},"{'Human trafficking': {'type': 'item', 'id': '...",Hillsborough Sheriff's Office sweep results in...,Hillsborough Sheriff's Office sweep results in...
4,N35279,video,peopleandplaces,Family of missing Connecticut mom blast 'Gone ...,Family members and friends of Jennifer Dulos s...,{'Connecticut': 'Connecticut'},"{'Connecticut': {'type': 'item', 'id': 'Q58425...",Family of missing Connecticut mom blast 'Gone ...,Family of missing Connecticut mom blast 'Gone ...


In [3]:
# English stopword list from NLTK, held as a set for the membership test used
# when filtering tokens later in the notebook.
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Normalise raw text to lowercase ASCII letters separated by single spaces.

    HTML-like tags and URLs are blanked out, non-ASCII characters are dropped,
    and punctuation, digits and every other non-letter become whitespace, which
    is then collapsed and trimmed.
    """
    cleaned = str(text)
    # Blank out tags first, then links, before any other character is touched.
    for pattern in (r"<.*?>", r"http\S+"):
        cleaned = re.sub(pattern, " ", cleaned)
    # Non-ASCII characters are removed outright (no space is left in their place).
    cleaned = cleaned.encode("ascii", "ignore").decode()
    punctuation_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
    cleaned = cleaned.translate(punctuation_to_space)
    cleaned = re.sub(r"[^a-zA-Z ]", " ", cleaned).lower()
    # Only letters and spaces remain, so split/join collapses and trims the spacing.
    return " ".join(cleaned.split())

# Store a normalised copy of every row's text in a new "clean_text" column.
df["clean_text"] = df["text"].map(clean_text)


In [4]:
def tokenize(text):
    """Split text into word tokens and return those not present in stop_words."""
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

# One token list per row, derived from the cleaned text.
df["tokens"] = df["clean_text"].map(tokenize)


In [5]:
# Map each category name to an integer id and store the ids in a "label" column.
le = LabelEncoder().fit(df["category"])
df["label"] = le.transform(df["category"])

# Save label encoder
with open("label_encoder.pkl", mode="wb") as encoder_file:
    pickle.dump(le, encoder_file)

df.loc[:, ["category", "label"]].head()


Unnamed: 0,category,label
0,news,11
1,lifestyle,8
2,finance,4
3,news,11
4,video,16


In [7]:
# Number of rows per encoded label; the saved output lists the most frequent first.
df["label"].value_counts()


label
13    15163
11    13392
4      5259
8      3729
1      2779
14     2674
5      2645
16     2469
15     2005
6      1880
17     1621
10     1316
9      1011
2       758
7       147
3         2
12        1
0         1
Name: count, dtype: int64

In [8]:
# Keep only labels that occur at least twice; single-row classes are dropped
# (presumably so the stratified splits that follow have enough rows per class).
class_counts = df["label"].value_counts()
valid_classes = class_counts.loc[class_counts.ge(2)].index

keep_row = df["label"].isin(valid_classes)
df = df.loc[keep_row].reset_index(drop=True)


In [9]:
# Hold out 20% of the rows for testing, stratified on the label.
# The later splits of the Word2Vec features and the padded sequences pass the
# same test_size, random_state and stratify values.
split_kwargs = {"test_size": 0.2, "random_state": 42, "stratify": df["label"]}
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df["clean_text"], df["label"], **split_kwargs
)


In [10]:
# Bag-of-words count features capped at 5000 terms; the vocabulary is learned
# from the training texts only and then applied unchanged to the test texts.
bow = CountVectorizer(max_features=5000)
X_train_bow = bow.fit_transform(X_train_text)
X_test_bow = bow.transform(X_test_text)

# Persist the fitted vectorizer. The context manager closes the file handle,
# which a bare open() passed straight to pickle.dump never does explicitly.
with open("bow_vectorizer.pkl", "wb") as f:
    pickle.dump(bow, f)


In [11]:
# TF-IDF features capped at 5000 terms; fitted on the training texts only and
# then applied unchanged to the test texts.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

# Persist the fitted vectorizer. The context manager closes the file handle,
# which a bare open() passed straight to pickle.dump never does explicitly.
with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)


In [12]:
# Train Word2Vec embeddings (50 dimensions, window of 3, sg=0) on the token lists.
# NOTE(review): `sentences` is built from every row of df, including rows that
# end up in the test split, so the embeddings see test text. Confirm this is intended.
sentences = df["tokens"].tolist()

w2v_model = Word2Vec(
    sentences=sentences,
    vector_size=50,
    window=3,
    min_count=2,
    workers=2,
    sg=0
)

# Persist the trained model. The context manager closes the file handle,
# which a bare open() passed straight to pickle.dump never does explicitly.
with open("word2vec_model.pkl", "wb") as f:
    pickle.dump(w2v_model, f)


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


In [13]:
def document_vector(tokens, model):
    """Return the mean embedding of the tokens that `model.wv` contains.

    Tokens missing from `model.wv` are ignored. When none of the tokens are
    known (or `tokens` is empty) a zero vector of length `model.vector_size`
    is returned instead.
    """
    known = [tok for tok in tokens if tok in model.wv]
    if known:
        return np.mean(model.wv[known], axis=0)
    return np.zeros(model.vector_size)

# One averaged embedding per row of df, stacked in row order.
doc_vectors = [document_vector(doc_tokens, w2v_model) for doc_tokens in df["tokens"]]
X_w2v = np.array(doc_vectors)

# Split with the same settings as the text split; the label halves are
# discarded because y_train / y_test from that split are reused downstream.
X_train_w2v, X_test_w2v, _, _ = train_test_split(
    X_w2v,
    df["label"],
    stratify=df["label"],
    test_size=0.2,
    random_state=42,
)


In [14]:
# Train/test feature matrices for each representation, keyed by the model name
# that is reported in the results table.
feature_sets = {
    "LogReg + BoW": {"train": X_train_bow, "test": X_test_bow},
    "LogReg + TFIDF": {"train": X_train_tfidf, "test": X_test_tfidf},
    "LogReg + W2V": {"train": X_train_w2v, "test": X_test_w2v},
}

# One independent classifier per feature set, all with the same settings.
# NOTE(review): the saved output of this cell shows a solver iteration-limit
# warning, so at least one of these fits stopped at max_iter=500.
models = {name: LogisticRegression(max_iter=500) for name in feature_sets}

results = []

for name, model in models.items():
    model.fit(feature_sets[name]["train"], y_train)

# Each result row is [model name, accuracy on the held-out test rows].
for name, model in models.items():
    test_accuracy = accuracy_score(y_test, model.predict(feature_sets[name]["test"]))
    results.append([name, test_accuracy])


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
# Word-index tokenizer with num_words=10000 and an explicit out-of-vocabulary token.
# NOTE(review): it is fitted on every row of df, including rows that end up in
# the test split. Confirm this is intended.
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["clean_text"])

# Integer sequences for every row, padded or truncated to length 100.
X_seq = tokenizer.texts_to_sequences(df["clean_text"])
X_pad = pad_sequences(X_seq, maxlen=100)

# Persist the fitted tokenizer. The context manager closes the file handle,
# which a bare open() passed straight to pickle.dump never does explicitly.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [16]:
# Split the padded sequences with the same settings used for the text split.
X_train_pad, X_test_pad, y_train_dl, y_test_dl = train_test_split(
    X_pad,
    df["label"],
    stratify=df["label"],
    test_size=0.2,
    random_state=42,
)


In [17]:
# Embedding table with one row per tokenizer index (plus row 0) and one column
# per Word2Vec dimension. Rows for words without a Word2Vec vector stay all-zero.
embedding_dim = w2v_model.vector_size
vocab_size = len(tokenizer.word_index) + 1

embedding_matrix = np.zeros((vocab_size, embedding_dim))

word_vectors = w2v_model.wv
for token, row in tokenizer.word_index.items():
    if token not in word_vectors:
        continue
    embedding_matrix[row] = word_vectors[token]


In [19]:
# Bidirectional LSTM classifier on top of the frozen (trainable=False)
# pre-computed embedding matrix.
model_bilstm = Sequential()
model_bilstm.add(
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=100, trainable=False)
)
model_bilstm.add(Bidirectional(LSTM(64)))
model_bilstm.add(Dense(32, activation="relu"))
model_bilstm.add(Dropout(0.2))
# One softmax output per class known to the label encoder.
model_bilstm.add(Dense(len(le.classes_), activation="softmax"))

# Labels are integer ids, hence the sparse variant of the cross-entropy loss.
model_bilstm.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# NOTE(review): patience=3 together with epochs=3 below looks unable to ever
# stop training early. Confirm the intended values.
es = EarlyStopping(patience=3, restore_best_weights=True)

model_bilstm.fit(
    X_train_pad,
    y_train_dl,
    batch_size=16,
    epochs=3,
    validation_split=0.2,
    callbacks=[es],
)


Epoch 1/3
[1m2274/2274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m175s[0m 74ms/step - accuracy: 0.6260 - loss: 1.2225 - val_accuracy: 0.6867 - val_loss: 0.9468
Epoch 2/3
[1m2274/2274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 70ms/step - accuracy: 0.6875 - loss: 0.9828 - val_accuracy: 0.7007 - val_loss: 0.9155
Epoch 3/3
[1m2274/2274[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 70ms/step - accuracy: 0.7131 - loss: 0.9078 - val_accuracy: 0.7099 - val_loss: 0.8913


<keras.src.callbacks.history.History at 0x1e0fcb2dbd0>

In [20]:
# Predicted class for each test row is the index of its largest model output.
test_probabilities = model_bilstm.predict(X_test_pad)
y_pred_dl = np.argmax(test_probabilities, axis=1)
bilstm_acc = accuracy_score(y_test_dl, y_pred_dl)


[1m356/356[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 33ms/step


In [21]:
# Best ML
# NOTE(review): the model to persist is hard-coded rather than selected from
# `results`; in the saved comparison table the TF-IDF model does have the
# highest accuracy.
joblib.dump(models["LogReg + TFIDF"], "best_ml_model.pkl")

# Best DL
# Saves the trained BiLSTM model to an .h5 file in the working directory.
model_bilstm.save("best_dl_model.h5")




In [22]:
# Combine the logistic-regression accuracies with the BiLSTM accuracy into a
# single table, write it to CSV, and display it.
comparison_rows = [*results, ["BiLSTM + Word2Vec", bilstm_acc]]
comparison_df = pd.DataFrame(comparison_rows, columns=["Model", "Accuracy"])

comparison_df.to_csv("classification_model_comparison.csv", index=False)
comparison_df


Unnamed: 0,Model,Accuracy
0,LogReg + BoW,0.725418
1,LogReg + TFIDF,0.76781
2,LogReg + W2V,0.730695
3,BiLSTM + Word2Vec,0.701407
