# In [None]:
import pandas as pd
import numpy as np
import string
import joblib
import os

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# --------- Preprocessing ----------
# English stop words, held in a set for constant-time membership tests.
stop_words = set(stopwords.words('english'))

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalize *text* for vectorization.

    The input is coerced to ``str``, lower-cased, split on whitespace,
    filtered against ``stop_words`` and stripped of every character in
    ``string.punctuation``.

    Stop words are matched twice per token: once with interior punctuation
    intact and once after punctuation removal. The first check is needed
    because the stop-word list contains contractions such as "don't";
    stripping the apostrophe first turns them into "dont", which is not in
    the list and would slip through the filter.

    Args:
        text: Value to clean; non-string values are converted with ``str``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept = []
    for raw_token in str(text).lower().split():
        # Compare with surrounding punctuation trimmed but apostrophes kept,
        # so "don't," still matches the stop word "don't".
        if raw_token.strip(string.punctuation) in stop_words:
            continue
        token = raw_token.translate(_PUNCTUATION_TABLE)
        # Skip tokens that were pure punctuation, and tokens that only
        # become a stop word once punctuation is removed.
        if token and token not in stop_words:
            kept.append(token)
    return ' '.join(kept)

# --------- Load Data ----------
df = pd.read_csv("news.csv")

# Replace missing text fields with empty strings so the concatenation below
# produces text rather than NaN.
for text_column in ('title', 'description', 'content'):
    df[text_column] = df[text_column].fillna('')

# One combined document per row: title, description and content,
# separated by single spaces.
df['text'] = (
    df['title']
    + ' ' + df['description']
    + ' ' + df['content']
)

# Rows without a category label cannot be used for training.
df = df[df['category'].notna()]

# Optional: keep only the three most frequent categories.
category_counts = df['category'].value_counts()
top_categories = category_counts.nlargest(3).index
df = df.loc[df['category'].isin(top_categories)]

# Clean the combined text of the remaining rows.
df['text'] = df['text'].map(preprocess)

# --------- Split & Vectorize ----------
y = df['category']

# Split the raw text BEFORE fitting TF-IDF so that the vocabulary and IDF
# weights are learned from the training rows only. Fitting the vectorizer on
# the full corpus leaks test-set statistics into the features and makes the
# reported scores optimistic. The split arguments are unchanged, so the same
# rows land in train and test as before.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer(max_features=1000)

# GaussianNB needs dense input, so the sparse TF-IDF matrices are converted
# to arrays. No dimensionality reduction is applied here; the feature count
# is capped only by max_features above.
X_train = vectorizer.fit_transform(text_train).toarray()
X_test = vectorizer.transform(text_test).toarray()

# --------- Train Model ----------
model = GaussianNB()
model.fit(X_train, y_train)

# Evaluate: predict once and reuse the result for both metrics.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# --------- Save Model & Vectorizer ----------
# Persist both fitted objects: the classifier was trained on the
# vectorizer's output, so the two are stored side by side.
os.makedirs("output_model", exist_ok=True)
for artifact, destination in (
    (model, "output_model/naive_bayes_model.pkl"),
    (vectorizer, "output_model/tfidf_vectorizer.pkl"),
):
    joblib.dump(artifact, destination)

print(" Model and vectorizer saved.")
