In [None]:
import pandas as pd

# Load the dataset from the CSV file next to the notebook.
df = pd.read_csv('ag_news_train.csv')
print(df.columns)  # Check column names
print(df.head())

# Use the actual label column name
print(df['Class Index'].value_counts())

# Combine Title and Description into one feature.
# String concatenation with a missing value yields NaN (a float), which would
# break the string processing applied to this column later on, so missing
# pieces are treated as empty strings instead.
df['text'] = df['Title'].fillna('') + " " + df['Description'].fillna('')


In [None]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used by the preprocessing code below.
nltk.download('stopwords')
nltk.download('wordnet')
# Some NLTK releases make the WordNet reader load the Open Multilingual
# WordNet data too; without it the first lemmatize() call raises LookupError.
# NOTE(review): confirm against the NLTK version pinned for this notebook.
nltk.download('omw-1.4')

# A set gives constant-time membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Translation table that deletes every character listed in string.punctuation.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def preprocess(text):
    """Normalise one document into a space-separated string of lemmatized tokens.

    Steps: lowercase, delete every character in ``string.punctuation``, split
    on whitespace, drop tokens found in the module-level ``stop_words`` set and
    lemmatize the remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: The raw document. Non-string values (for example the float NaN
            that pandas uses for missing cells, or ``None``) are treated as
            empty documents instead of raising ``AttributeError``.

    Returns:
        The surviving tokens joined by single spaces, or ``""`` when no token
        survives or ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # NOTE(review): the apostrophe is part of string.punctuation, so a
    # contraction such as "don't" reaches the stop-word check as "dont".
    # Confirm whether stop_words holds apostrophe forms that can never match.
    stripped = text.lower().translate(_PUNCT_TABLE)
    tokens = stripped.split()
    kept = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return " ".join(kept)

# Run preprocess on every document, element by element, and keep the result
# next to the raw text.
df['cleaned'] = df['text'].map(preprocess)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned documents into a TF-IDF matrix, with the vocabulary capped
# through max_features.
# NOTE(review): the vectorizer is fitted on every row of df here, i.e. before
# any train/test split is made.
vectorizer = TfidfVectorizer(
    max_features=5000,
)
X = vectorizer.fit_transform(raw_documents=df['cleaned'])


In [None]:
from sklearn.model_selection import train_test_split

y = df['Class Index']  # or df['Category'] if you mapped class names

# Split the cleaned documents themselves rather than a matrix produced by a
# vectorizer fitted on all rows: fitting TF-IDF before splitting lets the test
# documents influence the vocabulary and IDF weights (data leakage).
# test_size and random_state are unchanged from the previous version.
docs_train, docs_test, y_train, y_test = train_test_split(
    df['cleaned'], y, test_size=0.2, random_state=42
)

# Refit the vectorizer on the training documents only, then reuse that fitted
# vocabulary/IDF to transform the held-out documents.
X_train = vectorizer.fit_transform(docs_train)
X_test = vectorizer.transform(docs_test)


In [None]:
print("----------------------------------------------------------- Logistic Regression Prediction Accuracy Results -----------------------------------------------")
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# fit() returns the estimator itself, so construction and training are chained.
logreg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

# Compute both summaries on the held-out split, then print them.
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
logreg_report = classification_report(y_test, y_pred_logreg)
print("Logistic Regression Accuracy:", logreg_accuracy)
print(logreg_report)


In [None]:
import numpy as np

# Shift the labels so the smallest training label becomes 0. The same offset
# (taken from the training labels) is applied to the test labels so both
# splits share one encoding.
label_offset = np.min(y_train)
y_train_fixed = np.array(y_train) - label_offset
y_test_fixed = np.array(y_test) - label_offset

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier

def _fit_and_report(model, heading):
    """Train ``model`` on the training split, print its test metrics, return predictions.

    ``heading`` is printed in front of the accuracy value; the classification
    report for the same predictions is printed right after it.
    """
    model.fit(X_train, y_train_fixed)
    predictions = model.predict(X_test)
    print(heading, accuracy_score(y_test_fixed, predictions))
    print(classification_report(y_test_fixed, predictions, digits=4))
    return predictions


print("-" * 50, "Prediction Accuracy Results", "-" * 50)

# Both models are trained and scored against the shifted labels.
logreg = LogisticRegression(max_iter=1000)
y_pred_logreg = _fit_and_report(logreg, "\nLogistic Regression Accuracy:")

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
y_pred_xgb = _fit_and_report(xgb, "\nXGBoost Accuracy:")
