In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score
import matplotlib.pyplot as plt
import numpy as np


In [81]:
splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}
temp_set = pd.read_json("hf://datasets/sh0416/ag_news/" + splits["train"], lines=True)
test_set = pd.read_json("hf://datasets/sh0416/ag_news/" + splits["test"], lines=True)

train_set, val_set = train_test_split(temp_set, test_size=0.1)

In [82]:
def separate_labels_text(set: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    x = set.drop(columns=["label"])
    y = set["label"]

    return x,y 

ag_train, y_train = separate_labels_text(train_set)
ag_val, y_val = separate_labels_text(val_set)
ag_test, y_test = separate_labels_text(test_set)

In [83]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

X_train = tfidf.fit_transform(ag_train["title"]+ag_train["description"])
X_val = tfidf.transform(ag_val["title"]+ag_val["description"])
X_test = tfidf.transform(ag_test["title"]+ag_test["description"])




In [84]:
print(X_train)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 1821126 stored elements and shape (108000, 5000)>
  Coords	Values
  (0, 2315)	0.3682716628747734
  (0, 63)	0.14929878428808951
  (0, 789)	0.25609629646631094
  (0, 2883)	0.25938368292731045
  (0, 608)	0.18023286272425898
  (0, 2288)	0.14510910068144578
  (0, 415)	0.218375145919774
  (0, 1525)	0.1799257942863951
  (0, 223)	0.17386587457034536
  (0, 534)	0.1852029848013651
  (0, 3971)	0.15791067383751484
  (0, 2755)	0.4681631903347782
  (0, 4986)	0.12434476241419706
  (0, 4765)	0.2404714498033257
  (0, 3982)	0.14796078466543375
  (0, 1069)	0.19049214671034992
  (0, 1520)	0.14425331481369189
  (0, 4870)	0.1295129185251378
  (0, 1795)	0.19370024135402827
  (0, 3011)	0.1702530803678367
  (0, 3438)	0.17725905994123434
  (1, 63)	0.08649362511507508
  (1, 1520)	0.16714124220899954
  (1, 250)	0.44232163710409395
  (1, 1514)	0.23026227180773629
  :	:
  (107998, 4037)	0.2597332646494897
  (107998, 4982)	0.14753170202825136
  (107998, 1

In [None]:
def grid_search(param_grid, x_train, y_train, x_val, y_val):
    best_model = None
    best_macrof1 = 0
    best_params = None

    for c in param_grid["C"]:
        for l1_ratio in param_grid["l1_ratio"]:
            for solver in param_grid["solver"]:
                for max_iter in param_grid["max_iter"]:
                    if solver in ["lbfgs", "newton-cg", "sag"] and l1_ratio != 0:
                        continue

                    if 0 < l1_ratio < 1 and solver != "saga":
                        continue

                    logmodel = LogisticRegression(
                        C=c,
                        l1_ratio=l1_ratio,
                        solver=solver,
                        max_iter=max_iter,
                        random_state=42,
                    )

                    logmodel.fit(x_train, y_train)

                    y_pred = logmodel.predict(x_val)
                    
                    new_score = f1_score(y_val, y_pred, average="macro")

                    if new_score > best_macrof1:
                        best_macrof1 = new_score
                        best_model = logmodel
                        best_params = {
                            "C": c,
                            "l1_ratio": l1_ratio,
                            "solver": solver,
                            "max_iter": max_iter,
                        }

    return best_model, best_macrof1, best_params

In [None]:
param_grid = {
    "C": np.logspace(-4, 2, 8),
    "l1_ratio": [0, 0.2, 0.4, 0.6, 0.8, 1],
    "solver": ["lbfgs", "newton-cg", "sag", "saga"],
    "max_iter": [50, 100, 500, 1000],
}

best_model, best_macrof1, best_params = grid_search(param_grid, X_train, y_train, X_val, y_val)

print(best_model)
print(best_macrof1)
print(best_params)

KeyboardInterrupt: 

In [None]:
y_pred_test = best_model.predict(X_test)

print(classification_report(y_test, y_pred_test))

cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(xticks_rotation='vertical')
plt.title("Confusion Matrix: TF-IDF + Logistic Regression")
plt.show()

In [None]:
def grid_search_svm(param_grid, x_train, y_train, x_val, y_val):
    best_model = None
    best_macrof1 = 0
    best_params = None

    for c in param_grid["C"]:
        for penalty in param_grid["penalty"]:
            for loss in param_grid["loss"]:
                if penalty == "l1" and loss == "hinge":
                    continue

                svm = LinearSVC(
                    C=c,
                    penalty=penalty,
                    loss=loss,
                    random_state=42,
                    multi_class="ovr",
                )

                svm.fit(x_train, y_train)

                y_pred = svm.predict(x_val)
                
                new_score = f1_score(y_val, y_pred, average="macro")

                if new_score > best_macrof1:
                    best_macrof1 = new_score
                    best_model = svm
                    best_params = {
                        "C": c,
                        "penalty": penalty,
                        "loss": loss,
                    }

    return best_model, best_macrof1, best_params

In [None]:
param_grid_svm = {
    "C": np.logspace(-4, 2, 8),
    "penalty": ["l1", "l2"],
    "loss": ["hinge", "squared_hinge"]
}

best_model_svm, best_macrof1_svm, best_params_svm = grid_search_svm(
    param_grid_svm, X_train, y_train, X_val, y_val
)

print(best_model_svm)
print(best_macrof1_svm)
print(best_params_svm)

In [None]:
y_pred_test_svm = best_model_svm.predict(X_test)

print(classification_report(y_test, y_pred_test_svm))

cm = confusion_matrix(y_test, y_pred_test_svm)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(xticks_rotation='vertical')
plt.title("Confusion Matrix: TF-IDF + Linear SVM")
plt.show()

In [None]:
# AG News label mapping
label_names = {1: "World", 2: "Sports", 3: "Business", 4: "Sci/Tech"}

# --- Logistic Regression Error Analysis ---
df_predictions_lr = pd.DataFrame({
    'text': ag_test["title"] + ag_test["description"],
    'true_label': y_test.map(label_names),
    'pred_label': pd.Series(y_pred_test).map(label_names).values
})

errors_lr = df_predictions_lr[df_predictions_lr['true_label'] != df_predictions_lr['pred_label']]

print("=== Logistic Regression ===")
print(f"Total Errors: {len(errors_lr)}")
print("Displaying first 20 misclassifications:")
display(errors_lr.head(20))

# --- LinearSVC Error Analysis ---
df_predictions_svm = pd.DataFrame({
    'text': ag_test["title"] + ag_test["description"],
    'true_label': y_test.map(label_names),
    'pred_label': pd.Series(y_pred_test_svm).map(label_names).values
})

errors_svm = df_predictions_svm[df_predictions_svm['true_label'] != df_predictions_svm['pred_label']]

print("\n=== LinearSVC ===")
print(f"Total Errors: {len(errors_svm)}")
print("Displaying first 20 misclassifications:")
display(errors_svm.head(20))