## 1. Load data


In [1]:
def load_data(file_path):
    """
    Load data from the text file into a pandas DataFrame.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Read the first line to determine the separator
        first_line = file.readline().strip()
        num = int(first_line)
        # Read non-empty lines, strip line separators
        lines = [line.strip() for line in file.readlines() if line.strip()]
    data = []
    for line in lines:
        label, text = line.split(' ', 1)
        data.append((int(label), text))
    return num, data

num, data = load_data("trainingdata.txt")
if num == len(data):
    print("All data loaded successfully.")
    print(f"Sample data:")  # Print first 5 samples for verification
    for i in range(min(5, len(data))):
        print(f"Category {data[i][0]}: {data[i][1]}")
else:
    print(f"Warning: Expected {num} samples, but loaded {len(data)} samples.")

All data loaded successfully.
Sample data:
Category 1: champion products ch approves stock split champion products inc said its board of directors approved a two for one stock split of its common shares for shareholders of record as of april the company also said its board voted to recommend to shareholders at the annual meeting april an increase in the authorized capital stock from five mln to mln shares reuter
Category 1: cobanco inc cbco year net shr cts vs dlrs net vs assets mln vs mln deposits mln vs mln loans mln vs mln note th qtr not available year includes extraordinary gain from tax carry forward of dlrs or five cts per shr reuter
Category 1: am international inc am nd qtr jan oper shr loss two cts vs profit seven cts oper shr profit vs profit revs mln vs mln avg shrs mln vs mln six mths oper shr profit nil vs profit cts oper net profit vs profit revs mln vs mln avg shrs mln vs mln note per shr calculated after payment of preferred dividends results exclude credits of or four

In [2]:
import pandas as pd

def load_df(file_path):
	_, data = load_data(file_path)
	df = pd.DataFrame(data, columns=['label', 'text'])
	return df

train_df = load_df("trainingdata.txt")
test_df = load_df("testingdata.txt")

print("Training DataFrame:")
print(train_df.head())
print("\nTest DataFrame:")
print(test_df.head())


Training DataFrame:
   label                                               text
0      1  champion products ch approves stock split cham...
1      1  cobanco inc cbco year net shr cts vs dlrs net ...
2      1  am international inc am nd qtr jan oper shr lo...
3      1  brown forman inc bfd th qtr net shr one dlr vs...
4      1  dean foods df sees strong th qtr earnings dean...

Test DataFrame:
   label                                               text
0      1  thackeray corp thk year loss oper shr loss cts...
1      1  allied lyons sees substantial second half grow...
2      1  dryclean usa year net shr cts vs cts net vs re...
3      2  first financial ffmc acquires tel a data first...
4      2  delta says court order will not delay merger d...


## 2. TF-IDF Vectorizer and Model Selection

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=2)
X_train = vectorizer.fit_transform(train_df['text'])
X_test = vectorizer.transform(test_df['text'])

y_train = train_df['label']
y_test = test_df['label']

models = {
    "LogReg": LogisticRegression(max_iter=2000),
    "LinearSVC": LinearSVC()
}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="f1_macro")
    print(f"{name} mean F1: {scores.mean():.3f}")

LogReg mean F1: 0.700
LinearSVC mean F1: 0.725


## 3. Fitting and Model Evaluation

In [4]:
import numpy as np

clf = LinearSVC()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

def get_category_keywords(df, vectorizer, top_n=8):
    keywords = {}
    for cat in sorted(df.label.unique()):
        texts = df[df.label == cat]["text"]
        vec = vectorizer.transform(texts)
        mean_tfidf = np.asarray(vec.mean(axis=0)).flatten()
        top_ids = mean_tfidf.argsort()[::-1][:top_n]
        keywords[cat] = [vectorizer.get_feature_names_out()[i] for i in top_ids]
    return keywords

category_keywords = get_category_keywords(train_df, vectorizer)
for cat, words in category_keywords.items():
    print(f"Category {cat}: {', '.join(words)}")

              precision    recall  f1-score   support

           1      1.000     1.000     1.000         3
           2      1.000     0.667     0.800         3
           3      0.750     1.000     0.857         3
           4      1.000     0.667     0.800         3
           5      0.750     1.000     0.857         3
           6      1.000     1.000     1.000         3
           7      0.333     0.333     0.333         3
           8      0.000     0.000     0.000         3

    accuracy                          0.708        24
   macro avg      0.729     0.708     0.706        24
weighted avg      0.729     0.708     0.706        24

Confusion matrix:
 [[3 0 0 0 0 0 0 0]
 [0 2 0 0 0 0 0 1]
 [0 0 3 0 0 0 0 0]
 [0 0 0 2 1 0 0 0]
 [0 0 0 0 3 0 0 0]
 [0 0 0 0 0 3 0 0]
 [0 0 0 0 0 0 1 2]
 [0 0 1 0 0 0 2 0]]
Category 1: vs, mln, cts, vs mln, mln vs, shr, share, dlrs
Category 2: shares, said, barbara, santa barbara, santa, dlrs, company, corp
Category 3: trade, gatt, chief, commissio