_Imports_

In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
from azstemmer import AzStemmer
from scipy.sparse import hstack
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier
from functions import add_spacing, remove_stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import CountVectorizer

# Register tqdm's `progress_apply` on pandas objects; the preprocessing cells
# below call it to show a progress bar while transforming each column.
tqdm.pandas()
# Single stemmer instance shared by the whole notebook.
# NOTE(review): `keyboard="az"` presumably selects the Azerbaijani keyboard
# layout -- confirm against the azstemmer documentation.
stemmer = AzStemmer(keyboard="az")

_Load Dataset_

In [None]:
# Read the raw dataset and keep only the rows in which every field used by
# the pipeline (title, text, category) is present.
required_columns = ['title', 'text', 'category']
df = pd.read_parquet("./data/data.parquet").dropna(subset=required_columns)

_Stem Data_

In [None]:
# Reduce the words of both text columns to their roots with azstemmer
# (https://pypi.org/project/azstemmer/).
# NOTE: azstemmer is a library written by the author of this notebook.
# The columns are overwritten in place, so re-running this cell applies the
# stemmer again to text that has already been stemmed.
print("Stemming...")
for column in ('title', 'text'):
    df[column] = df[column].progress_apply(stemmer.stem)

# Insert spacing between symbols, numbers and words.
print("Adding spacings...")
for column in ('title', 'text'):
    df[column] = df[column].progress_apply(add_spacing)

_Remove Stopwords_

In [None]:
# Strip stop words from both text columns.
# NOTE: keeping the stop words gives slightly better results; they are removed
# here for demonstration purposes.
print("Removing stop words...")
for column in ('title', 'text'):
    df[column] = df[column].progress_apply(remove_stopwords)

_Split Data_

In [None]:
# Stratified 80/20 train/test split; both text fields and the labels are split
# together so that their rows stay aligned.
title, text, y = df['title'], df['text'], df['category']

(
    title_train, title_test,
    text_train, text_test,
    y_train, y_test,
) = train_test_split(
    title, text, y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

_Vectorize Data_

In [None]:
# Bag-of-words features: one CountVectorizer per field, fitted on the training
# split only and then reused to transform the test split.
title_vectorizer = CountVectorizer(max_features=7500, min_df=2, max_df=0.95)
title_train_vec = title_vectorizer.fit_transform(title_train)
title_test_vec = title_vectorizer.transform(title_test)

text_vectorizer = CountVectorizer(max_features=15000, min_df=2, max_df=0.95)
text_train_vec = text_vectorizer.fit_transform(text_train)
text_test_vec = text_vectorizer.transform(text_test)

# Final feature matrices: title columns first, text columns second.
X_train_vec = hstack([title_train_vec, text_train_vec])
X_test_vec = hstack([title_test_vec, text_test_vec])

_Set Balanced Class Weights_

In [None]:
# Balanced class weights compensate for class imbalance: each label gets a
# weight from sklearn's 'balanced' scheme computed on the training labels, and
# every training row then inherits the weight of its label.
classes = y.unique()
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
weights = {label: weight for label, weight in zip(classes, class_weights)}
sample_weights = y_train.map(weights)

_Training_

In [None]:
# Initialize and fit the model.
# `early_stopping_rounds` can only act when CatBoost has a validation set to
# monitor, so a stratified 10% slice of the training data is held out and
# passed as `eval_set`. The test split stays untouched for the final evaluation.
# NOTE: stratifying requires every class to occur at least twice in y_train.
X_fit, X_val, y_fit, y_val, w_fit, _w_val = train_test_split(
    X_train_vec.tocsr(),  # CSR so rows can be sliced into fit/validation parts
    y_train,
    sample_weights,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

model = CatBoostClassifier(iterations=100, learning_rate=0.13, early_stopping_rounds=30, random_state=42)
# Training stops once the validation metric has not improved for 30 iterations.
model.fit(X_fit, y_fit, sample_weight=w_fit, eval_set=(X_val, y_val))

_Evaluation_

In [None]:
# Score the fitted model on both splits using the weighted F1 metric.
y_pred_train = model.predict(X_train_vec)
y_pred_test = model.predict(X_test_vec)

train_score = f1_score(y_train, y_pred_train, average='weighted')
test_score = f1_score(y_test, y_pred_test, average='weighted')

print("Train Weighted F1 Score:", train_score)
print("Test Weighted F1 Score:", test_score)

In [None]:
# Compute the confusion matrix (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred_test, labels=model.classes_)

# Express every row as percentages of that actual class. A class without any
# test sample has a row total of 0; such rows are left at 0 explicitly instead
# of dividing by zero and patching the resulting NaNs afterwards.
row_totals = cm.sum(axis=1, keepdims=True)
cm_percent = np.divide(
    cm.astype(float),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
) * 100

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(cm_percent, annot=True, fmt=".2f", cmap="YlOrRd",
            xticklabels=model.classes_, yticklabels=model.classes_, ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix (Percentages)')
fig.tight_layout()
plt.show()

_Save model_

In [None]:
# Persist the two fitted vectorizers (pickled) and the trained CatBoost model
# under ./models/catboost, creating the directory when it does not exist yet.
os.makedirs("./models/catboost", exist_ok=True)

artifacts = (
    ("./models/catboost/title_vectorizer.pkl", title_vectorizer),
    ("./models/catboost/text_vectorizer.pkl", text_vectorizer),
)
for path, vectorizer in artifacts:
    with open(path, "wb") as f:
        pickle.dump(vectorizer, f)

model.save_model("./models/catboost/model.cbm")