In [1]:
import pandas as pd
import os

In [2]:
# Read the raw support-ticket export; the file name and its directory are kept
# as separate strings and concatenated for the read.
path = 'aa_dataset-tickets-multi-lang-5-2-50-version.csv'
data_dir = '../data/data-support-tickets/'

df = pd.read_csv(data_dir + path)

# (rows, columns) of the raw frame.
df.shape

(28587, 16)

In [3]:
# Columns needed downstream: the two text fields, both targets, and the
# language tag used for filtering.
columns_incloud = ['subject', 'body', 'queue', 'priority', 'language']

# Build the modelling frame in a single chain so that every step returns a new
# object. Assigning through `.loc` on the result of `df[columns_incloud]`
# writes into a derived frame, which is the pattern pandas flags with
# SettingWithCopyWarning.
df = (
    # Keep English tickets only, restricted to the columns listed above.
    df.loc[df['language'] == 'en', columns_incloud]
    # A missing subject becomes the placeholder "no subject"; the model input
    # is subject and body joined by a single space.
    .assign(text=lambda d: d['subject'].fillna("no subject") + " " + d['body'])
    # Only the two targets and the combined text are used from here on.
    .drop(columns=['subject', 'body', 'language'])
)

# Per-column count of missing values in the prepared frame.
df.isna().sum()

queue       0
priority    0
text        0
dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

# Split the texts and both targets in ONE call so the train/validation rows
# are aligned across targets by construction. The joint-accuracy metric
# computed later compares queue and priority predictions row by row, so it
# relies on this alignment; two independent calls only line up as long as
# their seed, test size and input length happen to stay identical.
(
    x_train_queue, x_valid_queue,
    y_train_queue, y_valid_queue,
    y_train_priority, y_valid_priority,
) = train_test_split(df.text, df.queue, df.priority, test_size=0.2, random_state=42)

# Both targets are predicted from the same texts; keep the per-target names
# that the following cells use.
x_train_priority, x_valid_priority = x_train_queue, x_valid_queue

In [8]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

from lightgbm import LGBMClassifier

In [9]:
# Text-classification pipeline: TF-IDF over unigrams and bigrams (at most
# 20000 features, terms must occur in at least 3 documents) feeding a
# class-balanced LightGBM classifier with 300 trees.
tfidf_step = TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=3)
model_step = LGBMClassifier(class_weight='balanced', n_estimators=300, random_state=42)

pipeline = Pipeline(steps=[('tf_idf', tfidf_step), ('model', model_step)])

In [10]:
# Train the pipeline on the queue target.
pipeline.fit(x_train_queue, y_train_queue)

# Predict on both splits; the validation predictions are kept in
# `y_pred_queue` because the joint-accuracy cell reuses them.
y_fit_queue = pipeline.predict(x_train_queue)
y_pred_queue = pipeline.predict(x_valid_queue)

# Macro-averaged F1 on train and validation; the gap between the two shows
# how strongly the model overfits.
f1_macro_queue_train = f1_score(y_train_queue, y_fit_queue, average='macro')
f1_macro_queue = f1_score(y_valid_queue, y_pred_queue, average='macro')
print(f"Macro F1: train {f1_macro_queue_train:.4f}, test {f1_macro_queue:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.098497 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 275981
[LightGBM] [Info] Number of data points in the train set: 13070, number of used features: 6987
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585
[LightGBM] [Info] Start training from score -2.302585




Macro F1: train 1.0000, test 0.6509


In [11]:
from sklearn.base import clone

# Fit an unfitted copy of the shared pipeline on the priority target.
# Calling `pipeline.fit` again here would overwrite the queue model trained
# in the previous cell, leaving no fitted queue classifier in the session.
# `clone` copies the hyper-parameters only, so the priority model is
# configured exactly like the queue one.
pipeline_priority = clone(pipeline)
pipeline_priority.fit(x_train_priority, y_train_priority)

# Macro-averaged F1 on the training split (overfitting check).
f1_macro_priority_train = f1_score(
    y_train_priority, pipeline_priority.predict(x_train_priority), average='macro'
)

# Validation predictions are kept in `y_pred_priority` for the joint-accuracy cell.
y_pred_priority = pipeline_priority.predict(x_valid_priority)
f1_macro_priority = f1_score(y_valid_priority, y_pred_priority, average='macro')
print(f"Macro F1: train {f1_macro_priority_train:.4f}, test {f1_macro_priority:.4f}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.094976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 275981
[LightGBM] [Info] Number of data points in the train set: 13070, number of used features: 6987
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612
[LightGBM] [Info] Start training from score -1.098612




Macro F1: train 0.9849, test 0.6490




In [12]:
# Single summary score: a weighted combination of the two validation macro-F1
# values, with priority weighted 0.7 and queue weighted 0.3.
queue_weight, priority_weight = 0.3, 0.7
overall_f1 = queue_weight * f1_macro_queue + priority_weight * f1_macro_priority
print(f"Mean Macro F1: {overall_f1:.4f}")

Mean Macro F1: 0.6496


In [13]:
# 2: Joint accuracy - the share of validation tickets for which BOTH the
# queue and the priority prediction match the true label.
queue_correct = y_valid_queue == y_pred_queue
priority_correct = y_valid_priority == y_pred_priority

joint_acc = np.mean(queue_correct & priority_correct)
print(f"Joint accuracy: {joint_acc:.4f}")

Joint accuracy: 0.4841


In [14]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

In [18]:
# Hyper-parameter candidates, keyed as `<pipeline step>__<parameter>`.
# Only one candidate is active, so the search currently amounts to a 3-fold
# cross-validated score of the baseline. A wider grid kept for reference:
#     'model__num_leaves': [31, 63],
#     'model__min_data_in_leaf': [20, 50],
#     'model__learning_rate': [0.01, 0.05, 0.1]
param_grid = {
    'model__n_estimators': [300]
}

# Stratified folds keep the class proportions of the target in every split.
cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

pipeline_queue = Pipeline(steps=[
    ('tf_idf', TfidfVectorizer(max_features=20000, ngram_range=(1, 2), min_df=3)),
    # verbosity=-1 silences LightGBM's per-iteration log lines, which would
    # otherwise flood the cell output once per fold.
    ('model', LGBMClassifier(class_weight='balanced', n_estimators=300, random_state=42, verbosity=-1))
])

grid_queue = GridSearchCV(
    estimator=pipeline_queue,
    param_grid=param_grid,
    cv=cv_strategy,
    scoring='f1_macro',
    verbose=2,
    # Run the folds one after another and let LightGBM use its own threads.
    # With n_jobs=-1 every worker process starts a fully multi-threaded
    # LightGBM fit, oversubscribing the CPU and making the search crawl.
    n_jobs=1
)

grid_queue.fit(x_train_queue, y_train_queue)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


KeyboardInterrupt: 

In [None]:
# `cv_results_` is a dict of arrays; show the key columns as a table, best
# candidate first, instead of dumping the raw dict.
pd.DataFrame(grid_queue.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score', 'mean_fit_time']
].sort_values('rank_test_score')

In [None]:


# Score the estimator refitted by the grid search on both splits. The train
# score must be recomputed here: reusing `f1_macro_queue_train` from the
# earlier baseline cell would print a number that belongs to a different model.
f1_macro_queue_train = f1_score(y_train_queue, grid_queue.predict(x_train_queue), average='macro')

y_pred_queue = grid_queue.predict(x_valid_queue)
f1_macro_queue = f1_score(y_valid_queue, y_pred_queue, average='macro')
print(f"Macro F1: train {f1_macro_queue_train:.4f}, test {f1_macro_queue:.4f}")