In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import joblib
import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

In [2]:
# Read the source CSV into `df`; later cells use its `short_descriptions`
# and `assignment_groups` columns.
df = pd.read_csv(filepath_or_buffer='../data/final_data.csv')

In [3]:
# Fraction of missing values per column (mean of the boolean null mask).
df.isna().mean()

short_descriptions    0.016200
assignment_groups     0.016146
dtype: float64

In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [5]:
# Feature column and target column, before any encoding or vectorizing.
X_org, y_org = df['short_descriptions'], df['assignment_groups']

In [6]:
# Encode the target labels as integers; the fitted encoder is reused later
# to map integer predictions back to the original labels.
encoder = LabelEncoder()
y_org_enc = encoder.fit_transform(y_org)

# Token-count vectorizer with default settings; it is only created here and
# is fitted in a later cell, after the train/test split.
vectorizer = CountVectorizer()

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_org,
    y_org_enc,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Sanity check: sizes of both label splits and their dtypes.
(
    len(y_train),
    len(y_test),
    y_train.dtype,
    y_test.dtype,
)

(130679, 32670, dtype('int32'), dtype('int32'))

In [9]:
# Fit the vectorizer on the training text only, then apply that same fitted
# vectorizer to the test text.
X_train_vec = vectorizer.fit_transform(raw_documents=X_train)
X_test_vec = vectorizer.transform(raw_documents=X_test)

In [10]:
# Logistic regression classifier: fixed seed, all available cores
# (n_jobs=-1), and verbose solver output.
model = LogisticRegression(
    random_state=0,
    n_jobs=-1,
    verbose=2,
)

In [11]:
# Train on the vectorized training text; the fitted estimator is the cell output.
model.fit(X=X_train_vec, y=y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  8.9min finished


LogisticRegression(n_jobs=-1, random_state=0, verbose=2)

In [12]:
def display_metrics(true, pred):
    """Print accuracy and weighted F1 / precision / recall as percentages.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, aligned element-wise with ``true``.

    Returns
    -------
    None
        The four scores are printed, each multiplied by 100 and rounded to
        two decimal places.
    """
    # Accuracy is computed from the arguments rather than from notebook-level
    # variables, so the function is correct for any (true, pred) pair and
    # does not depend on cell execution order.
    acc = round(accuracy_score(y_true=true, y_pred=pred) * 100, 2)
    f1 = round(f1_score(y_true=true, y_pred=pred, average='weighted') * 100, 2)
    precision = round(
        precision_score(y_true=true, y_pred=pred, average='weighted') * 100, 2)
    recall = round(
        recall_score(y_true=true, y_pred=pred, average='weighted') * 100, 2)

    print(f'Acc: {acc}')
    print(f'F1: {f1}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')

In [13]:
# Predicted integer class ids for the held-out set.
preds = model.predict(X=X_test_vec)

In [14]:
# Print the test-set scores for the fitted model.
display_metrics(y_test, preds)

Acc: 83.08
F1: 81.53
Precision: 81.63
Recall: 83.08


In [15]:
# Map the integer class ids back to the original labels.
actual = encoder.inverse_transform(y_test)
predicted = encoder.inverse_transform(preds)
# Work on a copy of the test descriptions rather than the split itself.
actual_descriptions = X_test.copy()

In [16]:
# Side-by-side table: description, true label, predicted label.
result_columns = {
    'short_descriptions': actual_descriptions,
    'true_assignment_group': actual,
    'pred_assignment_group': predicted,
}
result_df = pd.DataFrame(data=result_columns)

In [17]:
# Show the first ten rows of the comparison table.
result_df.iloc[:10]

Unnamed: 0,short_descriptions,true_assignment_group,pred_assignment_group
155077,Update DNS Resolvers GBL-WTI-AHS UNIX L1,GBL-WTI-AHS UNIX L1,GBL-WTI-AHS UNIX L1
30086,IT- Issue with IT3PL0154,G SNOW IT SERVICE DESK,G SNOW IT SERVICE DESK
32942,bh-dc-os-dhn-40.eecloud.nsn-net.net - 10.131....,SD_1stL_SD-UC_NSN_ENG_PH,SD_1stL_SD-UC_NSN_ENG_PH
147999,HCP restriction removal,Service Desk - Sales All,Service Desk - Sales All
101803,Multiple remote sites LAN and WLAN connectivit...,SD_1stL_SD-UC_NSN_ENG_PH,SD_1stL_SD-UC_NSN_ENG_PH
151055,Attach pdf document,Service Desk - Non Sales,Service Desk - Non Sales
61527,Role assignment for CDO IT Procurement Team me...,I-Buy Process Support,I-Buy Process Support
51747,PO#5020151697 closed => Please reopen,SD_UC_CoCare_NSN_GL,G SNOW EXT ATOS MTC FSM
18080,PL - Assign access to role,G SNOW PL SERVICE DESK,G SNOW PL SERVICE DESK
56435,Not able to import issue in Jira - No space le...,SD_1stL_SD-UC_NSN_ENG_PH,SD_1stL_SD-UC_NSN_ENG_PH


In [19]:
# Persist the trained classifier to disk (compression level 2).
joblib.dump(model, '../models/log_reg_tkt', compress=2)

['../models/log_reg_tkt']

In [20]:
# Persist the fitted vectorizer to disk (compression level 2).
joblib.dump(vectorizer, '../models/vectorizer', compress=2)

['../models/vectorizer']

In [21]:
# Persist the fitted label encoder to disk (compression level 2).
joblib.dump(encoder, '../models/encoder', compress=2)

['../models/encoder']

In [23]:
# Write the held-out descriptions to CSV without the index.
# NOTE: a later cell writes to this same path, replacing this file with a
# two-column version (descriptions plus true labels).
X_test.to_csv(path_or_buf='../data/testing_data.csv', index=False)

In [24]:
# Original (un-encoded) labels for the test split.
y_test_org = encoder.inverse_transform(y=y_test)

In [28]:
# Build the two-column test table (descriptions plus true labels) and write
# it to CSV without the index.
testing_df = pd.DataFrame(
    data={
        'short_descriptions': X_test.values,
        'true_asgn_grps': y_test_org,
    }
)
testing_df.to_csv(path_or_buf='../data/testing_data.csv', index=False)