In [1]:
import time
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from dask_ml.wrappers import ParallelPostFit

from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('../data/final_data.csv')
df.tail()

Unnamed: 0,short_descriptions,assignment_groups
166044,Data files per our call,PZC-FJ-ServiceDesk
166045,Approver Matrix,PZC-FJ-ServiceDesk
166046,a new Product Hierarchy Level 5 and 6: Vegeta...,PZC-FJ-ServiceDesk
166047,[SAP] Create SAP account in PROD for Charala...,PZC-FJ-ServiceDesk
166048,APO Daily and Monthly Chains to be triggere...,PZC-FJ-ServiceDesk


In [3]:
df.isna().sum()

short_descriptions    2690
assignment_groups     2681
dtype: int64

In [4]:
# Drop every row that has a missing value in any column. This mutates `df`
# in place, so all later cells only ever see complete rows.
df.dropna(inplace=True)

In [5]:
X = df['short_descriptions']
y = df['assignment_groups']

In [6]:
cnt_vec = CountVectorizer()
le = LabelEncoder()

In [7]:
# Turn the raw descriptions into a sparse bag-of-words count matrix and the
# group names into integer class ids. Both names are rebound here, so `X` and
# `y` no longer refer to the original pandas Series after this cell.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so its vocabulary also contains tokens that only occur in test rows.
X = cnt_vec.fit_transform(X)
y = le.fit_transform(y.astype(str))

In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [10]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(130679, 54408) (130679,) (32670, 54408) (32670,)


In [11]:
# Candidate classifiers. The training loop further down fits and scores each
# entry in this order. n_jobs=-1 asks an estimator to use all available cores.
# SVC and DecisionTreeClassifier are currently disabled (commented out).
models = [
    MultinomialNB(),
    LogisticRegression(n_jobs=-1),
    RandomForestClassifier(n_jobs=-1),
#     SVC(),
    XGBClassifier(n_jobs=-1),
#     DecisionTreeClassifier()
]


def display_metrics(true, pred):
    """Print accuracy and weighted F1 / precision / recall as percentages.

    Args:
        true: Ground-truth labels.
        pred: Predicted labels, aligned element-wise with ``true``.

    Returns:
        None. The four scores are printed, one per line, each multiplied by
        100 and rounded to two decimals.
    """
    # average='weighted' combines the per-class scores weighted by the number
    # of true instances of each class.
    f1 = round(f1_score(y_true=true, y_pred=pred, average='weighted') * 100, 2)
    precision = round(precision_score(y_true=true, y_pred=pred, average='weighted') * 100, 2)
    recall = round(recall_score(y_true=true, y_pred=pred, average='weighted') * 100, 2)
    # Accuracy must be computed from the arguments like the other scores.
    # Reading the notebook-level `y_test` / `preds` here would report a stale
    # or unrelated value whenever the function is called with other arrays.
    acc = round(accuracy_score(y_true=true, y_pred=pred) * 100, 2)

    print(f'Acc: {acc}')
    print(f'F1: {f1}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')

In [12]:
%%time
# Fitted wrappers, keyed by the class name of the wrapped estimator.
trained_models = dict()

for model in models:
    name = model.__class__.__name__
    print(f'Training -> {name}')

    # The clock covers wrapper construction and fitting only; prediction and
    # scoring happen after it has been stopped.
    s = time.time()
    wrapper = ParallelPostFit(estimator=model)
    trained_models[name] = wrapper
    wrapper.fit(X_train, y_train)
    e = time.time()

    # Score the freshly fitted model on the held-out split.
    preds = wrapper.predict(X_test)
    display_metrics(true=y_test, pred=preds)

    print(f'Training time: {round(e - s)} seconds')
    print('-' * 10)

Training -> MultinomialNB


  _warn_prf(average, modifier, msg_start, len(result))


Acc: 75.13
F1: 69.42
Precision: 73.12
Recall: 75.13
Training time: 1 seconds
----------
Training -> LogisticRegression


  _warn_prf(average, modifier, msg_start, len(result))


Acc: 83.08
F1: 81.53
Precision: 81.63
Recall: 83.08
Training time: 654 seconds
----------
Training -> RandomForestClassifier


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Acc: 81.82
F1: 80.07
Precision: 80.48
Recall: 81.82
Training time: 1084 seconds
----------
Training -> XGBClassifier
Acc: 57.09
F1: 41.5
Precision: 32.6
Recall: 57.09
Training time: 862 seconds
----------
Wall time: 51min 46s


In [13]:
import joblib

In [14]:
trained_models

{'MultinomialNB': ParallelPostFit(estimator=MultinomialNB()),
 'LogisticRegression': ParallelPostFit(estimator=LogisticRegression(n_jobs=-1)),
 'RandomForestClassifier': ParallelPostFit(estimator=RandomForestClassifier(n_jobs=-1)),
 'XGBClassifier': ParallelPostFit(estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                         colsample_bylevel=1, colsample_bynode=1,
                                         colsample_bytree=1, gamma=0, gpu_id=-1,
                                         importance_type='gain',
                                         interaction_constraints='',
                                         learning_rate=0.300000012,
                                         max_delta_step=0, max_depth=6,
                                         min_child_weight=1, missing=nan,
                                         monotone_constraints='()',
                                         n_estimators=100, n_jobs=-1,
                           

In [16]:
joblib.dump(value=trained_models['MultinomialNB'], filename='../models/naive_bayes_tkt.pkl', compress=True)

['../models/naive_bayes_tkt.pkl']

In [17]:
joblib.dump(value=trained_models['LogisticRegression'], filename='../models/log_reg_tkt.pkl', compress=True)

['../models/log_reg_tkt.pkl']

In [19]:
joblib.dump(value=trained_models['RandomForestClassifier'], filename='../models/random_forest_tkt.joblib', compress=True)

['../models/random_forest_tkt.joblib']

In [20]:
joblib.dump(value=trained_models['XGBClassifier'], filename='../models/xgb_tkt.pkl', compress=True)

['../models/xgb_tkt.pkl']

In [21]:
# Persist the fitted CountVectorizer and LabelEncoder next to the models. The
# saved classifiers consume vectorized text and emit encoded class ids, so
# both objects are needed to use them on raw tickets later.
# Unlike the model dumps, these two files are written without compress=True.
joblib.dump(value=cnt_vec, filename='../models/vectorizer.pkl')
joblib.dump(value=le, filename='../models/encoder.pkl')

['../models/encoder.pkl']

In [22]:
type(X_train), type(X_test), type(y_train), type(y_test)

(scipy.sparse.csr.csr_matrix,
 scipy.sparse.csr.csr_matrix,
 numpy.ndarray,
 numpy.ndarray)

In [25]:
y_train_orig = le.inverse_transform(y_train)

In [26]:
y_train_orig

array(['GBL-WTI-AHS UNIX L1', 'SD_UC_CoCare_NSN_GL',
       'G SNOW RO SERVICE DESK', ..., 'Service Desk - Non Sales',
       'SD_1stL_SD-UC_NSN_ENG_PH', 'SD_1stL_SD-UC_NSN_ENG_PH'],
      dtype=object)

In [27]:
model = joblib.load(filename='../models/log_reg_tkt.pkl')

In [28]:
# Take the raw text and labels of the complete cleaned frame again.
# NOTE(review): `df` is the frame the training split was drawn from, so any
# score computed from X_org / y_org includes rows the model was fitted on. It
# is not a held-out estimate; use the test-split scores printed by the
# training loop to compare models.
X_org = df['short_descriptions']
y_org = df['assignment_groups']

In [29]:
vectorizer = joblib.load(filename='../models/vectorizer.pkl')
encoder = joblib.load(filename='../models/encoder.pkl')

In [30]:
# Re-encode the full cleaned dataset with the artifacts reloaded from disk.
X_org_vec = vectorizer.transform(X_org)
# The encoder was fitted on labels cast to str, so apply the same cast before
# transforming; otherwise a non-string label would not match the fitted
# classes.
y_org_enc = encoder.transform(y_org.astype(str))

In [33]:
preds = model.predict(X_org_vec)

In [34]:
accuracy_score(y_true=y_org_enc, y_pred=preds)

0.8757996681950915

In [35]:
f1_score(y_true=y_org_enc, y_pred=preds, average='weighted')

0.8662387130181455

In [36]:
precision_score(y_true=y_org_enc, y_pred=preds, average='weighted')

  _warn_prf(average, modifier, msg_start, len(result))


0.8723425898308479

In [37]:
recall_score(y_true=y_org_enc, y_pred=preds, average='weighted')

0.8757996681950915