## Issue type Model

### 2. Feature Engineering

#### Using Word2Vec technique to convert my preprocessed data into vector

In [1]:

import pandas as pd
import numpy as np
from gensim.models import Word2Vec

In [2]:
df = pd.read_csv('../dataset/preprocessed_dataset.csv')

In [3]:
df

Unnamed: 0,ticket_id,ticket_text,issue_type,urgency_level,product,preprocessed_tokens
0,1,Payment issue for my SmartWatch V2. I was unde...,Billing Problem,Medium,SmartWatch V2,"['payment', 'issue', 'smartwatch', 'v', 'under..."
1,3,I ordered SoundWave 300 but got EcoBreeze AC i...,Wrong Item,Medium,SoundWave 300,"['order', 'soundwave', 'get', 'ecobreeze', 'ac..."
2,4,Facing installation issue with PhotoSnap Cam. ...,Installation Issue,Low,PhotoSnap Cam,"['face', 'installation', 'issue', 'photosnap',..."
3,6,Can you tell me more about the PhotoSnap Cam w...,General Inquiry,Medium,PhotoSnap Cam,"['tell', 'photosnap', 'cam', 'warranty', 'also..."
4,7,is malfunction. It stopped working after just...,Product Defect,Low,EcoBreeze AC,"['malfunction', 'stop', 'work', 'day']"
...,...,...,...,...,...,...
821,995,Is this item in stock?,General Inquiry,High,RoboChef Blender,"['item', 'stock']"
822,996,I ordered EcoBreeze AC but got FitRun Treadmil...,Wrong Item,High,EcoBreeze AC,"['order', 'ecobreeze', 'ac', 'get', 'fitrun', ..."
823,997,I ordered SoundWave 300 but got PowerMax Batte...,Wrong Item,Low,SoundWave 300,"['order', 'soundwave', 'get', 'powermax', 'bat..."
824,999,Payment issue fr mi SoundWave 300. I was debit...,Billing Problem,Low,SoundWave 300,"['payment', 'issue', 'fr', 'mi', 'soundwave', ..."


In [4]:
w2v_model = Word2Vec(sentences=df['preprocessed_tokens'], vector_size=100, window=5, min_count=1, workers=4, seed=42)

In [5]:
w2v_model.raw_vocab

defaultdict(int, {})

In [6]:
def document_vector(tokens, model):
    
    # Removing out-of-vocab words
    tokens = [word for word in tokens if word in model.wv]
    if len(tokens) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[tokens], axis=0)

In [7]:
X_w2v = np.vstack(df['preprocessed_tokens'].apply(lambda tokens: document_vector(tokens, w2v_model)).values)
y = df['issue_type']

In [8]:
X_w2v

array([[-0.03835633,  0.01621188,  0.12592445, ...,  0.13041556,
        -0.11522511, -0.10828656],
       [-0.02761844,  0.02240099,  0.12205932, ...,  0.12959276,
        -0.11338402, -0.11029061],
       [-0.05680327,  0.00433684,  0.13940741, ...,  0.12366535,
        -0.11794174, -0.1124865 ],
       ...,
       [-0.03514046,  0.01610858,  0.12875716, ...,  0.12810971,
        -0.11484641, -0.11253895],
       [-0.03865514,  0.01668728,  0.12724738, ...,  0.13115376,
        -0.11548582, -0.10942892],
       [-0.03420815,  0.01476219,  0.12976038, ...,  0.13020842,
        -0.1137853 , -0.11635811]], dtype=float32)

In [9]:
y

0         Billing Problem
1              Wrong Item
2      Installation Issue
3         General Inquiry
4          Product Defect
              ...        
821       General Inquiry
822            Wrong Item
823            Wrong Item
824       Billing Problem
825        Product Defect
Name: issue_type, Length: 826, dtype: object

In [10]:
# Saving my Word2Vec model
w2v_model.save('../models/w2v_model.model')

### Creating extra feature [ Text length, Sentiment Score]

In [11]:
from textblob import TextBlob

def extract_custom_features(text_series):
    # Extracts length and sentiment
    text_length = text_series.apply(len)
    sentiment = text_series.apply(lambda x: TextBlob(x).sentiment.polarity)
    return pd.DataFrame({
        'text_length': text_length,
        'sentiment': sentiment
    })

In [12]:
extra_features = extract_custom_features(df['ticket_text'])

In [13]:
extra_features

Unnamed: 0,text_length,sentiment
0,71,0.0
1,80,0.0
2,68,-0.5
3,84,0.3
4,54,0.0
...,...,...
821,22,0.0
822,83,0.0
823,143,0.0
824,77,0.0


In [14]:
from sklearn.preprocessing import FunctionTransformer

# Create transformer
custom_feature_transformer = FunctionTransformer(extract_custom_features, validate=False)

# Example usage
features = custom_feature_transformer.transform(df['ticket_text'])

In [15]:
features

Unnamed: 0,text_length,sentiment
0,71,0.0
1,80,0.0
2,68,-0.5
3,84,0.3
4,54,0.0
...,...,...
821,22,0.0
822,83,0.0
823,143,0.0
824,77,0.0


In [16]:
import joblib

In [17]:
# Save the custom feature transformer
joblib.dump(custom_feature_transformer, '../models/custom_feature_transformer.pkl')

['../models/custom_feature_transformer.pkl']

In [18]:
# Combining extra_features (DataFrame) and X_w2v (numpy array) into a single feature matrix X

X = np.hstack([extra_features.values, X_w2v])

In [19]:
X

array([[ 7.10000000e+01,  0.00000000e+00, -3.83563340e-02, ...,
         1.30415559e-01, -1.15225114e-01, -1.08286560e-01],
       [ 8.00000000e+01,  0.00000000e+00, -2.76184417e-02, ...,
         1.29592761e-01, -1.13384023e-01, -1.10290609e-01],
       [ 6.80000000e+01, -5.00000000e-01, -5.68032674e-02, ...,
         1.23665348e-01, -1.17941745e-01, -1.12486497e-01],
       ...,
       [ 1.43000000e+02,  0.00000000e+00, -3.51404585e-02, ...,
         1.28109708e-01, -1.14846408e-01, -1.12538949e-01],
       [ 7.70000000e+01,  0.00000000e+00, -3.86551432e-02, ...,
         1.31153762e-01, -1.15485817e-01, -1.09428920e-01],
       [ 6.60000000e+01, -2.00000000e-01, -3.42081487e-02, ...,
         1.30208418e-01, -1.13785297e-01, -1.16358109e-01]])

In [20]:
from sklearn.model_selection import train_test_split

X_train_issue, X_test_issue, y_train_issue, y_test_issue = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42
)

In [21]:
X_train_issue


array([[ 7.70000000e+01,  0.00000000e+00, -4.20316011e-02, ...,
         1.25635013e-01, -1.16944209e-01, -1.17210083e-01],
       [ 1.14000000e+02, -3.00000000e-01, -3.76265682e-02, ...,
         1.28414065e-01, -1.15328103e-01, -1.13157041e-01],
       [ 1.23000000e+02, -5.00000000e-01, -4.90929224e-02, ...,
         1.25079900e-01, -1.17152065e-01, -1.14220090e-01],
       ...,
       [ 5.40000000e+01, -3.00000000e-01, -2.74011977e-02, ...,
         1.34094074e-01, -1.12723649e-01, -1.08312480e-01],
       [ 1.01000000e+02, -1.00000000e-01, -3.68981063e-02, ...,
         1.30426228e-01, -1.14954956e-01, -1.07263356e-01],
       [ 8.50000000e+01,  2.44444444e-01, -4.11610343e-02, ...,
         1.30203813e-01, -1.19940065e-01, -1.10152148e-01]])

In [22]:
y_train_issue

458        Account Access
784         Late Delivery
404    Installation Issue
734        Account Access
610       General Inquiry
              ...        
646        Product Defect
406    Installation Issue
352         Late Delivery
338       Billing Problem
645       General Inquiry
Name: issue_type, Length: 660, dtype: object

In [23]:
from sklearn.preprocessing import LabelEncoder

issue_encoder = LabelEncoder()

y_issue_train_enc = issue_encoder.fit_transform(y_train_issue)
y_issue_test_enc = issue_encoder.transform(y_test_issue)

In [24]:
y_issue_train_enc

array([0, 4, 3, 0, 2, 4, 5, 6, 3, 3, 5, 5, 1, 6, 1, 3, 6, 0, 6, 3, 5, 4,
       5, 2, 6, 0, 5, 4, 6, 2, 1, 2, 4, 5, 2, 0, 1, 6, 3, 2, 0, 1, 6, 2,
       1, 6, 2, 2, 2, 3, 0, 1, 0, 2, 6, 0, 2, 0, 5, 2, 1, 2, 0, 1, 0, 1,
       3, 2, 0, 2, 0, 2, 0, 1, 6, 1, 4, 2, 5, 4, 3, 2, 1, 6, 4, 5, 0, 6,
       2, 1, 4, 2, 6, 6, 5, 3, 1, 5, 3, 4, 0, 3, 6, 6, 4, 1, 1, 2, 0, 1,
       4, 3, 0, 6, 3, 1, 2, 3, 6, 2, 4, 1, 3, 4, 6, 2, 1, 1, 2, 4, 6, 3,
       6, 3, 2, 6, 6, 2, 1, 0, 2, 3, 5, 6, 0, 1, 3, 0, 2, 0, 2, 4, 3, 2,
       4, 1, 4, 2, 5, 0, 0, 6, 2, 3, 5, 0, 4, 3, 0, 6, 2, 3, 5, 0, 0, 5,
       5, 0, 4, 3, 3, 4, 0, 5, 2, 6, 0, 1, 3, 0, 5, 6, 3, 4, 0, 3, 0, 5,
       1, 6, 3, 3, 6, 1, 5, 1, 0, 0, 1, 6, 0, 2, 4, 1, 1, 0, 1, 5, 2, 0,
       6, 2, 1, 5, 5, 5, 4, 4, 5, 3, 2, 6, 1, 5, 2, 0, 0, 4, 3, 2, 4, 3,
       3, 3, 6, 4, 6, 0, 0, 1, 2, 3, 3, 2, 0, 0, 6, 5, 2, 0, 1, 1, 3, 4,
       6, 2, 1, 5, 6, 1, 4, 3, 0, 0, 1, 1, 0, 2, 2, 3, 2, 6, 2, 1, 2, 3,
       6, 5, 4, 0, 1, 5, 1, 3, 2, 1, 5, 4, 1, 3, 1,

In [25]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from scipy.special import softmax
from tqdm import tqdm

#### Function for hyperparameter tuning

In [26]:
def hyperparameter_tuning(X_train, y_train, X_val, y_val, models_params, 
                          scoring='f1_weighted', n_iter=20, cv=3, random_state=42):

    results = []
    best_score = -np.inf
    best_model = None
    best_name = None

    for name, (model, param_dist) in tqdm(models_params.items(), desc="Tuning models"):
        print(f"\n Tuning {name}...")
        try:
            search = RandomizedSearchCV(
                model, param_distributions=param_dist,
                n_iter=n_iter, scoring=scoring, cv=cv,
                random_state=random_state, n_jobs=-1, verbose=0
            )
            
            search.fit(X_train, y_train)

            y_pred = search.predict(X_val)
            acc = accuracy_score(y_val, y_pred)
            f1 = f1_score(y_val, y_pred, average='weighted')
            
            y_pred_proba = None
            if hasattr(search.best_estimator_, "predict_proba"):
                y_pred_proba = search.predict_proba(X_val)
            elif hasattr(search.best_estimator_, "decision_function"):
                y_pred_proba = softmax(search.decision_function(X_val), axis=1)

            loss = log_loss(y_val, y_pred_proba) if y_pred_proba is not None else None

            results.append({
                'model': name,
                'best_params': search.best_params_,
                'accuracy': acc,
                'f1_score': f1,
                'log_loss': loss
            })

            if f1 > best_score:
                best_score = f1
                best_model = search.best_estimator_
                best_name = name

        except Exception as e:
            print(f" Error with {name}: {e}")
            results.append({
                'model': name,
                'best_params': None,
                'accuracy': None,
                'f1_score': None,
                'log_loss': None
            })

    results_df = pd.DataFrame(results).sort_values(by='f1_score', ascending=False)

    return best_model, best_score, results_df



In [27]:
models_params = {
    'LogisticRegression': (
        LogisticRegression(max_iter=1000, class_weight='balanced', solver='saga'),
        {
            'C': [0.01, 0.1, 1, 10, 100],
            'penalty': ['l1', 'l2']
        }
    ),
    'LinearSVC': (
        LinearSVC(max_iter=10000, class_weight='balanced'),
        {
            'C': [0.01, 0.1, 1, 10, 100]
        }
    ),
    # 'MultinomialNB' is removed because input features contain negative values
    'RandomForest': (
        RandomForestClassifier(class_weight='balanced', n_jobs=-1),
        {
            'n_estimators': [50, 100, 200],
            'max_depth': [5, 10, 20, None],
            'min_samples_split': [2, 5, 10]
        }
    ),
    'XGBoost': (
        xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
        {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 10],
            'learning_rate': [0.01, 0.1, 0.2],
            'subsample': [0.7, 0.8, 1.0]
        }
    )
}


In [28]:
best_model, best_score, results_df = hyperparameter_tuning(X_train_issue, y_issue_train_enc, X_test_issue, y_issue_test_enc, models_params)

print("\nModel Metrics:")
for idx, row in results_df.iterrows():
    print(f"Model: {row['model']}")
    print(f"  Accuracy: {row['accuracy']:.4f}")
    print(f"  F1 Score: {row['f1_score']:.4f}")
    print(f"  Log Loss: {row['log_loss']:.4f}" if row['log_loss'] is not None else "  Log Loss: N/A")
    print(f"  Best Params: {row['best_params']}\n")




 Tuning LogisticRegression...





 Tuning LinearSVC...


Tuning models:  50%|█████     | 2/4 [00:19<00:16,  8.01s/it]


 Tuning RandomForest...


Tuning models:  75%|███████▌  | 3/4 [00:25<00:07,  7.27s/it]


 Tuning XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Tuning models: 100%|██████████| 4/4 [00:48<00:00, 12.15s/it]


Model Metrics:
Model: XGBoost
  Accuracy: 0.9639
  F1 Score: 0.9638
  Log Loss: 0.1147
  Best Params: {'subsample': 0.7, 'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.1}

Model: RandomForest
  Accuracy: 0.9518
  F1 Score: 0.9518
  Log Loss: 0.2803
  Best Params: {'n_estimators': 100, 'min_samples_split': 2, 'max_depth': 20}

Model: LinearSVC
  Accuracy: 0.8795
  F1 Score: 0.8785
  Log Loss: 0.8721
  Best Params: {'C': 100}

Model: LogisticRegression
  Accuracy: 0.4940
  F1 Score: 0.4283
  Log Loss: 1.7128
  Best Params: {'penalty': 'l1', 'C': 100}






## Creating pipeline to train and predict the data

In [29]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.utils import simple_preprocess

In [31]:
def preprocess_series(series):
	# Replace this with your actual preprocessing logic
	return series.apply(lambda x: x.lower().split())

preprocess_transformer = joblib.load('../models/preprocess_transformer.pkl')

In [None]:
def preprocess_series(series):
	# Replace this with your actual preprocessing logic
	return series.apply(lambda x: x.lower().split())

preprocess_transformer.transform(pd.Series(['Payment issue for my SmartWatch V2. I was underbilled for order #29224']))

0    [payment, issue, for, my, smartwatch, v2., i, ...
dtype: object

In [None]:
custom_feature_transformer  = joblib.load('../models/custom_feature_transformer.pkl')

In [34]:
custom_feature_transformer .transform(pd.Series(['Payment issue for my SmartWatch V2.']))

Unnamed: 0,text_length,sentiment
0,35,0.0


In [35]:
# Word2Vec vectorizer
class Word2VecVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, model):
        self.model = model
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return np.vstack(X.apply(self._document_vector).values)
    
    def _document_vector(self, tokens):
        tokens = [t for t in tokens if t in self.model.wv]
        if len(tokens) == 0:
            return np.zeros(self.model.vector_size)
        return np.mean(self.model.wv[tokens], axis=0)

In [36]:
pipeline_w2v = Pipeline([
    ('tokens', preprocess_transformer),
    ('w2v', Word2VecVectorizer(model=w2v_model))
])

In [37]:
pipeline_custom_feats = Pipeline([
    ('custom_feats', custom_feature_transformer)
])

In [38]:
combined_features = FeatureUnion([
    ('w2v', pipeline_w2v),
    ('custom_feats', pipeline_custom_feats)
])

In [39]:
full_pipeline = Pipeline([
    ('features', combined_features),
    ('classifier', best_model)
])

In [40]:
# Fit the pipeline on raw ticket text, not on precomputed features
full_pipeline.fit(df['ticket_text'].loc[y_train_issue.index], y_issue_train_enc)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [41]:
joblib.dump(full_pipeline, '../pipeline/issue_type_pipeline.pkl')

['../pipeline/issue_type_pipeline.pkl']

In [42]:
loaded_pipeline = joblib.load('../pipeline/issue_type_pipeline.pkl')
label_encoder = joblib.load('../models/issue_encoder.pkl')

# Predict
preds = loaded_pipeline.predict(df['ticket_text'].loc[y_test_issue.index])
print(classification_report(y_test_issue, label_encoder.inverse_transform(preds)))



                    precision    recall  f1-score   support

    Account Access       0.85      0.85      0.85        27
   Billing Problem       0.86      0.92      0.89        26
   General Inquiry       0.92      0.96      0.94        25
Installation Issue       0.96      1.00      0.98        26
     Late Delivery       0.95      1.00      0.97        19
    Product Defect       1.00      0.77      0.87        22
        Wrong Item       0.81      0.81      0.81        21

          accuracy                           0.90       166
         macro avg       0.91      0.90      0.90       166
      weighted avg       0.91      0.90      0.90       166





In [43]:
pred = loaded_pipeline.predict(pd.Series(['Can you tell me more about the PhotoSnap Cam warranty? Also, is it available in red']))
final_preds = label_encoder.inverse_transform(pred)
print(final_preds)

['General Inquiry']


