In [1]:
import json
import numpy as np
import os
import pandas as pd
import pickle
import re
import unidecode


from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from typing import Callable
from xgboost import XGBClassifier

In [2]:
# Change the current working directory to the pachage root
# That's step is due to the way settings.py is defined
root_path_list = os.getcwd().split("\\")[:-1]
root_path = os.path.join(root_path_list[0], os.sep, *root_path_list[1:])
os.chdir(root_path)
os.getcwd()

'd:\\Projects\\text-classification'

In [3]:
# User parameters

data_folder = os.path.join(os.getcwd(), "data", "01_raw")
data_file = "complaints.csv"

test_size=0.2
seed=0

max_num_samples_per_class = 500 # 20000

# Data Loading

In [4]:
# Load dataset

data_path = os.path.join(data_folder, data_file)
data = pd.read_csv(data_path, header=0, sep=',', quotechar='"')
data.dropna(inplace=True)

data.head()

Unnamed: 0,id,category,narrative
0,0,credit_card,purchase order day shipping amount receive pro...
1,1,credit_card,forwarded message date tue subject please inve...
2,2,retail_banking,forwarded message cc sent friday pdt subject f...
3,3,credit_reporting,payment history missing credit report speciali...
4,4,credit_reporting,payment history missing credit report made mis...


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162411 entries, 0 to 162420
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   id         162411 non-null  int64 
 1   category   162411 non-null  object
 2   narrative  162411 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.0+ MB


In [6]:
data.groupby('category').agg(num_complaints=('id','count')).reset_index().sort_values(by='num_complaints')

Unnamed: 0,category,num_complaints
4,retail_banking,13535
0,credit_card,15566
3,mortgages_and_loans,18990
2,debt_collection,23148
1,credit_reporting,91172


In [7]:
print(data.narrative[1])

forwarded message date tue subject please investigate comenity bank retailer card scam sent hello name scammed comenity bank credit card provider company childrens place new york forever victoria secret original credit comenity bank lower limit began charge overage fee along late fee began pay close attention card find limit also changed well incurring overage late fee reached company comenity bank stated would change credit limit original limit reached told summit payment account corrected comenity bank credit card impacted credit score plummeted negative status im currently paying price due corruption affected detrimental way debt due company charging overage fee well late fee even initial credit limit fluctuating tremendously company charge major fee account willing correct account nervous said attorney reason im reaching im employee company ruining credit plz help name contact info thank


In [8]:
# Split data into X and y
X = np.array(data.narrative)
y = np.array(data.category)

# Data Preparation

In [9]:
# Data cleaning

def text_cleaner(X, min_len_clean_sentence=5):

  for i in range(len(X)):
    sentence = X[i]
    sentence = sentence.lower() # lowercase text
    sentence = unidecode.unidecode(sentence) # remove accents
    sentence = re.sub(r"\W"," ",sentence) # remove non letters and numbers
    sentence = re.sub(r"\d"," ",sentence) # remove numbers
    sentence = re.sub(r"\b[a-z]{1}\b"," ",sentence,flags=re.I) # remove word size 1
    sentence = re.sub(r"\b[a-z]{2}\b"," ",sentence,flags=re.I) # remove word size 2
    sentence = re.sub(r"\s+"," ",sentence) # remove adittional space between words
    sentence = re.sub(r"^\s+","",sentence) # remove adittional space in the begining
    sentence = re.sub(r"\s+$","",sentence) # remove adittional space in the end
    X[i] = sentence
  return X

X_clean = text_cleaner(X)

In [10]:
# Data Filtering

# Filter out samples without a minimum number of characters

In [11]:
# Taget Label Encoding

def label_encoder(x):
    encoder = LabelEncoder()
    x_encoded = encoder.fit_transform(x)
    return x_encoded, encoder

y_encoded, tle = label_encoder(y)

y_encoded[:5]

array([0, 0, 4, 1, 1])

In [12]:
# Data spliting

def data_spliter(X, y, test_size, random_state):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y, random_state=seed)
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = data_spliter(X_clean, y_encoded, test_size, seed)

np.unique(y_train, return_counts=True)


(array([0, 1, 2, 3, 4]),
 array([12453, 72937, 18518, 15192, 10828], dtype=int64))

In [13]:
# Class balacing

def random_under_sampler(X, y, max_num_samples_per_class,random_state=0):

    labels, counts = np.unique(y, return_inverse=False, return_counts=True)
    label_counts = dict(zip(range(len(labels)), [min(max_num_samples_per_class, count) for count in counts.tolist()]))
    rus = RandomUnderSampler(sampling_strategy=label_counts, random_state=random_state)

    try: 
        X_resampled, y_resampled = rus.fit_resample(X, y)
        return X_resampled, y_resampled
    except:
        X_resampled, y_resampled = rus.fit_resample(X.reshape(-1, 1), y)
        return X_resampled.reshape(-1, ), y_resampled

X_train_resampled, y_train_resampled = random_under_sampler(X_train, y_train, max_num_samples_per_class)

np.unique(y_train_resampled, return_counts=True)

(array([0, 1, 2, 3, 4]), array([500, 500, 500, 500, 500], dtype=int64))

# Training

In [14]:
# Create a text classification pipeline: vectorizer + classifier

pipe_steps = []

vectorizer_parameters = {
    "stop_words": "english", 
    "ngram_range": (1,1),
    "max_df": 0.75, 
    "min_df": 0.00, 
    "max_features": 7000 ,
    "binary": False, 
    "use_idf": True, 
    "norm": "l2",
}
vectorizer = TfidfVectorizer(**vectorizer_parameters)
pipe_steps.append(('vec', vectorizer))

classifier_parameters = {
    'n_estimators': 15,
    'max_depth': 5,
    'learning_rate': 0.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'colsample_bynode': 1,
    'reg_lambda': 0.5,
    'reg_alpha': 0,
    'seed': seed
}
classifier = XGBClassifier(**classifier_parameters)
pipe_steps.append(('clf', classifier))

pipeline = Pipeline(pipe_steps)

In [15]:
# Train the text classification pipeline

pipeline.fit(X_train_resampled, y_train_resampled)

# Evaluation

In [16]:
# Make predictions for train and test datasets

y_train_resampled_pred = pipeline.predict(X_train_resampled)
y_test_pred = pipeline.predict(X_test)

In [17]:
# Compute metrics

def classifier_metrics_report_generator(y_true, y_pred, ndigits=4):
  report = {}
  report["accuracy"] = round(accuracy_score(y_true, y_pred), ndigits)
  report["average_precision"] = round(precision_score(y_true, y_pred, average="weighted"), ndigits)
  report["average_recall"] = round(recall_score(y_true, y_pred, average="weighted"), ndigits)
  report["average_f1"] = round(f1_score(y_true, y_pred, average="weighted"), ndigits)
  return report

metrics_report_train = classifier_metrics_report_generator(y_train_resampled_pred, y_train_resampled)
metrics_report_test = classifier_metrics_report_generator(y_test, y_test_pred)

print("Train metric report:\n", json.dumps(metrics_report_train, indent = 4))
print("Test metric report:\n", json.dumps(metrics_report_test, indent = 4))

Train metric report:
 {
    "accuracy": 0.9172,
    "average_precision": 0.9178,
    "average_recall": 0.9172,
    "average_f1": 0.9173
}
Test metric report:
 {
    "accuracy": 0.7527,
    "average_precision": 0.7885,
    "average_recall": 0.7527,
    "average_f1": 0.7606
}


# Inference

In [18]:
x_i = [X_test[10]]
y_i_encoded = pipeline.predict(x_i)
y_i = tle.inverse_transform(y_i_encoded)
y_i[0]

'credit_card'

In [19]:
# Create a class to make predictions easier

class TextClassPredictor:
    """Class to predict text category using trained models.

    Args:
        text_cleaner: function to clean the text.
        target_encoder: pickle of the target label encoder.
        classfier: pickle of the classifier.
    """

    def __init__(self, text_cleaner: Callable, target_encoder: pickle, classfier: pickle):
        self.text_cleaner = text_cleaner
        self.target_encoder = target_encoder
        self.classfier = classfier

    def predict(self, x: str):
        """Predict text category.

        Args:
            x: text to classify its category - narrative

        Returns:
            response: dictonary with predicted category and its probabiltiy.
        """
        x_clean = self.text_cleaner([x])
        y_encoded = self.classfier.predict(x_clean)
        y_pred = self.target_encoder.inverse_transform(y_encoded)[0]
        y_proba = max(self.classfier.predict_proba([x])[0])
        response = {"y_pred": y_pred, "y_proba": y_proba}
        return response


tcp = TextClassPredictor(text_cleaner=text_cleaner, target_encoder=tle, classfier=pipeline)

x_i = X_test[16]
y_i = tcp.predict(x_i)
y_i

{'y_pred': 'mortgages_and_loans', 'y_proba': 0.9571526}