In [30]:
import numpy as np
import pandas as pd
import re
import string
import time
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [2]:
# Load data: read the labelled complaints and keep only the narrative and topic columns
df = pd.read_csv("../data/processed/labelled_texts.csv")[
    ['complaint_what_happened', 'topic']
]

# Util Functions for Text Preprocessing

In [7]:
# Functions
def remove_punctuations(text: str) -> str:
    """Replace every ASCII punctuation character in a text with a space.

    Args:
        text (str): input text.
    Returns:
        str: the text in which each character of ``string.punctuation`` has
            been swapped for one space (so the length is unchanged).
    """
    punctuation_class = re.compile(f'[{re.escape(string.punctuation)}]')
    return punctuation_class.sub(' ', text)

def remove_numbers(text: str) -> str:
    """Replace every ASCII digit in a text with a space.

    Args:
        text (str): input text.
    Returns:
        str: the text in which each digit ``0``-``9`` has been swapped for one
            space (so the length is unchanged).
    """
    digit = re.compile(r'[0-9]')
    return digit.sub(' ', text)

def remove_confidential_information(text: str) -> str:
    """Blank out masked tokens made up solely of the letter ``x``/``X``.

    Args:
        text (str): input text.
    Returns:
        str: the text in which every stand-alone run of ``x``/``X`` characters
            (e.g. ``XXXX``) has been replaced by a single space. Words that
            merely contain an ``x`` are left untouched.
    """
    masked_token = re.compile(r'\b[Xx]{1,}\b')
    return masked_token.sub(' ', text)

def remove_extra_spaces(text: str) -> str:
    """Collapse each run of whitespace (spaces, tabs, new lines) into one space.

    Args:
        text (str): input text.
    Returns:
        str: the text with every whitespace run replaced by a single space.
            Leading and trailing whitespace is collapsed, not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

tokenizer = get_tokenizer('basic_english')
stop_words = stopwords.words('english')
# Snapshot of ``stop_words`` as a set: each membership test is O(1) instead of a
# scan over the whole list for every token. Rebuild it if ``stop_words`` changes.
_stop_word_set = frozenset(stop_words)
def remove_stopwords(text: str) -> str:
    """Remove English stop words from a text.

    The text is split with the module-level ``tokenizer``; tokens found in
    NLTK's English stop-word list are dropped and the remaining tokens are
    joined back with single spaces.

    Args:
        text (str): text

    Returns:
        str: text with stop words removed
    """
    tokens = tokenizer(text)
    return ' '.join(token for token in tokens if token not in _stop_word_set)

# source: https://www.ibm.com/topics/stemming-lemmatization#:~:text=The%20practical%20distinction%20between%20stemming,be%20found%20in%20the%20dictionary.
def get_wordnet_pos(tag: str) -> str:
    """Translate a POS tag into the WordNet constant used for lemmatization.

    Only the first character of the tag is inspected: ``J`` maps to adjective,
    ``V`` to verb and ``R`` to adverb. Every other tag, including those
    starting with ``N``, maps to noun.

    Args:
        tag (str): POS tag, as produced by ``nltk.pos_tag``

    Returns:
        str: WordNet part-of-speech constant for the lemmatizer
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    return prefix_to_pos.get(tag[:1], wordnet.NOUN)
    
def lemmatize(text: str) -> str:
    """Lemmatize every token of a text with NLTK's WordNetLemmatizer.

    The text is tokenized with the module-level ``tokenizer`` and tagged with
    ``pos_tag``; each token is then lemmatized using the WordNet part of speech
    derived from its tag by ``get_wordnet_pos``.

    Args:
        text (str): text to lemmatize

    Returns:
        str: the lemmatized tokens joined by single spaces
    """
    tagged_tokens = pos_tag(tokenizer(text))
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word, word_tag in tagged_tokens:
        lemmas.append(wordnet_lemmatizer.lemmatize(word, get_wordnet_pos(word_tag)))
    return ' '.join(lemmas)


In [32]:
# Create text preprocessing pipeline
def process_text(text: str) -> str:
    """Run the full cleaning and normalization pipeline on one text.

    The text is lower-cased and then passed, in order, through digit removal,
    punctuation removal, removal of ``x``-masked tokens, whitespace collapsing,
    stop-word removal and lemmatization.

    Args:
        text (str): raw text

    Returns:
        str: cleaned, stop-word-free, lemmatized text
    """
    # Built on every call so a re-run of a helper's cell is picked up immediately.
    steps = (
        # Text cleaning:
        remove_numbers,
        remove_punctuations,
        remove_confidential_information,
        remove_extra_spaces,
        # Text transformation:
        remove_stopwords,
        lemmatize,
    )
    result = text.lower()
    for step in steps:
        result = step(result)
    return result

In [24]:
# Text Preprocessing
# Time only the preprocessing itself, keep the result for reuse, and show a small
# sample instead of dumping every processed complaint into the notebook output.
start = time.perf_counter()
processed_texts = [process_text(text) for text in df['complaint_what_happened'].tolist()]
stop = time.perf_counter()
display(processed_texts[:5])
print(f"Time it took was {stop - start:.2f} s")

['good morning name appreciate could help put stop chase bank cardmember service write chase ask debt verification send statement acceptable ask bank validate debt instead receive mail every month attempt collect debt right know information consumer chase account thanks advance help',
 'upgraded card tell agent upgrade anniversary date would change turned agent give wrong information order upgrade account change anniversary date without consent record agent mislead',
 'chase card report however fraudulent application submit identity without consent fraudulently obtain service extend credit without verify identity applicant',
 'try book ticket come across offer apply towards ticket apply reward card put information offer within less minute notified via screen decision could make immediately contact referred chase bank immediately contact chase bank within minute get notification screen tell chase representative spoke application deny could state ask information offer explain even approv

Time it took was 106.65 s


# Dataset Class

In [29]:
# Scratch check: draw a random tensor of shape (1, 21); the result is only displayed.
torch.rand(1, 21)

tensor([[0.5238, 0.8898, 0.6027, 0.1141, 0.7992, 0.9660, 0.4937, 0.2413, 0.0340,
         0.8421, 0.5325, 0.6562, 0.7827, 0.2183, 0.7477, 0.7465, 0.9991, 0.8233,
         0.5981, 0.4954, 0.3899]])

In [158]:
# Set up mapping for label: the position of each topic in this list is its class index
label_to_ix = {
    label: ix
    for ix, label in enumerate([
        'Bank Account services',
        'Credit card or prepaid card',
        'Mortgage/Loan',
        'Theft/Dispute Reporting',
        'Others',
    ])
}

# Inverse lookup, derived from label_to_ix so the two mappings always agree
ix_to_label = {ix: label for label, ix in label_to_ix.items()}

def encode_label(label: str) -> int:
    """Map a topic name to its integer class index.

    Args:
        label (str): topic name; must be a key of ``label_to_ix``.

    Returns:
        int: class index of the topic.

    Raises:
        KeyError: if ``label`` is not a known topic. The message lists the
            valid topic names.
    """
    try:
        return label_to_ix[label]
    except KeyError:
        raise KeyError(
            f"Unknown topic label {label!r}; expected one of {sorted(label_to_ix)}"
        ) from None

def encode_text(text: str, word_to_ix: dict):
    """Convert a text into a list of vocabulary indices.

    Args:
        text (str): text to encode; it is split with the module-level ``tokenizer``.
        word_to_ix (dict): mapping from token to integer index.

    Returns:
        list: one index per token, in order; tokens missing from
            ``word_to_ix`` are encoded as 0.
    """
    indices = []
    for word in tokenizer(text):
        indices.append(word_to_ix.get(word, 0))
    return indices


class CustomerComplaintsDataset(Dataset):
    """Dataset of index-encoded complaint texts with fixed-length sequences.

    Each item is a ``(tokens, label)`` pair where ``tokens`` is a LongTensor of
    exactly ``max_length`` indices: shorter sequences are right-padded with 0
    and longer ones are truncated.
    """

    def __init__(self, encoded_texts: list, encoded_labels: list, max_length: int) -> None:
        """Store the encoded samples.

        Args:
            encoded_texts (list): one list of token indices per sample.
            encoded_labels (list): one class index per sample.
            max_length (int): length every returned sequence is padded or cut to.
        """
        super().__init__()
        self.encoded_texts = encoded_texts
        self.encoded_labels = encoded_labels
        self.max_length = max_length

    def padding(self, tokens: list, max_length: int):
        """Right-pad with 0 or truncate ``tokens`` to exactly ``max_length`` entries.

        Args:
            tokens (list): token indices of one sample.
            max_length (int): target length.

        Returns:
            np.ndarray: int64 array holding the padded or truncated indices.
        """
        clipped = list(tokens[:max_length])
        # Pad with integer zeros so the array is int64 in both the padded and the
        # truncated case (float zeros would turn the whole row into float64).
        clipped.extend([0] * (max_length - len(clipped)))
        return np.array(clipped, dtype=np.int64)

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.encoded_labels)

    def __getitem__(self, idx):
        """Return the padded token tensor and the label of sample ``idx``."""
        encoded_text = self.padding(self.encoded_texts[idx], self.max_length)
        label = self.encoded_labels[idx]
        return torch.LongTensor(encoded_text), label
    
# from torch.nn.utils.rnn import pad_sequence

# def collate_fn(batch):
#     texts, labels = zip(*batch)
    
#     # Pad sequences to the same length
#     padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
#     return padded_texts, torch.tensor(labels, dtype=int)


# Combine it all
def text_preprocessing_pipeline(texts, labels, batch_size=64, shuffle=True):
    """Turn raw complaint texts and topic labels into a padded, batched DataLoader.

    Args:
        texts: iterable of raw complaint strings.
        labels: iterable of topic names (keys of ``label_to_ix``), aligned with ``texts``.
        batch_size (int): number of samples per batch. Defaults to 64.
        shuffle (bool): whether the DataLoader shuffles the samples. Defaults to True.

    Returns:
        tuple: ``(dataloader, vocabulary, max_length)`` where ``vocabulary`` is the
        set of distinct tokens and ``max_length`` is the largest token count of any
        text (0 when there are no texts). Token indices are assigned as
        ``1 + position in sorted(vocabulary)``; index 0 is left for padding and
        unknown tokens.

    Raises:
        ValueError: if ``texts`` and ``labels`` do not have the same length.
    """
    # Process the text
    processed_texts = list(map(process_text, texts))
    # Encode labels:
    encoded_labels = [encode_label(label) for label in labels]
    if len(processed_texts) != len(encoded_labels):
        raise ValueError(
            f"texts and labels must have the same length, "
            f"got {len(processed_texts)} and {len(encoded_labels)}"
        )

    # Tokenize once and reuse the tokens for the vocabulary, the maximum length and
    # the encoding, so all three are guaranteed to see the same tokens.
    tokenized_texts = [tokenizer(text) for text in processed_texts]

    # Generate vocabulary:
    vocabulary = {token for tokens in tokenized_texts for token in tokens}
    # Sort before numbering: set iteration order of strings changes between kernel
    # restarts, which would otherwise give every run a different token -> index map.
    word_to_ix = {word: ix for ix, word in enumerate(sorted(vocabulary), start=1)}

    # Calculate maximum length
    max_length = max((len(tokens) for tokens in tokenized_texts), default=0)

    # Encode texts (every token is in the vocabulary by construction):
    encoded_texts = [[word_to_ix[token] for token in tokens] for tokens in tokenized_texts]

    dataset = CustomerComplaintsDataset(encoded_texts, encoded_labels, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return dataloader, vocabulary, max_length

In [159]:
# Build the loader from the first 500 complaints only
dataloader, vocabulary, max_length = text_preprocessing_pipeline(
    texts=df['complaint_what_happened'].tolist()[:500],
    labels=df['topic'].tolist()[:500],
)

In [160]:
# Sanity check: print the feature and label shape of every batch
for features, labels in dataloader:
    print(features.shape, labels.shape, sep='\n')

torch.Size([64, 2843])
torch.Size([64])
torch.Size([64, 2843])
torch.Size([64])
torch.Size([64, 2843])
torch.Size([64])
torch.Size([64, 2843])
torch.Size([64])
torch.Size([64, 2843])
torch.Size([64])
torch.Size([64, 2843])
torch.Size([64])
torch.Size([64, 2843])
torch.Size([64])
torch.Size([52, 2843])
torch.Size([52])


In [161]:
# +1 because token indices start at 1; index 0 is the padding / unknown-token slot
vocab_size = len(vocabulary) + 1
embedding_size = 128
# Targets are produced by label_to_ix, so the output layer must have one unit per
# entry of that mapping. Counting the topics present in df can give a different
# number (missing topics, NaN) and leave target indices out of range.
num_classes = len(label_to_ix)

# Neural Network Class

In [162]:
class CustomerComplaintClassifier(torch.nn.Module):
    """Embedding -> flatten -> linear classifier over fixed-length token sequences.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_size: dimension of each token embedding.
        num_classes: number of output scores per sample.
        sequence_length: number of tokens in every input sequence; it fixes the
            input width of the linear layer (``embedding_size * sequence_length``).
            When omitted, the notebook-level ``max_length`` variable is used so
            existing three-argument calls keep working.
    """

    def __init__(self, vocab_size, embedding_size, num_classes, sequence_length=None):
        super(CustomerComplaintClassifier, self).__init__()
        if sequence_length is None:
            # Backward-compatible fallback to the global set by the preprocessing cell.
            sequence_length = max_length
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        self.flatten = torch.nn.Flatten()
        self.fc = torch.nn.Linear(embedding_size * sequence_length, num_classes)

    def forward(self, text):
        """Return the class scores for a batch of token-index sequences.

        Args:
            text: integer tensor of token indices, one row per sample, each row
                ``sequence_length`` long.

        Returns:
            Tensor with one row per sample and ``num_classes`` columns.
        """
        # Call the layers directly instead of rebuilding a Sequential on every pass.
        embedded = self.embedding(text)
        return self.fc(self.flatten(embedded))

In [163]:
# Instantiate the classifier from the hyper-parameters defined above
model = CustomerComplaintClassifier(
    vocab_size=vocab_size,
    embedding_size=embedding_size,
    num_classes=num_classes,
)

In [164]:
lr = 1e-3
num_epochs = 10
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# One entry per epoch: the sum of the loss values of all batches in that epoch
losses = []

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for texts, labels in dataloader:
        # Reset gradients. The optimizer was built from model.parameters(), so this
        # clears every model gradient; a separate model.zero_grad() is redundant.
        optimizer.zero_grad()

        # Produce model output
        outputs = model(texts)

        # Compute the loss
        loss = criterion(outputs, labels)

        # Backpropagation
        loss.backward()

        # Update parameters
        optimizer.step()
        total_loss += loss.item()
    losses.append(total_loss)

In [165]:
# Per-epoch training loss: each entry is the sum of that epoch's batch losses
losses

[2055.0043954849243,
 906.358283996582,
 625.0599822998047,
 395.15110969543457,
 324.28125190734863,
 220.6447949409485,
 100.28202056884766,
 95.68417572975159,
 142.01598179340363,
 39.23400700092316]