In [38]:
import numpy as np
import pandas as pd
import re
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

In [39]:
# Load data
# Read the labelled complaints CSV (path is relative to the notebook's working directory).
df = pd.read_csv("../data/processed/labelled_texts.csv")
# Keep only the free-text complaint column and its topic label.
df = df[['complaint_what_happened', 'topic']]
# Preview the first rows.
df.head()

Unnamed: 0,complaint_what_happened,topic
0,Good morning my name is XXXX XXXX and I apprec...,Bank Account services
1,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card
2,Chase Card was reported on XX/XX/2019. However...,Credit card or prepaid card
3,"On XX/XX/2018, while trying to book a XXXX XX...",Credit card or prepaid card
4,my grand son give me check for {$1600.00} i de...,Bank Account services


In [40]:
# Topic name -> integer class id; ids follow the order in which topics are listed.
label_to_id = {
    topic: class_id
    for class_id, topic in enumerate([
        'Bank Account services',
        'Credit card or prepaid card',
        'Mortgage/Loan',
        'Theft/Dispute Reporting',
        'Others',
    ])
}

# Util Functions for Text Preprocessing

In [41]:
# Functions
def remove_numbers_punctuations(text: str) -> str:
    """Blank out every character that is not an ASCII letter.

    Digits, punctuation and whitespace characters are each replaced by one
    space, character for character; runs of spaces are not collapsed here.

    Args:
        text (str): input text.
    Returns:
        str: text in which every non-letter character has become a space.
    """
    non_letter = re.compile(r'[^a-zA-Z]')
    return non_letter.sub(' ', text)

def remove_word_numbers(text: str) -> str:
    """Replace every word that contains at least one digit with a space.

    A "word" here is a maximal run of ``\\w`` characters delimited by word
    boundaries, so pure numbers ("1600") and mixed tokens ("acct12") are
    both replaced, while surrounding punctuation is left untouched.

    Args:
        text (str): input text.
    Returns:
        str: text with digit-containing words replaced by a single space each.
    """
    word_with_digit = re.compile(r'\b\w*\d\w*\b')
    return word_with_digit.sub(' ', text)

def remove_confidential_information(text: str) -> str:
    """Replace redaction placeholders (words made only of 'X'/'x') with a space.

    Only whole words consisting entirely of the letter X, in either case,
    are replaced; words that merely contain an X are kept.

    Args:
        text (str): input text.
    Returns:
        str: text with each all-X word replaced by a single space.
    """
    masked_word = re.compile(r'\b[Xx]{1,}\b')
    return masked_word.sub(' ', text)

def remove_extra_spaces(text: str) -> str:
    """Collapse each run of whitespace (spaces, tabs, new lines) into one space.

    Leading and trailing whitespace is collapsed to a single space as well;
    it is not stripped.

    Args:
        text (str): input text.
    Returns:
        str: text in which every whitespace run is a single space.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

stop_words = stopwords.words('english')
# Set view of ``stop_words`` used for lookups: membership in a list is a
# linear scan per token, membership in a set is a hash lookup.
# NOTE: built once from ``stop_words``; rebuild it if that list is modified.
_stop_word_set = frozenset(stop_words)


def remove_stopwords(text: str) -> str:
    """Remove stop words from text.

    The text is tokenized with ``word_tokenize`` and every token that exactly
    matches (case-sensitively) an entry of ``stop_words`` is dropped.

    Args:
        text (str): text

    Returns:
        str: remaining tokens joined by single spaces
    """
    tokens = word_tokenize(text)
    return " ".join(token for token in tokens if token not in _stop_word_set)

# source: https://www.ibm.com/topics/stemming-lemmatization#:~:text=The%20practical%20distinction%20between%20stemming,be%20found%20in%20the%20dictionary.
def get_wordnet_pos(tag: str) -> str:
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The decision is made on the tag's first letter: J -> adjective,
    V -> verb, N -> noun, R -> adverb. Any other tag falls back to noun.

    Args:
        tag (str): POS tag name

    Returns:
        str: WordNet constant to pass to the lemmatizer
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN
    
def lemmatize(text: str) -> str:
    """Lemmatize every token of a text with WordNetLemmatizer.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    each token is then lemmatized using the WordNet part of speech that
    ``get_wordnet_pos`` derives from its tag.

    Args:
        text (str): text to lemmatize

    Returns:
        str: lemmatized tokens joined by single spaces
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    wnl = WordNetLemmatizer()
    lemmas = []
    for word, tag_name in tagged_tokens:
        lemmas.append(wnl.lemmatize(word, get_wordnet_pos(tag_name)))
    return ' '.join(lemmas)

def preprocess_texts(texts: list) -> tuple:
    """Clean, normalise and lemmatize a list of raw texts.

    Each text goes through, in order: removal of words containing digits,
    replacement of non-letter characters, removal of X-only (redacted)
    words, whitespace collapsing, lower-casing, stop-word removal and
    lemmatization.

    A text that raises during processing is reported with ``print`` and
    replaced by an empty string instead of being dropped, so that
    ``processed_texts[i]`` always corresponds to ``texts[i]``. Callers pair
    the result with a parallel list of labels, and dropping an entry would
    shift every following text onto the wrong label.

    Args:
        texts (list): raw texts.

    Returns:
        tuple: ``(processed_texts, vocabulary, max_length)`` where
            ``processed_texts`` is a list of cleaned strings with the same
            length as ``texts``, ``vocabulary`` is the sorted list of
            distinct tokens, and ``max_length`` is the largest token count
            of any processed text (0 when there are no tokens at all).
    """
    processed_texts = []
    vocabulary = set()
    max_length = 0
    for text in texts:
        try:
            text = remove_word_numbers(text)  # Remove word containing numbers
            text = remove_numbers_punctuations(text)  # Remove punctuations and numbers
            text = remove_confidential_information(text)  # Remove confidential information
            text = remove_extra_spaces(text)  # Remove extra spaces
            text = text.lower()  # Convert to lower case letters
            text = remove_stopwords(text)  # Remove stop words
            text = lemmatize(text)  # Perform lemmatization
        except Exception as e:
            # Best effort: report the failure, then keep an empty placeholder
            # so positions stay aligned with the input (and its labels).
            print(e)
            print(text)
            text = ""
        words = text.split()
        processed_texts.append(text)
        vocabulary.update(words)
        max_length = max(max_length, len(words))

    # Sorted so that word -> index mappings built from it are reproducible.
    return processed_texts, sorted(vocabulary), max_length

def encode_texts(texts: list, word_to_idx: dict):
    """Convert whitespace-separated texts into lists of word indices.

    Args:
        texts (list): texts whose words are separated by whitespace.
        word_to_idx (dict): mapping from word to integer index.

    Returns:
        list: one list of indices per text; words missing from
            ``word_to_idx`` are encoded as 0.
    """
    return [
        [word_to_idx.get(token, 0) for token in sentence.split()]
        for sentence in texts
    ]

# Dataset Class

In [80]:

# Set up map for label encoding
# NOTE: this repeats the label_to_id mapping defined in an earlier cell;
# keep both in sync (or drop one) when the topic list changes.
label_to_id = dict(zip(
    [
        'Bank Account services',
        'Credit card or prepaid card',
        'Mortgage/Loan',
        'Theft/Dispute Reporting',
        'Others',
    ],
    range(5),
))


def encode_labels(labels: list):
    """Map topic names to their integer class ids via ``label_to_id``.

    Args:
        labels (list): topic names; each must be a key of ``label_to_id``.

    Returns:
        list: integer class ids, in the same order as ``labels``.

    Raises:
        KeyError: if a label is not present in ``label_to_id``.
    """
    encoded = []
    for name in labels:
        encoded.append(label_to_id[name])
    return encoded


class CustomerComplaintsDataset(Dataset):
    """Dataset of preprocessed complaint texts paired with integer labels.

    Each item is ``(token_ids, label)`` where ``token_ids`` is a 1-D int64
    tensor of length ``max_length``: the words of the text mapped to
    vocabulary indices, truncated or right-padded with 0. Index 0 is used
    both for padding and for words missing from the vocabulary; real words
    are numbered from 1.
    """

    def __init__(self, texts: list, encoded_labels: list, vocabulary: set, max_length: int) -> None:
        """Store the data and build the word -> index mapping.

        Args:
            texts (list): preprocessed, whitespace-separated texts.
            encoded_labels (list): integer label for each text.
            vocabulary (set): known words. An ordered sequence (e.g. a
                sorted list) is used in the order given; a set is sorted
                first so the resulting indices are reproducible.
            max_length (int): fixed length of every encoded text.
        """
        super().__init__()
        self.texts = texts
        self.encoded_labels = encoded_labels
        # Set iteration order is not stable between interpreter runs, which
        # would silently change every word index; sort to make it stable.
        if isinstance(vocabulary, (set, frozenset)):
            vocabulary = sorted(vocabulary)
        self.word_to_idx = {word: idx + 1 for idx, word in enumerate(vocabulary)}
        self.max_length = max_length

    def padding(self, tokens: list):
        """Truncate or right-pad ``tokens`` with 0 to ``self.max_length``.

        Args:
            tokens (list): integer token ids.

        Returns:
            np.ndarray: int64 array holding the fixed-length id sequence.
        """
        fixed = list(tokens[:self.max_length])
        # Pad with integer zeros (not np.zeros floats) so the array stays
        # integral instead of being promoted to float64.
        fixed.extend([0] * (self.max_length - len(fixed)))
        return np.array(fixed, dtype=np.int64)

    def __len__(self):
        """Return the number of samples (one per encoded label)."""
        return len(self.encoded_labels)

    def __getitem__(self, idx):
        """Return ``(token_ids, label)`` for the sample at position ``idx``."""
        token_ids = [self.word_to_idx.get(word, 0) for word in self.texts[idx].split()]
        features = self.padding(token_ids)
        label = self.encoded_labels[idx]
        return torch.as_tensor(features, dtype=torch.long), label
    
# from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Merge a list of ``(text_tensor, label)`` samples into batch tensors.

    Args:
        batch: iterable of ``(text_tensor, label)`` pairs.

    Returns:
        tuple: ``(padded_texts, labels)`` where ``padded_texts`` stacks the
            text tensors batch-first, right-padding shorter ones with 0,
            and ``labels`` is an integer tensor of the labels.
    """
    sequences, targets = zip(*batch)

    # Bring every sequence to the length of the longest one in the batch.
    text_batch = pad_sequence(sequences, batch_first=True, padding_value=0)
    label_batch = torch.tensor(targets, dtype=int)
    return text_batch, label_batch


# Combine it all
def text_preprocessing_pipeline(texts, labels, batch_size=64, shuffle=True):
    """Turn raw texts and topic labels into a ready-to-iterate DataLoader.

    Runs ``preprocess_texts`` on the texts, ``encode_labels`` on the labels,
    wraps both in a ``CustomerComplaintsDataset`` and builds a ``DataLoader``
    that batches with ``collate_fn``.

    Args:
        texts: raw complaint texts.
        labels: topic name for each text.
        batch_size (int): samples per batch (default 64, the value that was
            previously hard-coded).
        shuffle (bool): whether the DataLoader reshuffles every epoch
            (default True, as before); pass False for evaluation data.

    Returns:
        tuple: ``(dataloader, vocabulary, max_length)``.
    """
    processed_texts, vocabulary, max_length = preprocess_texts(texts)
    encoded_labels = encode_labels(labels)
    dataset = CustomerComplaintsDataset(processed_texts, encoded_labels, vocabulary, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
    return dataloader, vocabulary, max_length

In [93]:
# Build the DataLoader from the first 1000 complaints and their topics.
# `max_length` defined here is also read as a global by CustomerComplaintClassifier.__init__.
dataloader, vocabulary, max_length = text_preprocessing_pipeline(df['complaint_what_happened'].tolist()[:1000], df['topic'].tolist()[:1000])

In [94]:
# Fetch a single batch from the dataloader.
# Note: `texts` and `labels` are rebound on every iteration of the training loop below.
texts, labels = next(iter(dataloader))

In [95]:
# Model hyper-parameters.
# +1 because word indices start at 1; index 0 is used for padding and unknown words.
vocab_size = len(vocabulary) + 1
# Dimension of each word embedding vector.
embedding_size = 32
# One output class per entry of label_to_id.
num_class = 5

# Neural Network Class

In [96]:
class CustomerComplaintClassifier(torch.nn.Module):
    """Embedding -> flatten -> linear classifier over fixed-length token ids.

    The input is an integer tensor of shape ``(batch, sequence_length)``.
    Every token is embedded, the per-token embeddings are flattened into one
    vector per sample, and a linear layer maps that vector to class scores.
    """

    def __init__(self, vocab_size, embedding_size, num_classes, sequence_length=None):
        """Create the layers.

        Args:
            vocab_size: number of rows of the embedding table.
            embedding_size: dimension of each token embedding.
            num_classes: number of output classes.
            sequence_length: number of tokens in every input sequence; it
                fixes the input width of the linear layer. When omitted, the
                notebook-level global ``max_length`` is used, as before.
        """
        super().__init__()
        if sequence_length is None:
            # Backward-compatible fallback to the notebook global.
            sequence_length = max_length
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        self.flatten = torch.nn.Flatten()
        self.fc = torch.nn.Linear(embedding_size * sequence_length, num_classes)

    def forward(self, text):
        """Return class scores of shape ``(batch, num_classes)`` for ``text``."""
        # Call the registered layers directly rather than rebuilding a
        # Sequential container on every forward pass.
        embedded = self.embedding(text)
        return self.fc(self.flatten(embedded))


In [97]:
# Instantiate the classifier; the input width of its linear layer also depends on the global `max_length`.
model = CustomerComplaintClassifier(vocab_size, embedding_size, num_class)

In [98]:
# Training configuration.
model.train()  # switch the model to training mode
lr = 1e-3
num_epochs = 10
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# Loss of every mini-batch, in training order.
losses = []

for epoch in range(num_epochs):
    for texts, labels in dataloader:
        # The optimizer holds all of the model's parameters, so one reset
        # clears every gradient; a separate model.zero_grad() is redundant.
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

In [99]:
# Loss of the last mini-batch of the final epoch (a single-batch value, not an epoch average).
loss

tensor(0.2867, grad_fn=<NllLossBackward0>)