In [8]:
import numpy as np
import pandas as pd
import re
from nltk import word_tokenize, pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer

In [5]:
# Load data
df = pd.read_csv("../data/processed/labelled_texts.csv")
df = df.loc[:100, ['complaint_what_happened', 'topic']]
df.head()

Unnamed: 0,complaint_what_happened,topic
0,Good morning my name is XXXX XXXX and I apprec...,Bank Account services
1,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card
2,Chase Card was reported on XX/XX/2019. However...,Credit card or prepaid card
3,"On XX/XX/2018, while trying to book a XXXX XX...",Credit card or prepaid card
4,my grand son give me check for {$1600.00} i de...,Bank Account services


In [6]:
# Set up map for label
label_to_id = {
    'Bank Account services': 0, 
    'Credit card or prepaid card': 1,
    'Mortgage/Loan': 2, 
    'Theft/Dispute Reporting': 3, 
    'Others': 4
}

In [None]:
# source: https://www.ibm.com/topics/stemming-lemmatization#:~:text=The%20practical%20distinction%20between%20stemming,be%20found%20in%20the%20dictionary.
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:         
        return wordnet.NOUN
       
def lemmatize(text):
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    lemmatized_sentence = ' '.join(lemmatized_words)
    return lemmatized_sentence

In [None]:
text = df.loc[0, 'complaint_what_happened']
print(f"Previous text: {text}")

stop_words = stopwords.words('english')
# Text preprocessing
text = re.sub(r'[^a-zA-z]', ' ', text) # Remove punctuations and numbers
text = re.sub(r'\b[Xx]{1,}\b', ' ', text) # Remove confidential information
text = re.sub(r'\s+', ' ', text) # Remove extra spaces
text = text.lower() # Convert to lower case letters
tokens = text.split()
tokens = [token for token in tokens if token not in stop_words]
text = ' '.join(tokens)
text = lemmatize(text)

print(f"Preprocessed text: {text}")

# Dataset Class

In [9]:
tokenizer = get_tokenizer('basic_english')

In [14]:
# Functions
def remove_numbers_punctuations(text: str) -> str:
    """Remove numbers and punctuations from a text.
    
    Args:
        text (str): text.
    Returns:
        str: text with removed punctuations.
    """
    pattern = r'[^a-zA-Z]'
    return re.sub(pattern, ' ', text)

def remove_word_numbers(text: str) -> str:
    """Remove punctuations from a text.
    
    Args:
        text (str): text.
    Returns:
        str: text with removed punctuations.
    """
    pattern = r'\b\w*\d\w*\b'
    return re.sub(pattern, ' ', text)

def remove_confidential_information(text: str) -> str:
    """Remove confidential information from a text.
    
    Args:
        text (str): text.
    Returns:
        str: text with removed confidential information.
    """
    pattern = r'\b[Xx]{1,}\b'
    return re.sub(pattern, ' ', text)

def remove_extra_spaces(text: str) -> str:
    """Remove extra spaces or new lines from a text.
    
    Args:
        text (str): text.
    Returns:
        str: text with removed extra spaces or new lines.
    """
    pattern = r'\s+'
    return re.sub(pattern, ' ', text)

stop_words = stopwords.words('english')
def remove_stopwords(tokens: list) -> list:
    return [token for token in tokens if token not in stop_words]

# source: https://www.ibm.com/topics/stemming-lemmatization#:~:text=The%20practical%20distinction%20between%20stemming,be%20found%20in%20the%20dictionary.
def get_wordnet_pos(tag: str) -> str:
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:         
        return wordnet.NOUN
       
def lemmatize(tokens: list) -> list:
    pos_tags = pos_tag(tokens)
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token, get_wordnet_pos(tag)) for token, tag in pos_tags]

def preprocess_texts(texts: list) -> list:
    processed_texts = []
    vocabulary = set()
    for text in texts:
        try:
            text = remove_word_numbers(text) # Remove word containing numbers
            text = remove_numbers_punctuations(text) # Remove punctuations and numbers
            text = remove_confidential_information(text) # Remove confidential information
            text = remove_extra_spaces(text) # Remove extra spaces
            text = text.lower() # Convert to lower case letters
            tokens = tokenizer(text)
            tokens = remove_stopwords(tokens) # Remove stop words
            tokens = lemmatize(tokens) # Perform lemmatization
            text = ' '.join(tokens)
            processed_texts.append(text) # Append list
            vocabulary.update(text.split())
        except Exception as e:
            print(e)
            print(text)
    
    # Word to index mapping
    sorted(vocabulary)
    word_to_idx = {word: idx+1 for idx, word in enumerate(vocabulary)} 
    return processed_texts, word_to_idx

def encode_texts(texts: list, word_to_idx: dict):
    encoded_texts = []
    for text in texts:
        encoded_texts.append([word_to_idx.get(word, 0) for word in text.split()])
    return encoded_texts

# Set up map for label encoding
label_to_id = {
    'Bank Account services': 0, 
    'Credit card or prepaid card': 1,
    'Mortgage/Loan': 2, 
    'Theft/Dispute Reporting': 3, 
    'Others': 4
}
from sklearn.preprocessing import OneHotEncoder
def encode_labels(labels: list):
    return [label_to_id[label] for label in labels]


class CustomerComplaintsDataset(Dataset):
    def __init__(self, encoded_texts: list, encoded_labels: list) -> None:
        super().__init__()
        self.encoded_texts = encoded_texts
        self.encoded_labels = encoded_labels
    
    def __len__(self):
        return len(self.encoded_texts)

    def __getitem__(self, idx):
        text = torch.LongTensor(self.encoded_texts[idx])
        label = torch.Tensor([int(self.encoded_labels[idx])])
        return text, label
    
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    texts, labels = zip(*batch)
    
    # Pad sequences to the same length
    padded_texts = pad_sequence(texts, batch_first=True, padding_value=0)
    return padded_texts, labels


# Combine it all
def text_preprocessing_pipeline(texts, labels):
    processed_texts, word_to_idx = preprocess_texts(texts)
    encoded_texts = encode_texts(processed_texts, word_to_idx)
    encoded_labels = encode_labels(labels)
    dataset = CustomerComplaintsDataset(encoded_texts, encoded_labels)
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
    return dataloader, word_to_idx

In [15]:
dataloader, word_to_idx = text_preprocessing_pipeline(df['complaint_what_happened'].tolist(), df['topic'].tolist())

In [28]:
vocab_size = len(word_to_idx)
embedding_size = 64
num_class = 5

# Neural Network Class

In [29]:
class CustomerComplaintClassifier(torch.nn.Module):
    def __init__(self, vocab_size, embedding_size, num_class):
        super(CustomerComplaintClassifier, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_size)
        self.fc = torch.nn.Linear(embedding_size, num_class)
        
    def forward(self, text):
        embedded = self.embedding(text)
        output = self.fc(embedded)
        return output


In [30]:
model = CustomerComplaintClassifier(vocab_size, embedding_size, num_class)

In [36]:
model.train()
lr = 1e-3
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(10):
    for text, label in dataloader:
        print(text.shape)
        # optimizer.zero_grad()
        # predicted_label = model(text)
        # loss = criterion(predicted_label, label)
        # optimizer.step()
        # break
    break

torch.Size([64, 758])
torch.Size([37, 685])


In [35]:
text.shape

torch.Size([64, 706])

In [32]:
model(text)

IndexError: index out of range in self

In [None]:
label

In [None]:
for i, (text, label) in enumerate(dataloader):
    print(i, text)
    break


In [37]:
import torch
from torchtext.datasets import AG_NEWS

train_iter = iter(AG_NEWS(split="train"))




ModuleNotFoundError: Package `torchdata` not found. Please install following instructions at https://github.com/pytorch/data