# 1. Imports

### Data Analysis

In [52]:
import json 
import string
import re
import nltk
import spacy
import numpy as np
import pandas as pd

### Preprocessing

In [53]:
nlp = spacy.load("en_core_web_sm")
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

### Machine Learning

In [54]:
from sklearn.decomposition import NMF
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

### Other

In [55]:
import random
import os
import warnings
warnings.filterwarnings('ignore')

# 2. Functions

### Data cleaning

In [56]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub('\S*\d\S*\s*','', text)
    text = re.sub('\[.*\]','', text)

    return text

def remove_stopwords(tokens):
    filtered_tokens = [word for word in tokens if word.lower() not in STOP_WORDS]

    return filtered_tokens

def tokenize_text(text):
    tokens = word_tokenize(text)

    return tokens

### Merge cleaning functions to one function

In [57]:
def preprocess_text(text):
    cleaned_text = clean_text(text)
    tokens = tokenize_text(cleaned_text)
    tokens_without_stopwords = remove_stopwords(tokens)
    preprocessed_text = ' '.join(tokens_without_stopwords)
    
    return preprocessed_text

### Restore basic forms of words

In [58]:
def lemmatize(text):
    doc = nlp(text)
    sent = [token.lemma_ for token in doc if token.text not in STOP_WORDS]

    return ' '.join(sent)

def remove_pos_tags(text):
    doc = nlp(text)
    sent = [token.text for token in doc if token.tag_ == 'NN']

    return ' '.join(sent)

### Data Augemntation

In [59]:
# get synonyms with WordNet
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().lower())
    return list(synonyms)

# perform aAugmentation
def augment_text(sentence):
    words = sentence.split()
    augmented_words = []
    for word in words:
        synonyms = get_synonyms(word)
        if synonyms:
            augmented_words.append(random.choice(synonyms))
        else:
            augmented_words.append(word)
    return ' '.join(augmented_words)

### Classification Report

### 3. Process

### dataset - https://www.kaggle.com/datasets/abhishek14398/automatic-ticket-classification-dataset/data

In [60]:
path = "data/complaints.json"
open_path = open(path) 
read_data = json.load(open_path)
df=pd.json_normalize(read_data)

In [61]:
#pick only columns which are needed
df = df[['_source.complaint_what_happened', '_source.issue', '_source.product', '_source.sub_product']]

#rename for normal names
df = df.rename(columns={'_source.complaint_what_happened': 'complaint',  '_source.issue' : 'issue', '_source.product': 'product','_source.sub_product': 'sub_product'})

# drop columns with blank description
df[df['complaint']==''] = np.nan
df = df[~df['complaint'].isnull()]


In [62]:
# creating new column with merged 3 columns for category and drop previous columns
df['category'] = df['issue'] + ' / ' + df['product'] + ' / ' + df['sub_product']
df.drop(['issue', 'product', 'sub_product'], axis=1, inplace=True)

# text cleaning

In [63]:
data = df
data['text_clean'] = data['complaint'].apply(lambda x: preprocess_text(x))
data['text_clean'] = data['text_clean'].apply(lambda x: lemmatize(x))
data['text_clean'] = data['text_clean'].apply(lambda x: remove_pos_tags(x))

data['text_clean'] = data['text_clean'].str.lower()
data['text_clean'] = data['text_clean'].str.replace('xxxx','')


# Topic Modelling

### Vectorizer

In [64]:
tfidf = TfidfVectorizer(min_df=2, max_df=0.95, stop_words='english')
dtm = tfidf.fit_transform(data['text_clean']) # document term metrix
feature_names = np.array(tfidf.get_feature_names_out())

### NFM

In [65]:
from sklearn.preprocessing import normalize

num_topics = 5  # You can adjust the number of topics as needed
nmf_model = NMF(n_components=num_topics, random_state=42)
# nmf_model = NMF(n_components=num_topics)
nmf_matrix = nmf_model.fit_transform(dtm)

# Normalize the NMF matrix
nmf_matrix_normalized = normalize(nmf_matrix, axis=1)

# Assign topics to documents
data['topic'] = nmf_matrix_normalized.argmax(axis=1)


for topic in range(num_topics):
    topic_words_idx = nmf_model.components_[topic].argsort()[-15:][::-1]
    topic_words = [feature_names[i] for i in topic_words_idx]
    
    print(f"Top 15 words for Topic {topic}:\n")
    print(topic_words)
    print("\n")

Top 15 words for Topic 0:

['account', 'check', 'money', 'bank', 'deposit', 'fund', 'day', 'branch', 'transfer', 'number', 'business', 'transaction', 'chase', 'customer', 'claim']


Top 15 words for Topic 1:

['credit', 'card', 'report', 'inquiry', 'account', 'score', 'company', 'information', 'chase', 'limit', 'application', 'debt', 'letter', 'year', 'balance']


Top 15 words for Topic 2:

['payment', 'balance', 'month', 'pay', 'statement', 'fee', 'time', 'day', 'mortgage', 'date', 'credit', 'year', 'auto', 'account', 'error']


Top 15 words for Topic 3:

['charge', 'card', 'dispute', 'fee', 'transaction', 'purchase', 'merchant', 'claim', 'service', 'refund', 'fraud', 'time', 'email', 'statement', 'balance']


Top 15 words for Topic 4:

['loan', 'mortgage', 'modification', 'home', 'property', 'year', 'letter', 'document', 'rate', 'request', 'time', 'foreclosure', 'refinance', 'information', 'sale']




In [66]:
data.topic.value_counts()

topic
0    5139
3    4924
1    4817
4    3818
2    2374
Name: count, dtype: int64

### MAPPING AND SAVING DF

In [67]:
topic_mapping = {
    0: 'Banking and Account activities',
    1: 'Credit/debits Cards',
    2: 'Other',
    3: 'Reporting/information',
    4: 'Loans/Mortgages'
}

#Replace Topics with Topic Names
data['topic'] = data['topic'].map(topic_mapping)

In [68]:
data.to_csv('data/data.csv', index=False) # saving dataset

# Data Augmentation

In [69]:

topic_counts = data['topic'].value_counts()
max_count = topic_counts.max() # max items for main topic


# Loop for augmenting data
augmented_data = []
for topic, count in topic_counts.items():
    if count < max_count:
        topic_data = data[data['topic'] == topic]
        samples_needed = max_count - count
        
        # perform augmenation
        augmented_sentences = []
        while len(augmented_sentences) < samples_needed:
            augmented_sentence = augment_text(topic_data['text_clean'].sample().iloc[0])
            augmented_sentences.append(augmented_sentence)
        
        # create new dataframe with merged newly created samples and topics
        augmented_df = pd.DataFrame({'topic': [topic] * len(augmented_sentences), 'text_clean': augmented_sentences})

        # add samples from list to dataframe
        augmented_data.append(augmented_df)

# Merge augmented data with original data
augmented_data = pd.concat([data] + augmented_data, ignore_index=True)


In [70]:
augmented_data.to_csv('data/augmented_data.csv', index=False) # saving dataset

# Machine Learning

In [71]:
training_data = augmented_data[['text_clean','topic']]
X = training_data['text_clean']
y = training_data['topic']

In [72]:
cv = CountVectorizer()
X_vec = cv.fit_transform(X)

tfidf_t = TfidfTransformer()
X_tfidf = tfidf_t.fit_transform(X_vec)

In [73]:
joblib.dump(cv, 'pre-trained_models/cv.joblib')
joblib.dump(tfidf_t, 'pre-trained_models/tfidf.joblib')

['pre-trained_models/tfidf.joblib']

In [74]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=40, stratify=y)

### Logistic Regression Classificator

In [75]:
log_reg = LogisticRegression(random_state=40,solver='liblinear')
log_reg.fit(X_train,y_train)
joblib.dump(log_reg, 'pre-trained_models/logistic_regression.joblib')

['pre-trained_models/logistic_regression.joblib']

In [78]:
y_train_pred = log_reg.predict(X_train)
y_test_pred = log_reg.predict(X_test)
        
train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(train_accuracy)
print(test_accuracy)