In [None]:
import json 
import string
import numpy as np
import pandas as pd
import re, nltk, spacy, string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
# import en_core_web_sm
nlp = spacy.load("en_core_web_sm")
# stopwords = nlp.Defaults.stop_words
from spacy.lang.en.stop_words import STOP_WORDS
# stop_words = set(stopwords.words('english'))
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pprint import pprint
from sklearn.decomposition import NMF
from sklearn import preprocessing
from nltk import ngrams
from nltk import FreqDist
from sklearn.decomposition import LatentDirichletAllocation as LDA
# nltk.download('all')
from nltk import pos_tag
import joblib
from textaugment import Wordnet
from sklearn.model_selection import train_test_split

In [None]:
# charts
import matplotlib.pyplot as plt
import seaborn as sns
import os
from plotly.offline import plot
import plotly.graph_objects as go
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline

In [None]:
#warnings
import warnings
warnings.filterwarnings('ignore')

#options
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pprint import pprint

# Display settings: never truncate the contents of a column, and render up to
# 500 rows before pandas shortens the output.
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 500

# Functions

### 1. Data cleaning

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_recall_fscore_support

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

In [None]:
def clean_text(text):
    """Remove bracketed notes, digit-bearing tokens and non-letter characters.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The text with, in this order: ``[...]`` spans removed, every
        whitespace-delimited token that contains a digit removed (together
        with the whitespace that follows it), and every remaining character
        that is not an ASCII letter or whitespace removed.
    """
    # Order matters: the bracket and digit patterns must run while brackets and
    # digits are still present. Stripping non-letters first would make both of
    # them unable to match anything.
    text = re.sub(r'\[.*?\]', '', text)      # non-greedy: one bracket pair at a time
    text = re.sub(r'\S*\d\S*\s*', '', text)  # tokens containing at least one digit
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # keep ASCII letters and whitespace only

    return text

def remove_stopwords(tokens):
    """Return the tokens whose lower-cased form is not in ``STOP_WORDS``.

    The original casing and order of the surviving tokens are preserved.
    """
    kept = []
    for token in tokens:
        if token.lower() in STOP_WORDS:
            continue
        kept.append(token)
    return kept

def tokenize_text(text):
    """Split ``text`` into word tokens using NLTK's ``word_tokenize``."""
    return word_tokenize(text)

### 2. Merge cleaning functions into one function

In [None]:
def preprocess_text(text):
    """Clean, tokenize and stop-word-filter ``text``.

    Runs ``clean_text`` -> ``tokenize_text`` -> ``remove_stopwords`` and
    returns the surviving tokens joined by single spaces.
    """
    kept_tokens = remove_stopwords(tokenize_text(clean_text(text)))
    return ' '.join(kept_tokens)

### 3. Restore basic forms of words

# Importing data 
path = "data/complaints.json"
# Read the raw JSON inside a context manager so the file handle is closed as
# soon as parsing finishes (it was previously left open for the whole session).
with open(path, encoding="utf-8") as open_path:
    read_data = json.load(open_path)
# Flatten nested JSON objects into dotted column names.
df = pd.json_normalize(read_data)

# Importing Data
### dataset - https://www.kaggle.com/datasets/abhishek14398/automatic-ticket-classification-dataset/data

In [None]:
path = "data/complaints.json"
# Read the raw JSON inside a context manager so the file handle is closed as
# soon as parsing finishes (it was previously left open for the whole session).
with open(path, encoding="utf-8") as open_path:
    read_data = json.load(open_path)
# Flatten nested JSON objects into dotted column names.
df = pd.json_normalize(read_data)

In [None]:
# Keep only the columns needed downstream and give them short names.
column_names = {
    '_source.complaint_what_happened': 'complaint',
    '_source.issue': 'issue',
    '_source.product': 'product',
    '_source.sub_product': 'sub_product',
}
df = df[list(column_names)].rename(columns=column_names)

# Drop rows (not columns) whose complaint text is missing or blank.
# Filtering directly avoids overwriting entire rows with NaN first, and the
# explicit copy gives later cells an independent frame to add columns to
# instead of a slice of the original.
has_complaint = df['complaint'].notna() & (df['complaint'] != '')
df = df[has_complaint].copy()

In [None]:
# Combine issue / product / sub_product into one label, then discard the
# three source columns.
df = df.assign(
    category=df['issue'] + ' / ' + df['product'] + ' / ' + df['sub_product']
).drop(columns=['issue', 'product', 'sub_product'])

# text cleaning

# Work on an independent copy so the loaded frame `df` really does stay
# available in its original form (a plain `data = df` only creates an alias,
# and every change below would also show up in `df`).
data = df.copy()

# Clean -> lemmatize -> POS-filter each complaint, then lower-case the result
# and strip the literal 'xxxx' sequences.
# NOTE(review): `lemmatize` and `remove_pos_tags` are not defined in the visible
# part of this notebook - make sure the cell defining them has run first.
data['text_clean'] = (
    data['complaint']
    .apply(preprocess_text)
    .apply(lemmatize)
    .apply(remove_pos_tags)
    .str.lower()
    .str.replace('xxxx', '', regex=False)
)


# Topic Modelling
### This dataset has too many categories, so the plan is to reduce them to 5 topics using the NMF method.

# TF-IDF features of the cleaned text. Terms seen in fewer than 2 documents or
# in more than 95% of documents are ignored, as are English stop words.
# NOTE(review): the same three statements are repeated in the "Vectorizer"
# cell that follows.
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Document-term matrix and the vocabulary term behind each of its columns.
dtm = tfidf.fit_transform(data['text_clean'])
feature_names = np.array(tfidf.get_feature_names_out())


### Vectorizer

In [None]:
# Build the TF-IDF document-term matrix from the cleaned complaint text.
vectorizer_options = dict(min_df=2, max_df=0.95, stop_words='english')
tfidf = TfidfVectorizer(**vectorizer_options)
dtm = tfidf.fit_transform(data['text_clean'])  # document-term matrix
# Vocabulary terms, indexed by their column position in `dtm`.
feature_names = np.array(tfidf.get_feature_names_out())

### NMF

In [None]:
from sklearn.preprocessing import normalize

# Factorise the TF-IDF matrix into `num_topics` non-negative components.
num_topics = 5  # You can adjust the number of topics as needed
nmf_model = NMF(random_state=42, n_components=num_topics)
nmf_matrix = nmf_model.fit_transform(dtm)

# Rescale each document's row of topic weights (row-wise normalisation).
nmf_matrix_normalized = normalize(nmf_matrix, axis=1)

# Label every document with the index of its largest topic weight.
data['topic'] = np.argmax(nmf_matrix_normalized, axis=1)

# Print the 15 highest-weighted vocabulary terms of each component,
# strongest first.
for topic in range(num_topics):
    ranked_terms = np.argsort(nmf_model.components_[topic])
    topic_words_idx = ranked_terms[::-1][:15]
    topic_words = list(feature_names[topic_words_idx])

    print(f"Top 15 words for Topic {topic}:\n")
    print(topic_words)
    print("\n")

### MAPPING AND SAVING DF

# Human-readable names for the five NMF components.
# NOTE(review): the index -> name pairing presumably comes from reading the
# top-word lists printed for each topic; re-check it whenever the NMF fit
# (data, vectorizer settings or library version) changes.
topic_mapping = {
    0: 'Banking and Account activities',
    1: 'Credit/debits Cards',
    2: 'Other',
    3: 'Reporting/information',
    4: 'Loans/Mortgages'
}

# Replace topic indices with topic names. Values without an entry in the
# mapping (for example names produced by an earlier run of this cell) are left
# unchanged, so re-running the cell no longer turns the whole column into NaN
# before it is written to disk.
data['topic'] = data['topic'].map(lambda value: topic_mapping.get(value, value))
data.to_csv('data/data.csv', index=False)  # saving dataset

# Data Augmentation

# Number of rows per topic; the largest class sets the balancing target.
topic_counts = data['topic'].value_counts()
max_count = topic_counts.max()

augmented_data = []            # one frame of synthetic rows per smaller topic
wordnet_aug = Wordnet(v=True)  # WordNet-based augmenter from textaugment

# Top up every topic that is smaller than the largest one.
for topic, count in topic_counts.items():
    if count >= max_count:
        continue

    topic_data = data[data['topic'] == topic]
    samples_needed = max_count - count  # rows missing to reach the largest class

    # Each synthetic row augments one randomly drawn cleaned text of this topic.
    augmented_sentences = [
        wordnet_aug.augment(topic_data['text_clean'].sample().iloc[0])
        for _ in range(samples_needed)
    ]

    # Synthetic rows carry only `topic` and `text_clean`; the remaining
    # columns are filled with NaN by the concat below.
    augmented_data.append(
        pd.DataFrame({
            'topic': [topic] * len(augmented_sentences),
            'text_clean': augmented_sentences,
        })
    )

# Original rows first, followed by the synthetic rows, with a fresh index.
augmented_data = pd.concat([data, *augmented_data], ignore_index=True)

# Class balance before and after augmentation.
print("Orginal data counts :")
print(data['topic'].value_counts())
print("\nAugmented data counts :")
print(augmented_data['topic'].value_counts())

augmented_data.to_csv('data/augmented_data.csv', index=False)  # saving dataset

# Machine Learning

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Empty score table with one row per evaluated model to be added later.
# Only `df_scores` is bound here: the earlier chained assignment also
# overwrote `df` with this empty frame.
df_scores = pd.DataFrame(columns=['Model', 'Training Score', 'Test Score'])

In [None]:
def _predict_with_dense_fallback(model, X):
    """Return ``model.predict(X)``, retrying on ``X.toarray()`` if that fails."""
    try:
        return model.predict(X)
    except Exception:
        # Some estimators need array input rather than a sparse matrix. Retry
        # on a dense copy, but only when the input can be densified; otherwise
        # surface the original error instead of masking it.
        if not hasattr(X, "toarray"):
            raise
        return model.predict(X.toarray())


def display_classification_results(model, X_train, y_train, X_test, y_test):
    """Print train/test metrics for a fitted classifier and plot its confusion matrix.

    Parameters
    ----------
    model :
        Fitted estimator exposing ``predict``.
    X_train, X_test :
        Feature matrices. If ``predict`` fails on them as given, prediction is
        retried on ``X.toarray()``.
    y_train, y_test :
        True labels for the corresponding feature matrices.

    Returns
    -------
    None
        Accuracy and classification reports are printed; a row-normalised
        confusion matrix of the test predictions is shown as a heatmap.
    """
    y_train_pred = _predict_with_dense_fallback(model, X_train)
    y_test_pred = _predict_with_dense_fallback(model, X_test)

    # Accuracy scores
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)

    # Classification reports
    train_classification_report = classification_report(y_train, y_train_pred)
    test_classification_report = classification_report(y_test, y_test_pred)

    # Print scores
    print("\nTraining Accuracy:", train_accuracy)
    print("\nTraining Classification Report:")
    print(train_classification_report)

    print("Testing Accuracy:", test_accuracy)
    print("\nTesting Classification Report:")
    print(test_classification_report)

    # Confusion matrix, normalised over the true labels (each row sums to 1).
    # The same label list is passed to confusion_matrix and used for the tick
    # labels, so rows and columns cannot drift out of step with the axis names
    # when a class is missing from the test predictions.
    unique_classes = np.unique(np.concatenate([y_train, y_test]))
    cm = confusion_matrix(y_test, y_test_pred, labels=unique_classes, normalize='true')
    plt.figure(figsize=(15, 6))

    sns.heatmap(cm, annot=True, fmt=".2f", cmap="Greens", xticklabels=unique_classes, yticklabels=unique_classes)

    plt.title('Normalized Confusion Matrix')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.show()


In [None]:
# import dataset
# Reload the augmented dataset and keep only rows that have cleaned text
# (missing and empty values are both discarded).
data = pd.read_csv('data/augmented_data.csv')
has_text = data['text_clean'].fillna('') != ''
data = data[has_text]

# Model input (cleaned text) and target (topic name).
training_data = data[['text_clean', 'topic']]
X, y = training_data['text_clean'], training_data['topic']


In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
# Term counts first, then TF-IDF weighting on top of the counts.
cv = CountVectorizer()
X_vec = cv.fit_transform(X)
tfidf_t = TfidfTransformer()
X_tfidf = tfidf_t.fit_transform(X_vec)

# Persist both fitted transformers. Create the target folder first so the
# dumps do not fail on a fresh checkout where it does not exist yet.
os.makedirs('pre-trained_models', exist_ok=True)
joblib.dump(cv, 'pre-trained_models/cv.joblib')
joblib.dump(tfidf_t, 'pre-trained_models/tfidf.joblib')

In [None]:
# Hold out 20% of the rows for testing, keeping the topic proportions equal in
# both parts (stratified on y) and the split reproducible (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    stratify=y,
    test_size=0.2,
    random_state=40,
)

### Logistic Regression Classifier

In [None]:
# Fit a liblinear logistic regression on the training split, save the fitted
# model, then report its train/test metrics.
log_reg = LogisticRegression(solver='liblinear', random_state=40)
log_reg.fit(X_train, y_train)
joblib.dump(log_reg, 'pre-trained_models/logistic_regression.joblib')
display_classification_results(
    log_reg,
    X_train,
    y_train,
    X_test,
    y_test,
)