# Imports

In [2]:
import json 
import numpy as np
import pandas as pd
import re, nltk, spacy, string
import en_core_web_sm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nlp = spacy.load("en_core_web_sm")
# stopwords = nlp.Defaults.stop_words
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pprint import pprint
from sklearn.decomposition import NMF
from sklearn import preprocessing
# nltk.download('all')

In [3]:
# charts
import matplotlib.pyplot as plt
import seaborn as sns
import os
from plotly.offline import plot
import plotly.graph_objects as go
import plotly.express as px
%matplotlib inline

In [4]:
#warnings
import warnings
warnings.filterwarnings('ignore')

#options
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from pprint import pprint

# Functions

### 1. Data cleaning

In [5]:
def remove_special_characters(text):
    clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    clean_text = re.sub('\S*\d\S*\s*', '', clean_text)
    
    return clean_text

def remove_punctuation(text):
    punctuation = text.translate(str.maketrans('', '', string.punctuation))
    return punctuation

def convert_to_lowercase(text):
    text_lower = text.lower()
    return text_lower

def tokenization(text):
    tokens  = word_tokenize(text)

    return ' '.join(tokens)

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)
    

def lem_text(text):
    doc = nlp(text)
    lemma = ' '.join([token.lemma_ for token in doc])
    
    return lemma

def replace(text):
    replaced = text.replace('xxxx', '').replace('-PRON-', '')

    return replaced



### 2. Merge cleaning functions for one function

In [28]:
def preprocessing_data(text):
    
    text = remove_special_characters(text) #pass
    text = remove_punctuation(text) #pass
    text = convert_to_lowercase(text) #pass
    text = tokenization(text) #pass
    text = remove_stopwords(text) #pass
    text = lem_text(text) #pass
    text = replace(text) #pass
    return text

### 3. Restore basic forms of words

In [6]:
def lemmatization(text):
    doc = nlp(text)
    lem_tokens = [token.lemma_ for token in doc if token.text.lower() not in STOP_WORDS]
    lem_text = ' '.join(lem_tokens)
    
    return lem_text

def stemming(text):
    stemmer = PorterStemmer()
    words = word_tokenize(text)
    stem_words = [stemmer.stem(word) for word in words if word.text.lower() not in STOP_WORDS]
    stem_text = ' '.join(stem_words)

    return stem_text

def remove_POS_tags(text):
    doc = nlp(text)
    result = [token.text for token in doc if token.tag_ == 'NN']  # check for nouns
    return result

In [22]:
data.head()

Unnamed: 0,complaint_text,category,clean
1,Good morning my name is XXXX XXXX and I apprec...,Debt collection+Credit card debt,"[morning, name, stop, cardmember, debt, verifi..."
2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card+General-purpose cr...,"[XXXX, XXXX, card, agent, anniversary, date, a..."
10,Chase Card was reported on XX/XX/2019. However...,"Credit reporting, credit repair services, or o...","[application, identity, consent, credit, ident..."
11,"On XX/XX/2018, while trying to book a XXXX XX...","Credit reporting, credit repair services, or o...","[XXXX, ticket, offer, ticket, card, informatio..."
14,my grand son give me check for {$1600.00} i de...,Checking or savings account+Checking account,"[son, chase, account, fund, chase, bank, accou..."


In [23]:
data = df
data['clean1'] = data['complaint_text'].apply(lambda x: lemmatization(x))
data['clean2'] = data['complaint_text'].apply(lemmatization)

In [17]:
data.head()

Unnamed: 0,complaint_text,category,clean
1,Good morning my name is XXXX XXXX and I apprec...,Debt collection+Credit card debt,Good morning name XXXX XXXX appreciate could h...
2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card+General-purpose cr...,upgraded XXXX XXXX card XX/XX/2018 told agent ...
10,Chase Card was reported on XX/XX/2019. However...,"Credit reporting, credit repair services, or o...","Chase Card reported XX/XX/2019 . However , fra..."
11,"On XX/XX/2018, while trying to book a XXXX XX...","Credit reporting, credit repair services, or o...","XX/XX/2018 , trying book XXXX XXXX ticket , ca..."
14,my grand son give me check for {$1600.00} i de...,Checking or savings account+Checking account,grand son give check { $ 1600.00 } deposit cha...


# Importing data 

### dataset - https://www.kaggle.com/datasets/abhishek14398/automatic-ticket-classification-dataset/data

In [7]:
path = "complaints.json"
open_path = open(path) 
read_data = json.load(open_path)
df=pd.json_normalize(read_data)

In [8]:
df = df[['_source.complaint_what_happened','_source.product','_source.sub_product']]
df = df.rename(columns={'_source.complaint_what_happened': 'complaint_text', '_source.product': 'category','_source.sub_product': 'sub_category'})

# data modelling
df['category'] = df['category'] + '+' + df['sub_category']
df = df.drop(['sub_category'],axis= 1)
df[df['complaint_text']==''] = np.nan
df = df[~df['complaint_text'].isnull()]

# text cleaning

In [30]:
data.head()

Unnamed: 0,complaint_text,category,clean
1,Good morning my name is XXXX XXXX and I apprec...,Debt collection+Credit card debt,"[morning, name, appreciate, bank, service, wri..."
2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card+General-purpose cr...,"[card, agent, upgrade, anniversary, date, agen..."
10,Chase Card was reported on XX/XX/2019. However...,"Credit reporting, credit repair services, or o...","[chase, card, report, application, submit, ide..."
11,"On XX/XX/2018, while trying to book a XXXX XX...","Credit reporting, credit repair services, or o...","[book, ticket, offer, ticket, reward, card, in..."
14,my grand son give me check for {$1600.00} i de...,Checking or savings account+Checking account,"[son, check, deposit, chase, account, fund, ac..."


In [9]:

data = df # replace to have possibility to load back original data
data['clean'] = data['complaint_text'].apply(lambda x: preprocessing_data(x)) # new column with preprocessed data
data['clean'] = data['clean'].apply(lambda x: lemmatization(x))
# data['clean'] = data['complaint_text'].apply(preprocessing_data) # new column with preprocessed data
# data['clean'] = data['complaint_text'].apply(lemmatization)

In [None]:
# Write your code here to visualise the data according to the 'Complaint' character length

plt.figure(figsize=(10,6))
# doc_lens = [len(d) for d in data.clean]
doc_lens = [len(d) for d in data.clean]
plt.hist(doc_lens, bins = 50)

TypeError: object of type 'float' has no len()

<Figure size 1000x600 with 0 Axes>

In [27]:
data.head()

Unnamed: 0,complaint_text,category,clean
1,Good morning my name is XXXX XXXX and I apprec...,Debt collection+Credit card debt,"[morning, name, appreciate, bank, service, wri..."
2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card+General-purpose cr...,"[card, agent, upgrade, anniversary, date, agen..."
10,Chase Card was reported on XX/XX/2019. However...,"Credit reporting, credit repair services, or o...","[chase, card, report, application, submit, ide..."
11,"On XX/XX/2018, while trying to book a XXXX XX...","Credit reporting, credit repair services, or o...","[book, ticket, offer, ticket, reward, card, in..."
14,my grand son give me check for {$1600.00} i de...,Checking or savings account+Checking account,"[son, check, deposit, chase, account, fund, ac..."


# Topic Modelling
### in this dataset I have too much categories, so my plan is to change quantity of categories to 5.
### This can be done by NFM or LDA
###
###
###

##### vectorizer - TFIDF

In [None]:
# vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, stop_words='english')
# vectorizer_matrix = vectorizer.fit_transform(data['clean'])
# vectorizer.get_feature_names_out()[:10]

#
#
#
#
#
# NFM