In [None]:
%run ./imports.ipynb

# dataset - https://www.kaggle.com/datasets/abhishek14398/automatic-ticket-classification-dataset/data

In [3]:
path = "complaints.json"
# Read inside a context manager so the file handle is closed as soon as the
# JSON has been parsed. The encoding is pinned to UTF-8 so the load does not
# depend on the platform's default codec.
with open(path, encoding="utf-8") as open_path:
    read_data = json.load(open_path)
# Flatten the nested records into a table with dotted column names
# (e.g. `_source.product`), which the column selection below relies on.
df = pd.json_normalize(read_data)

In [4]:
# Keep only the complaint narrative and its two product labels, and give them
# shorter names. The mapping's key order is also the resulting column order.
column_names = {
    '_source.complaint_what_happened': 'complaint_text',
    '_source.product': 'category',
    '_source.sub_product': 'sub_category',
}
df = df[list(column_names)].rename(columns=column_names)

In [5]:
# Fold the sub-category into the category label ("<category>+<sub_category>").
# The combined label helps when deciding what each topic represents after NMF
# modelling.
merged_label = df['category'] + '+' + df['sub_category']
df = df.assign(category=merged_label).drop(columns=['sub_category'])

In [6]:
# Mark empty complaint narratives as missing. Only the complaint_text cell is
# replaced: assigning NaN through a frame-wide boolean mask would overwrite
# every column of those rows (category included), not just the narrative.
df.loc[df['complaint_text'] == '', 'complaint_text'] = np.nan
# Number of complaints without any narrative text.
df['complaint_text'].isnull().sum()

57241

In [7]:
# Keep only the rows that actually carry a complaint narrative.
has_text = df['complaint_text'].notna()
df = df[has_text]
# Sanity check: no missing narratives should remain.
df['complaint_text'].isna().sum()

0

In [8]:
# Frequency of each merged category label, most common first.
df['category'].value_counts()

Credit card or prepaid card+General-purpose credit card or charge card                                         4918
Checking or savings account+Checking account                                                                   3788
Credit reporting, credit repair services, or other personal consumer reports+Credit reporting                  2011
Bank account or service+Checking account                                                                       1242
Mortgage+Conventional home mortgage                                                                             955
Mortgage+Conventional fixed mortgage                                                                            785
Mortgage+FHA mortgage                                                                                           496
Debt collection+Credit card debt                                                                                427
Money transfer, virtual currency, or money service+Domestic (US) money t

# text cleaning

In [15]:
%run ./preprocessing.ipynb

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [21]:
# spaCy's stop-word collection gets its own name. Binding it to `stopwords`
# would shadow the object clean() calls `.words('english')` on (presumably the
# NLTK stopwords corpus brought in by imports.ipynb -- confirm there), and a
# plain set has no `.words` attribute, so clean() would raise AttributeError
# when the notebook is run top to bottom on a fresh kernel.
spacy_stop_words = nlp.Defaults.stop_words

# Built once rather than on every clean() call.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def clean(text):
    """Normalise a raw complaint into a string of stemmed, stop-word-free tokens.

    Steps: lowercase, delete every character that is not an ASCII letter or
    whitespace, tokenize, drop English stop words, stem each remaining token.

    Because all non-letters are deleted up front, no brackets, digits or
    punctuation survive to later steps; the separate bracket / number /
    punctuation passes that used to follow could never match and have been
    removed. Tokens that contained digits are therefore reduced to their
    letters rather than dropped (e.g. "xx/xx/2018" becomes "xxxx").

    Parameters
    ----------
    text : str
        Raw complaint text.

    Returns
    -------
    str
        Stemmed tokens joined by single spaces ('' if nothing survives).
    """
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text).strip()
    tokens = word_tokenize(text)
    stems = [_STEMMER.stem(token) for token in tokens if token not in _ENGLISH_STOP_WORDS]
    return ' '.join(stems).strip()


def lemmatizer(text):
    """Return the lemmas of `text`, skipping tokens found in spaCy's stop words.

    Parameters
    ----------
    text : str
        Text to run through the `nlp` pipeline.

    Returns
    -------
    str
        Lemmas of the kept tokens joined by single spaces.
    """
    doc = nlp(text)
    # Membership is tested against the shared collection directly instead of
    # rebuilding a set for every token.
    lemmas = [token.lemma_ for token in doc if token.text not in spacy_stop_words]
    return ' '.join(lemmas)

def spacy_tags_remove(text):
    """Keep only the tokens of `text` whose fine-grained tag is 'NN'.

    The text is run through the `nlp` pipeline; every token with any other
    tag is discarded and the kept tokens are joined by single spaces.
    """
    kept = []
    for token in nlp(text):
        if token.tag_ == 'NN':
            kept.append(token.text)
    return ' '.join(kept)

In [10]:
# Run the cleaning function over every complaint narrative.
df['clean_text'] = df['complaint_text'].apply(clean)
df.head()

Unnamed: 0,complaint_text,category,clean_text
1,Good morning my name is XXXX XXXX and I apprec...,Debt collection+Credit card debt,good morn name xxxx xxxx appreci could help pu...
2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card+General-purpose cr...,upgrad xxxx xxxx card xxxx told agent upgrad a...
10,Chase Card was reported on XX/XX/2019. However...,"Credit reporting, credit repair services, or o...",chase card report xxxx howev fraudul applic su...
11,"On XX/XX/2018, while trying to book a XXXX XX...","Credit reporting, credit repair services, or o...",xxxx tri book xxxx xxxx ticket came across off...
14,my grand son give me check for {$1600.00} i de...,Checking or savings account+Checking account,grand son give check deposit chase account fun...


In [16]:
# Lemmatize the cleaned text of each complaint.
df['lem'] = df['clean_text'].apply(lemmatizer)
df.head()

Unnamed: 0,complaint_text,category,clean_text,lem
1,Good morning my name is XXXX XXXX and I apprec...,Debt collection+Credit card debt,good morn name xxxx xxxx appreci could help pu...,good morn xxxx xxxx appreci help stop chase ba...
2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,Credit card or prepaid card+General-purpose cr...,upgrad xxxx xxxx card xxxx told agent upgrad a...,upgrad xxxx xxxx card xxxx tell agent upgrad a...
10,Chase Card was reported on XX/XX/2019. However...,"Credit reporting, credit repair services, or o...",chase card report xxxx howev fraudul applic su...,chase card report xxxx howev fraudul applic su...
11,"On XX/XX/2018, while trying to book a XXXX XX...","Credit reporting, credit repair services, or o...",xxxx tri book xxxx xxxx ticket came across off...,xxxx tri book xxxx xxxx ticket come offer appl...
14,my grand son give me check for {$1600.00} i de...,Checking or savings account+Checking account,grand son give check deposit chase account fun...,grand son check deposit chase account fund cle...


In [20]:
# Working frame with the raw text, its lemmatized form and the label.
# An explicit copy is taken so that columns added to df_new afterwards are
# written to an independent frame rather than to a slice of df (the situation
# pandas reports as SettingWithCopyWarning).
df_new = df[['complaint_text', 'lem', 'category']].copy()
df_new.head()

Unnamed: 0,complaint_text,lem,category
1,Good morning my name is XXXX XXXX and I apprec...,good morn xxxx xxxx appreci help stop chase ba...,Debt collection+Credit card debt
2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,upgrad xxxx xxxx card xxxx tell agent upgrad a...,Credit card or prepaid card+General-purpose cr...
10,Chase Card was reported on XX/XX/2019. However...,chase card report xxxx howev fraudul applic su...,"Credit reporting, credit repair services, or o..."
11,"On XX/XX/2018, while trying to book a XXXX XX...",xxxx tri book xxxx xxxx ticket come offer appl...,"Credit reporting, credit repair services, or o..."
14,my grand son give me check for {$1600.00} i de...,grand son check deposit chase account fund cle...,Checking or savings account+Checking account


In [23]:
# Reduce each lemmatized complaint to its 'NN'-tagged tokens.
df_new['after_POS'] = df_new['lem'].apply(spacy_tags_remove)
df_new.head()

Unnamed: 0,complaint_text,lem,category,after_POS
1,Good morning my name is XXXX XXXX and I apprec...,good morn xxxx xxxx appreci help stop chase ba...,Debt collection+Credit card debt,write chase debt verif statement bank debt mai...
2,I upgraded my XXXX XXXX card in XX/XX/2018 and...,upgrad xxxx xxxx card xxxx tell agent upgrad a...,Credit card or prepaid card+General-purpose cr...,agent date agent inform order agent misl
10,Chase Card was reported on XX/XX/2019. However...,chase card report xxxx howev fraudul applic su...,"Credit reporting, credit repair services, or o...",chase card report consent extend credit verifi
11,"On XX/XX/2018, while trying to book a XXXX XX...",xxxx tri book xxxx xxxx ticket come offer appl...,"Credit reporting, credit repair services, or o...",ticket appli ticket appli reward card inform i...
14,my grand son give me check for {$1600.00} i de...,grand son check deposit chase account fund cle...,Checking or savings account+Checking account,son deposit chase account fund bank account pa...
