In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import datetime

In [2]:
# Load the raw complaints export and flatten the nested JSON records into a
# tabular frame. The context manager closes the file as soon as parsing is
# done (the bare open() call previously left the handle open).
with open('../data/complaints-2021-05-14_08_16_.json') as f:
    # json.load parses the whole document into Python objects
    data = json.load(f)
df = pd.json_normalize(data)

In [None]:
# Preview the first rows of the flattened frame.
df.head()

In [4]:
# Positional rename: the list must contain exactly one name per existing
# column, in the frame's current column order (22 names here).
flat_column_names = [
    'index', 'type', 'id', 'score',
    'tags', 'zip_code', 'complaint_id', 'issue',
    'date_received', 'state', 'consumer_disputed', 'product',
    'company_response', 'company', 'submitted_via', 'date_sent_to_company',
    'company_public_response', 'sub_product', 'timely', 'complaint_what_happened',
    'sub_issue', 'consumer_consent_provided',
]
df.columns = flat_column_names

In [None]:
# Keep only complaints that actually contain narrative text, i.e. rows whose
# narrative is neither missing nor an empty string. Filtering with a boolean
# mask avoids overwriting whole rows with NaN first (which can also change the
# dtypes of the other columns).
narrative = df['complaint_what_happened']
has_narrative = narrative.notna() & (narrative != '')
# reset_index returns a new frame, so its result has to be assigned back;
# drop=True discards the old labels instead of inserting them as a column.
df = df.loc[has_narrative].reset_index(drop=True)
df.head()

In [None]:
# (rows, columns) of the frame after dropping complaints without a narrative.
df.shape

In [7]:
import re


def clean_text(sent):
    """Normalize a complaint narrative for downstream text processing.

    Lowercases the text, deletes every character that is neither a word
    character nor whitespace, then deletes every token that contains a digit.
    Whitespace is left untouched, so a deleted token leaves its surrounding
    spaces behind.

    Args:
        sent: The raw text (str).

    Returns:
        The cleaned text (str).
    """
    sent = sent.lower()
    # Raw strings keep the regex escapes intact; '\w' inside a plain string
    # literal is an invalid escape sequence that newer Python versions warn on.
    sent = re.sub(r'[^\w\s]', '', sent)   # strip punctuation
    sent = re.sub(r'\w*\d\w*', '', sent)  # strip tokens containing a digit
    return sent

# Coerce every narrative to str so clean_text can call string methods on it,
# then collect the cleaned narratives in their own single-column frame.
narratives_as_text = df['complaint_what_happened'].astype(str)
df['complaint_what_happened'] = narratives_as_text
df_clean = narratives_as_text.apply(clean_text).to_frame()

In [None]:
# Inspect the cleaned narratives.
df_clean

In [9]:
# Lemmatization step (currently disabled).
# NOTE(review): no live cell creates the 'complaint_lemmatized' column that the
# POS-tagging cell reads; presumably it comes from the pickled frame loaded in
# the next cell, which would have been produced by the code below — confirm.
# import re, nltk, spacy, string


# nlp = spacy.load("en_core_web_sm")

# def lemmmatize_text(text):
#     sent = []
#     doc = nlp(text)
#     for token in doc:
#         sent.append(token.lemma_)
#     return " ".join(sent)

# df_clean['complaint_lemmatized'] = df_clean['complaint_what_happened'].apply(lemmmatize_text)

In [10]:
import pickle

# pickle.dump(df_clean, open("./artifacts/df_clean_after_encore.pkl", "wb"))

# Restore df_clean from the cached artifact; this replaces the df_clean built
# by the cleaning cell above.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load artifacts produced by a trusted run of this notebook.
with open("./artifacts/df_clean_after_encore.pkl", mode="rb") as cached_frame:
    df_clean = pickle.load(cached_frame)

In [None]:
!python -m textblob.download_corpora

In [None]:
import nltk
# Fetch the NLTK perceptron-tagger and punkt resources
# (presumably needed by the TextBlob POS tagging below — confirm).
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

In [13]:
from textblob import TextBlob

def get_POS_tags(text):
    """Return only the words of `text` that TextBlob tags as 'NN'.

    Despite the name, no tags are returned: the text is POS-tagged with
    TextBlob and the words whose tag is exactly 'NN' are kept, in their
    original order, joined by single spaces. Words with any other tag
    (including other noun tags such as 'NNS' or 'NNP') are dropped.

    Args:
        text: The text to tag (str).

    Returns:
        A space-separated string of the kept words; empty if none qualify.
    """
    blob = TextBlob(text)
    nouns = [word for (word, tag) in blob.tags if tag == 'NN']
    return " ".join(nouns)

# Reduce every lemmatized complaint to the words tagged 'NN'.
lemmatized_complaints = df_clean['complaint_lemmatized']
df_clean['complaint_POS_removed'] = lemmatized_complaints.apply(get_POS_tags)

In [14]:
import pickle

# Cache the POS-filtered frame on disk. The context manager flushes and closes
# the file once the dump finishes (the bare open() call left the handle to the
# garbage collector).
with open("./artifacts/df_clean_after_pos.pkl", "wb") as pos_cache:
    pickle.dump(df_clean, pos_cache)

# with open("./artifacts/df_clean_after_pos.pkl", "rb") as file:
#     df_clean = pickle.load(file)

In [None]:
# Inspect the frame after the POS filtering step.
df_clean

In [17]:
# Start the final text column from the POS-filtered text with every '-PRON-'
# marker removed (presumably the lemmatizer's pronoun placeholder — confirm).
pos_filtered = df_clean['complaint_POS_removed']
df_clean['Complaint_clean'] = pos_filtered.str.replace('-PRON-', '')

In [18]:
# Remove every 'xxxx' run from the text
# (presumably redaction placeholders in the narratives — confirm).
without_mask = df_clean['Complaint_clean'].str.replace('xxxx', '')
df_clean['Complaint_clean'] = without_mask

In [None]:
# NOTE(review): this repeats the previous cell's 'xxxx' replacement verbatim;
# it looks like a leftover duplicate — confirm and remove.
df_clean['Complaint_clean'] = df_clean['Complaint_clean'].str.replace('xxxx','')

In [None]:
# Preview the first ten rows, including the final Complaint_clean column.
df_clean.head(10)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

# TF-IDF vectorizer: ignore English stop words, terms found in fewer than
# 2 documents (min_df) and terms found in more than 95% of documents (max_df).
tfidf = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

In [21]:
# Learn the vocabulary from the cleaned complaints and build the
# document-term TF-IDF matrix in one pass.
dtm = tfidf.fit_transform(df_clean['Complaint_clean'])

In [22]:
from sklearn.decomposition import NMF

In [23]:
# Number of topics to extract (passed to NMF as n_components).
num_topics = 5

# random_state is pinned to 40 so repeated runs give the same factorization.
nmf_model = NMF(
    n_components=num_topics,
    random_state=40,
)

In [None]:
# Fit the NMF model on the TF-IDF matrix, then show the vocabulary size.
nmf_model.fit(dtm)
len(tfidf.get_feature_names_out())

In [26]:
# Fitted factorization components; indexed per topic (H[i]) below to rank words.
H = nmf_model.components_

In [None]:
# Show the highest-weighted words of every topic, one row per topic.
TOP_WORDS = 15  # how many words to list per topic
words = np.array(tfidf.get_feature_names_out())
topic_words = pd.DataFrame(
    # argsort is ascending, so reverse it before taking the first TOP_WORDS
    [words[H[i].argsort()[::-1][:TOP_WORDS]] for i in range(num_topics)],
    index=[f'Topic {i + 1}' for i in range(num_topics)],
    columns=[f'Word {i + 1}' for i in range(TOP_WORDS)],
)

topic_words

In [29]:
# Compute the per-topic weights of every complaint; the best topic
# (an integer 0-4) is picked from these weights in the next step.
topic_results = nmf_model.transform(dtm)

In [30]:
# Assign the best topic to each of the complaints in the Topic column:
# the position of the largest weight in each row of topic_results.
df_clean['Topic'] = topic_results.argmax(axis=1)

In [None]:
# Preview the frame with the newly assigned Topic column.
df_clean.head()

In [None]:
# Take the first five complaints of every topic and show them ordered by topic.
df_clean_5 = df_clean.groupby('Topic').head(5)
df_clean_5.sort_values(by='Topic')

In [33]:
# Human-readable name for each NMF topic id.
Topic_names = { 0:"Bank account services", 1:"Credit card / Prepaid card", 2:"Others",
               3:"Theft/Dispute reporting", 4:"Mortgages/loans" }
# Replace topic ids with topic names. Values that are not a known id (for
# example names written by an earlier run of this cell) are kept unchanged, so
# re-running the cell no longer turns the whole column into NaN.
df_clean['Topic'] = df_clean['Topic'].map(lambda topic: Topic_names.get(topic, topic))

In [None]:
############## PREDICTION: train a supervised classifier on the NMF topic labels

In [34]:
# Inverse lookup: topic name -> numeric topic id, used as the class label.
# NOTE: this rebinds Topic_names, which previously held the id -> name mapping.
Topic_names = { "Bank account services":0, "Credit card / Prepaid card":1, "Others":2,
               "Theft/Dispute reporting":3, "Mortgages/loans":4 }
# Replace topic names with their numeric ids. Values that are not a known name
# (for example ids written by an earlier run of this cell) are kept unchanged,
# so re-running the cell no longer turns the whole column into NaN.
df_clean['Topic'] = df_clean['Topic'].map(lambda topic: Topic_names.get(topic, topic))

In [35]:
# Supervised training set: the complaint text is vectorized into features
# below and the numeric Topic column is used as the label.
training_data = df_clean[['complaint_what_happened', 'Topic']]

In [36]:
# Build the bag-of-words count matrix from the complaint text.
vect = CountVectorizer()
X_train_cnt = vect.fit_transform(training_data['complaint_what_happened'])

# Save the learned vocabulary. The context manager flushes and closes the file
# once the dump finishes (the bare open() call left the handle open).
with open("count_vector.pk1", "wb") as vocab_file:
    pickle.dump(vect.vocabulary_, vocab_file)

In [37]:
# Re-weight the raw term counts into TF-IDF features.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_cnt)

# Save the fitted transformer. The context manager flushes and closes the file
# once the dump finishes (the bare open() call left the handle open).
with open('tfidf.pk1', "wb") as tfidf_file:
    pickle.dump(tfidf_transformer, tfidf_file)

In [38]:
# Importing LogisticRegression from sklearn
from sklearn.linear_model import LogisticRegression
# Importing Train, Test Split
from sklearn.model_selection import train_test_split

In [39]:
# Hold out 25% of the complaints for evaluation; random_state pins the shuffle.
# NOTE(review): the count vectorizer and TF-IDF transformer were fitted on the
# full corpus before this split, so their statistics already include the
# held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf,
    training_data['Topic'],
    test_size=0.25,
    random_state=42,
)

In [40]:
# Baseline classifier: logistic regression with the liblinear solver,
# fitted on the training split (fit returns the estimator itself).
logreg = LogisticRegression(solver='liblinear', random_state=42)
logreg = logreg.fit(X_train, y_train)

In [None]:
# Score of the base model on the held-out test split
# (for scikit-learn classifiers, score() reports mean accuracy).
logreg.score(X_test, y_test)