In [8]:
#Generic Packages
import numpy as np
import pandas as pd
import joblib
import os
import re
import joblib
import pickle

#Keras Packages
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras import utils
from keras.models import Sequential
from keras.layers import Dense,Dropout,Activation
from keras import metrics
from sklearn import preprocessing
from keras import backend

import tensorflow
print(tensorflow.__version__)

import keras
print(keras.__version__)

#NLTK Packages
from nltk.corpus import stopwords
stop = stopwords.words('english')

#Spliting Package
from sklearn.model_selection import StratifiedShuffleSplit

from keras.models import model_from_json
from keras.models import model_from_yaml



1.12.0
2.2.4


In [36]:
# Load the labelled dataset from the parent directory; DLTrain reads its
# 'Content' (raw text) and 'Result' (label) columns.
# This cell has to run before the DLTrain(data) cell further down.
# NOTE(review): the path is written with Windows separators, like the model
# paths used in DLTrain/validation - confirm before running on another OS.
data = pd.read_csv("..\\kaggle_dataset.csv")

In [10]:
#Regex for any data cleansing activities
def RegexRemoval(data):
    """Replace every non-letter character of each ``data['Content']`` entry with a space.

    Parameters
    ----------
    data : DataFrame or mapping
        Any object for which ``data['Content']`` yields an iterable of texts.

    Returns
    -------
    list of str
        One cleaned string per entry, in the original order. Digits,
        punctuation and whitespace characters each become a single space
        (runs are not collapsed). Missing entries (None/NaN) become ``''``;
        other non-string scalars are converted with ``str()`` first.
    """
    # Compile once instead of re-parsing the pattern for every row.
    non_letters = re.compile('[^a-zA-Z]')

    removed_spc = []
    for raw in data['Content']:
        if not isinstance(raw, str):
            # re.sub raises TypeError on None/NaN, which read_csv produces
            # for empty cells; treat those as empty text.
            raw = '' if pd.isna(raw) else str(raw)
        # The former follow-up substitution r'\.+' -> '.' was dropped: after
        # this pass no '.' can remain, so it never matched anything.
        removed_spc.append(non_letters.sub(' ', raw))
    return removed_spc

In [11]:
#feature engineering steps like stemming,lemmatization,tokenization can be handled below
def cleanData(text, lowercase = True, remove_stops = True, stemming = False, lemmatization = False):
    """Normalise a single text by applying the enabled steps in a fixed order.

    Parameters
    ----------
    text : object
        Input value; it is converted with ``str()`` before processing.
    lowercase : bool
        Lower-case every whitespace-separated token.
    remove_stops : bool
        Drop tokens found in the module-level ``stop`` list. The comparison is
        exact-case, so it is only case-insensitive when ``lowercase`` is also on.
    stemming : bool
        Stem every token with NLTK's ``PorterStemmer``.
    lemmatization : bool
        Lemmatize every token as a verb (``pos='v'``) with NLTK's
        ``WordNetLemmatizer``.

    Returns
    -------
    str
        The processed text. Every enabled step re-joins tokens with single
        spaces; if no step is enabled the string is returned unchanged.
    """
    txt = str(text)
    if lowercase:
        txt = " ".join(w.lower() for w in txt.split())

    if remove_stops:
        txt = " ".join(w for w in txt.split() if w not in stop)

    if stemming:
        # Imported here because the notebook's import cell never brings the
        # stemmer into scope; without this the branch raised NameError.
        from nltk.stem import PorterStemmer
        st = PorterStemmer() #choose different stemmers like lancaster for testing activities
        txt = " ".join(st.stem(w) for w in txt.split())

    if lemmatization:
        # Same reason as above: WordNetLemmatizer was never imported.
        from nltk.stem import WordNetLemmatizer
        wordnet_lemmatizer = WordNetLemmatizer()
        txt = " ".join(wordnet_lemmatizer.lemmatize(w, pos='v') for w in txt.split())
    return txt

In [12]:
def MLSplit(data):
    """Carve a stratified 90/10 train/validation split out of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must expose a ``content`` column (features) and a ``result`` column
        (labels used for stratification).

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(X_train, y_train)``. The validation part is only used for the shape
        report printed below; it is not returned.
    """
    X = data.content
    y = data.result
    print(type(X))
    print(type(y))
    size = 0.1

    #Stratified shuffle Split is used. Please use random split(if required)
    dataSplit = StratifiedShuffleSplit(n_splits=5, test_size=size, random_state=0)

    # Only the last of the five generated splits is kept (as before), so take
    # its indices directly instead of slicing the series on every iteration.
    train_index, validation_index = list(dataSplit.split(X,y))[-1]

    # split() yields positional indices, so they must go through .iloc; plain
    # X[...] is label-based and only matched for a default RangeIndex.
    X_train, X_validation = X.iloc[train_index], X.iloc[validation_index]
    y_train, y_validation = y.iloc[train_index], y.iloc[validation_index]

    print(type(X_train))
    print(type(y_train))

    trainData = pd.concat([X_train,y_train],axis=1)
    validateData = pd.concat([X_validation,y_validation],axis=1)
    print("Train Data Features:",trainData.shape)
    print("Validation Data Features:",validateData.shape)

    return X_train,y_train

In [13]:
def DLTrain(data):
    """Train the bag-of-words text classifier and persist every artifact.

    Steps: clean ``data['Content']`` (RegexRemoval + cleanData), keep the
    training part returned by MLSplit, turn the texts into count vectors with
    a Keras ``Tokenizer``, one-hot encode the labels, fit a dense network and
    write tokenizer, architecture (YAML and JSON), weights and label encoder
    to ``..\\models``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``Content`` column (raw text) and a ``Result`` column
        (class label).

    Returns
    -------
    None
        Results are written to disk; progress is printed.
    """
    # Replace every non-letter character of the raw text with a space.
    cleaned_texts = RegexRemoval(data)

    #Pre-Procesed Data Frame
    # Built in one step on a fresh default index: this pairs rows by position
    # (pd.concat(axis=1) aligned on index labels) and avoids renaming /
    # assigning on a slice of the caller's frame.
    data = pd.DataFrame({'content': cleaned_texts, 'result': data['Result'].values})
    print(data.head())

    data['content'] = data['content'].map(lambda x: cleanData(x, lowercase=False, remove_stops=True, stemming=False, lemmatization = False))

    X,y = MLSplit(data)

    #Keras Tokenizer
    num_max = 1000  # vocabulary size == width of the count vectors fed to the net
    tok = Tokenizer(num_words=num_max)
    tok.fit_on_texts(X)
    X = tok.texts_to_matrix(X,mode='count')

    #Label Encoder
    encoder=preprocessing.LabelEncoder()
    y = encoder.fit_transform(y)
    num_classes = len(encoder.classes_)
    y = utils.to_categorical(y,num_classes)

    ########################### Hyper Parameter Configurations #####################
    # The model is built after the labels are encoded so the output layer can
    # follow the data instead of a hard-coded class count.
    model = Sequential()
    model.add(Dense(units=1000, activation='relu', input_shape=(num_max,)))
    model.add(Dropout(0.2))
    model.add(Dense(units=1024, activation='relu'))
    model.add(Dropout(0.2))
    # softmax (not sigmoid): the classes are mutually exclusive and the loss is
    # categorical_crossentropy, so the outputs must form one distribution.
    model.add(Dense(units=num_classes, activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc',metrics.categorical_accuracy])
    #################################################################################

    #Model Building
    model.fit(X, y, epochs=10, batch_size=500,verbose=1,validation_split=0.2)

    model_yaml = model.to_yaml()
    model_json = model.to_json()

    #saving The Models
    with open('..\\models\\tokenizer.pkl', 'wb') as f:
        pickle.dump(tok,f)

    with open("..\\models\\model.yaml", "w") as yaml_file:
        yaml_file.write(model_yaml)

    with open("..\\models\\model.json", "w") as json_file:
        json_file.write(model_json)

    model.save_weights("..\\models\\model.h5")
    print("Saved model to disk")

    with open('..\\models\\encoder.pkl', 'wb') as f:
        pickle.dump(encoder,f)


In [14]:
# Run the full training pipeline on the loaded dataset; besides the printed
# progress this writes the tokenizer, model files and label encoder to ..\models.
DLTrain(data)

                                             content  \
0  I was contacted originally on XX XX XXXX via c...   
1  Thanks for your response and update regarding ...   
2  Dear Consumers Financial Protection Bureau   C...   
3  Experian XXXX XXXX XXXX XXXX XXXX XXXX XXXX  X...   
4  Doctors Business Bureau for            That de...   

                      result  
0   Action > Debt collection  
1          Action > Mortgage  
2      Action > Student loan  
3  Action > Credit reporting  
4   Action > Debt collection  
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
Train Data Features: (6823, 2)
Validation Data Features: (759, 2)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dro

In [33]:
def validation(text):
    """Classify one or more texts with the artifacts saved by DLTrain.

    Loads tokenizer, model architecture (YAML), weights and label encoder from
    ``..\\models`` on every call, then prints the predicted category and its
    score for each input text.

    Parameters
    ----------
    text : str or list of str
        Text(s) to classify. A bare string is treated as a single text.

    Returns
    -------
    None
        Results are printed, two lines per text.
    """
    # A bare string would otherwise be iterated character by character by
    # texts_to_matrix, yielding one bogus row per character.
    if isinstance(text, str):
        text = [text]

    #Loading Tokenizer
    # NOTE: pickle.load can execute arbitrary code - only load files written
    # by DLTrain or another trusted source.
    with open('..\\models\\tokenizer.pkl', 'rb') as f:
        tok = pickle.load(f)

    #loading hyperparameter
    with open("..\\models\\model.yaml", "r") as yaml_file:
        model_yaml = yaml_file.read()

    #loading model
    # NOTE(review): treat the YAML file as trusted input as well - confirm how
    # this Keras version parses it before loading files from elsewhere.
    model = model_from_yaml(model_yaml)
    model.load_weights("..\\models\\model.h5")

    #loading encoder for converting encoded labels to actual labels
    with open('..\\models\\encoder.pkl', 'rb') as f:
        encoder = pickle.load(f)

    #tokenization of inline text
    X_test = tok.texts_to_matrix(text,mode='count')

    #predicting for inline text
    prediction = model.predict(np.array(X_test))

    #exracting the labels for the predicted text
    text_labels = encoder.classes_

    # Take the arg-max per row: np.argmax over the whole 2-D array returns a
    # flattened index, which is wrong as soon as more than one text is passed.
    # The score is read from the same forward pass instead of predicting again.
    for scores in prediction:
        best = np.argmax(scores)
        predicted_label = text_labels[best]
        print("predicted category -->",predicted_label)

        confidence = scores[best]
        print("confidence score -->",str(round(confidence,2)))


In [34]:
# Smoke test: classify one hand-written sentence with the saved model.
# validation() takes the texts as a list and prints label and score.
text = ["I want a student loan"]
validation(text)

predicted category --> Action > Student loan
confidence score --> 0.56
