# Install requirements

In [6]:
!pip install tensorflow_addons numpy pandas tensorflow sklearn nltk spacy textblob gensim scipy seaborn matplotlib minio mlflow wordcloud


Collecting tensorflow_addons
  Downloading tensorflow_addons-0.14.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 25.7 MB/s eta 0:00:01
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting nltk
  Downloading nltk-3.6.5-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 117.2 MB/s eta 0:00:01
[?25hCollecting spacy
  Downloading spacy-3.1.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.1 MB)
[K     |████████████████████████████████| 6.1 MB 26.1 MB/s eta 0:00:01
[?25hCollecting textblob
  Downloading textblob-0.15.3-py2.py3-none-any.whl (636 kB)
[K     |████████████████████████████████| 636 kB 112.1 MB/s eta 0:00:01
[?25hCollecting gensim
  Downloading gensim-4.1.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1 MB)
[K     |████████████████████████████████| 24.1 MB 31.6 MB/s eta 0:00:01
[?25hCollecting scipy
  Downloading scipy-1.7.1-cp38-cp

# Load Libraries

In [52]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn.feature_extraction.text as text
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from textblob import TextBlob
from nltk.stem import PorterStemmer,SnowballStemmer
from textblob import Word
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from wordcloud import WordCloudfrom wordcloud import WordCloud
from io import StringIO
import string
import gensim
from gensim.models import Word2Vec
import itertools
import scipy
from scipy import spatial
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
import joblib

import mlflow
import warnings

from minio import Minio
import subprocess
import ipynbname
warnings.filterwarnings("ignore")
import absl.logging
absl.logging.set_verbosity(absl.logging.ERROR)


tokenizer = ToktokTokenizer()
# stopword_list = nltk.download('stopwords')

# Define the config file for mlflow and minio

In [55]:
HOST = "http://mlflow:5500"

PROJECT_NAME = "NlpTc"
EXPERIMENT_NAME = "NlpLstm"

os.environ['MLFLOW_S3_ENDPOINT_URL']='http://minio-ml-workshop:9000'
os.environ['AWS_ACCESS_KEY_ID']='minio'
os.environ['AWS_SECRET_ACCESS_KEY']='minio123'
os.environ['AWS_REGION']='us-east-1'
os.environ['AWS_BUCKET_NAME']='raw-data-saeed'

In [56]:
def get_s3_server():
    minioClient = Minio('minio-ml-workshop:9000',
                    access_key='minio',
                    secret_key='minio123',
                    secure=False)

    return minioClient

In [57]:
client = get_s3_server()

# Load MLFlow to track the model

In [58]:
# from verta.utils import ModelAPI

# Connect to local MLflow tracking server
mlflow.set_tracking_uri(HOST)

# Set the experiment name...
mlflow.set_experiment(EXPERIMENT_NAME)

mlflow.tensorflow.autolog()

INFO: 'NlpLstm' does not exist. Creating a new experiment


In [59]:

def get_git_revision_hash():
    return subprocess.check_output(['git', 'rev-parse', 'HEAD'])

def get_git_revision_short_hash():
    return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'])

def get_git_remote():
    return subprocess.check_output(['git', 'config', '--get', 'remote.origin.url'])

def get_git_user():
    return subprocess.check_output(['git', 'config', 'user.name'])

def get_git_branch():
    return subprocess.check_output(['git', 'branch', '--show-current'])

def get_pip_freeze():
    return subprocess.check_output(['pip', 'freeze']).splitlines()


def record_details(mlflow):
    """
    This method is the anchor poijt and more activiteis will go in it
    :param mlflow:
    :return:
    """
    with open("pip_freeze.txt", "wb") as file:
        for line in get_pip_freeze():
            file.write(line)
            file.write(bytes("\n", "UTF-8"))
    mlflow.log_artifact("pip_freeze.txt")
    file.close()
    mlflow.log_artifact("model.h5", artifact_path="model")
    mlflow.log_artifact("tokenizer.pkl", artifact_path="model")
    mlflow.log_artifact("labelencoder.pkl", artifact_path="model")
    
    os.remove("pip_freeze.txt")
    os.remove("model.h5")
    os.remove("tokenizer.pkl")
    os.remove("labelencoder.pkl")



def mlflow_grid_search(methodtoexecute, methodarguments):
    with mlflow.start_run(tags= {
        "mlflow.source.git.commit" : get_git_revision_hash() ,
        "mlflow.user": get_git_user(),
        "mlflow.source.git.repoURL": get_git_remote(),
        "git_remote": get_git_remote(),
        "mlflow.source.git.branch": get_git_branch(),
        "mlflow.docker.image.name": os.getenv("JUPYTER_IMAGE", "LOCAL"),
        "mlflow.source.type": "NOTEBOOK",
#         "mlflow.source.name": ipynbname.name()
    }) as run:
        methodtoexecute(**methodarguments)
        record_details(mlflow)

    return run

In [60]:
def fetch_logged_data(run_id):
    client = mlflow.tracking.MlflowClient()
    data = client.get_run(run_id).data
    tags = {k: v for k, v in data.tags.items() if not k.startswith("mlflow.")}
    artifacts = [f.path for f in client.list_artifacts(run_id, "model")]
    return data.params, data.metrics, tags, artifacts

In [61]:
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [62]:
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to /opt/app-
[nltk_data]     root/src/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Readinng the data
Reading the data from S3 bucket


In [63]:
csv_file = client.get_object("raw-data-saeed", "data.csv")
df1 = pd.read_csv(csv_file)

# Word Cloud for all Product categories


In [66]:
# for product_name in df1['product'].unique():
#     print(product_name)
#     all_words = ' '.join([text for text in df1.loc[df1['product'].str.contains(product_name),'consumer_complaint_narrative']])
    
#     wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)

#     plt.figure(figsize=(10, 7))
#     plt.imshow(wordcloud, interpolation="bilinear")
#     plt.axis('off')
#     plt.show()


In [67]:
df1.shape

(2000, 13)

### Train/Test split


In [68]:
train_x, valid_x, train_y, valid_y = train_test_split(df1['consumer_complaint_narrative'], df1['product'],stratify=df1['product'], 
                                                    test_size=0.30)



In [69]:
input_x = train_x
ind = 2
train_x.iloc[ind]
train_y.iloc[ind]

'Credit reporting'

Feature engineering of consumer complaint with TF-IDF

In [70]:

##label encoding target variable
enc = preprocessing.LabelEncoder()
train_labels = enc.fit_transform(train_y)
test_labels = enc.fit_transform(valid_y)

print(enc.classes_)
print(np.unique(train_labels, return_counts=True))
print(np.unique(test_labels, return_counts=True))



['Bank account or service' 'Consumer Loan' 'Credit card'
 'Credit reporting' 'Debt collection' 'Money transfers' 'Mortgage'
 'Other financial service' 'Payday loan' 'Prepaid card' 'Student loan']
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), array([104,  79, 161, 212, 382,  19, 350,   2,  23,   8,  60]))
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10]), array([ 45,  33,  69,  91, 163,   8, 150,   1,  10,   4,  26]))


In [71]:

##tf-idf verctor representation
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(df1['consumer_complaint_narrative'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)


## Deep Learning models


In [72]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D, Conv1D, SimpleRNN
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers
from tensorflow.keras.layers import Dense, Input, Flatten, Dropout, BatchNormalization
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding
from tensorflow.keras.models import Sequential


In [73]:

total_complaints = np.append(train_x.values,valid_x.values)
tokenizer = Tokenizer(num_words=2000)
tokenizer.fit_on_texts(train_x.values)#total_complaints

train_sequences = tokenizer.texts_to_sequences(train_x.values)
test_sequences = tokenizer.texts_to_sequences(valid_x.values)


In [74]:

word_index = tokenizer.word_index# dictionary containing words and their index
print('Found %s unique tokens.' % len(word_index))


Found 7003 unique tokens.


In [75]:

MAX_SEQUENCE_LENGTH = max([len(c.split()) for c in total_complaints])
MAX_SEQUENCE_LENGTH


348

In [76]:

train_data = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH,padding='post')
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH,padding='post')
print(train_data.shape)
print(test_data.shape)


(1400, 348)
(600, 348)


In [77]:
enc.classes_

array(['Bank account or service', 'Consumer Loan', 'Credit card',
       'Credit reporting', 'Debt collection', 'Money transfers',
       'Mortgage', 'Other financial service', 'Payday loan',
       'Prepaid card', 'Student loan'], dtype=object)

In [78]:

labels_train = to_categorical(np.asarray(train_labels))
labels_test = to_categorical(np.asarray(test_labels))
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', labels_train.shape)
print('Shape of label tensor:', labels_test.shape)


Shape of data tensor: (1400, 348)
Shape of label tensor: (1400, 11)
Shape of label tensor: (600, 11)



## CNN w/ Pre-trained word embeddings(GloVe)
We’ll use pre-trained embeddings such as Glove which provides word based vector representation trained on a large corpus.

It is trained on a dataset of one billion tokens (words) with a vocabulary of 400 thousand words. The glove has embedding vector sizes, including 50, 100, 200 and 300 dimensions.



In [79]:
# !wget http://nlp.stanford.edu/data/glove.6B.zip
f = client.get_object("raw-data-saeed", "glove.6B.50d.txt")
embeddings_index = {}
# f = open(os.path.join(GLOVE_DIR, 'glove.6B.50d.txt'))
# f = open( 'glove.6B.50d.txt')
for line in f:
    # print(line.decode("utf-8") )
    line = line.decode("utf-8")
    # break
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


Now lets create the embedding matrix using the word indexer created from tokenizer.


In [80]:

EMBEDDING_DIM = 50
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector


Lets check the word embedded vector representation for token ‘loan’ in our embedding matrix


In [81]:


[(k,v) for k,v in word_index.items() if v==4]


[('loan', 4)]

In [82]:
embedding_matrix[4]  ## word embedded vector representation for token 'loan'

array([ 0.93484002,  0.40450999,  0.10856   , -0.61953998, -0.69220001,
        0.32119   , -0.70885003,  0.071233  , -0.33484   ,  0.77158999,
       -0.050077  ,  1.14460003,  0.01926   , -1.02590001,  0.85535002,
       -0.081615  ,  0.19649   , -0.051262  ,  0.40103999,  0.87255001,
        0.95371002, -0.87009001, -0.81568998, -0.24765   , -1.44400001,
       -0.88612998,  1.51440001, -0.014284  , -0.48023999, -0.32289001,
        3.00580001,  0.49408999,  0.72916001,  0.60891002,  0.59543997,
        0.49731001, -0.0057787 , -0.21278   ,  0.94937998, -2.16849995,
        0.12593   , -0.56818998,  0.50354999,  0.013716  , -1.01310003,
       -0.46805999,  0.17305   ,  1.62039995,  0.60404998,  0.063104  ])

In [83]:
vocab_size = len(tokenizer.word_index)+1


In [84]:
## RNN

In [85]:
#Bidirectional LSTM
EMBEDDING_DIM = 50
model = Sequential()
model.add(Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True))
model.add(Bidirectional(LSTM(100, dropout = 0.3, return_sequences=True)))
model.add(Bidirectional(LSTM(256, dropout = 0.3)))
model.add(Dense(11,activation='sigmoid'))
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])


In [86]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 348, 50)           350200    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 348, 200)          120800    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 512)               935936    
_________________________________________________________________
dense_1 (Dense)              (None, 11)                5643      
Total params: 1,412,579
Trainable params: 1,412,579
Non-trainable params: 0
_________________________________________________________________


In [184]:
checkpoint_filepath = 'model.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True)

In [185]:
x=train_data[1]

In [186]:
# model.fit(train_data, labels_train,
#                  batch_size=64,
#                  epochs=1,
#                  validation_data=(test_data, labels_test),callbacks=[model_checkpoint_callback])


In [187]:
# model2 = tf.keras.models.load_model('model.h5')
# ind =1
# x = [train_data[ind]]
# x = tf.constant(x, dtype=tf.int32)
# y=tf.math.argmax(labels_train[ind])
# print(y)
# pred = tf.math.argmax(tf.sigmoid(model(tf.constant(x))),axis=1)
# print(pred)


In [188]:
joblib.dump(enc, 'labelencoder.pkl')  
joblib.dump(tokenizer, 'tokenizer.pkl')  

with mlflow.start_run(tags= {
        "mlflow.source.git.commit" : get_git_revision_hash() ,
        "mlflow.user": get_git_user(),
        "mlflow.source.git.repoURL": get_git_remote(),
        "git_remote": get_git_remote(),
        "mlflow.source.git.branch": get_git_branch(),
        "mlflow.docker.image.name": os.getenv("JUPYTER_IMAGE", "LOCAL"),
        "mlflow.source.type": "NOTEBOOK",
#         "mlflow.source.name": ipynbname.name()
    }) as run:
        model.fit(train_data, labels_train,
                 batch_size=64,
                 epochs=2,
                 validation_data=(test_data, labels_test),callbacks=[model_checkpoint_callback])
        record_details(mlflow)
        

Epoch 1/2
Epoch 2/2
INFO:tensorflow:Assets written to: /tmp/tmphyzqhdyo/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmphyzqhdyo/model/data/model/assets
Traceback (most recent call last):
  File "/opt/app-root/lib/python3.8/site-packages/urllib3/connectionpool.py", line 465, in _make_request
    assert_header_parsing(httplib_response.msg)
  File "/opt/app-root/lib/python3.8/site-packages/urllib3/util/response.py", line 91, in assert_header_parsing
    raise HeaderParsingError(defects=defects, unparsed_data=unparsed_data)
urllib3.exceptions.HeaderParsingError: [MissingHeaderBodySeparatorDefect()], unparsed data: 'HTTP/1.1 200 OK\r\nAccept-Ranges: bytes\r\nContent-Length: 0\r\nContent-Security-Policy: block-all-mixed-content\r\nETag: "559a5aafbb4f7d3f83bd40e362213f9b"\r\nServer: MinIO\r\nStrict-Transport-Security: max-age=31536000; includeSubDomains\r\nVary: Origin\r\nVary: Accept-Encoding\r\nX-Amz-Request-Id: 16AE086943126AAC\r\nX-Content-Type-Options: nosniff\r\nX-Xss-Protection: 1; mode=block\r\nDate: Thu, 14 Oct 2021 23:09:22 GMT\r\n\r\n'


In [198]:
class BuildModel():
    '''
    Build Lstm model for tensorflow
    ----------

    Returns
    -------
    self.final_set:
        Features for modeling purpose
    self.labels:
        Output labels of the features
    enc: 
        Ordinal Encoder definition file
    ohe:
        One hot  Encoder definition file
    '''
    def __init__(self, emweights = embedding_matrix, EMBEDDING_DIM= 50,MAX_SEQUENCE_LENGTH= 348, loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc']):
        self.weights = [emweights]
        self.input_length = MAX_SEQUENCE_LENGTH
        self.embeding_dim = EMBEDDING_DIM
        self.loss = loss
        self.optimizer = optimizer
        self.metrics = metrics
        self.model = []
        
    def DefineModel(self):
        '''
        Define the model
        ----------
        
        Returns
        -------
        self.model
        '''
        #Bidirectional LSTM
        self.model = Sequential()
        self.model.add(Embedding(len(word_index) + 1,
                                    self.embeding_dim,
                                    weights=self.weights,
                                    input_length=self.input_length ,
                                    trainable=True))
        self.model.add(Bidirectional(LSTM(100, dropout = 0.4, return_sequences=True)))
        self.model.add(Bidirectional(LSTM(256, dropout = 0.4)))
        self.model.add(Dense(11,activation='sigmoid'))
        # return self.final_set,self.labels, self.enc, self.ohe,self.encoding_flag
    def CompileModel(self,loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc']):
        self.model.compile(loss=loss,
              optimizer=optimizer,
              metrics=metrics)
#         return self.model
    def BuildModel(self,loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc']):
        self.DefineModel()
        self.CompileModel()
        return self.model

In [199]:
model2 = BuildModel(emweights = embedding_matrix).BuildModel()

In [206]:
checkpoint_filepath = 'model.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True)

In [207]:
joblib.dump(enc, 'labelencoder.pkl')  
joblib.dump(tokenizer, 'tokenizer.pkl')  

with mlflow.start_run(tags= {
        "mlflow.source.git.commit" : get_git_revision_hash() ,
        "mlflow.user": get_git_user(),
        "mlflow.source.git.repoURL": get_git_remote(),
        "git_remote": get_git_remote(),
        "mlflow.source.git.branch": get_git_branch(),
        "mlflow.docker.image.name": os.getenv("JUPYTER_IMAGE", "LOCAL"),
        "mlflow.source.type": "NOTEBOOK",
#         "mlflow.source.name": ipynbname.name()
    }) as run:
        model2.fit(train_data, labels_train,
                 batch_size=64,
                 epochs=5,
                 validation_data=(test_data, labels_test),callbacks=[model_checkpoint_callback])
        record_details(mlflow)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
INFO:tensorflow:Assets written to: /tmp/tmpfayeoh0w/model/data/model/assets


INFO:tensorflow:Assets written to: /tmp/tmpfayeoh0w/model/data/model/assets
Traceback (most recent call last):
  File "/opt/app-root/lib/python3.8/site-packages/urllib3/connectionpool.py", line 465, in _make_request
    assert_header_parsing(httplib_response.msg)
  File "/opt/app-root/lib/python3.8/site-packages/urllib3/util/response.py", line 91, in assert_header_parsing
    raise HeaderParsingError(defects=defects, unparsed_data=unparsed_data)
urllib3.exceptions.HeaderParsingError: [MissingHeaderBodySeparatorDefect()], unparsed data: 'HTTP/1.1 200 OK\r\nAccept-Ranges: bytes\r\nContent-Length: 0\r\nContent-Security-Policy: block-all-mixed-content\r\nETag: "d22c236a0d8124d838665bf11ae54f63"\r\nServer: MinIO\r\nStrict-Transport-Security: max-age=31536000; includeSubDomains\r\nVary: Origin\r\nVary: Accept-Encoding\r\nX-Amz-Request-Id: 16AE0EB70069557D\r\nX-Content-Type-Options: nosniff\r\nX-Xss-Protection: 1; mode=block\r\nDate: Fri, 15 Oct 2021 01:04:53 GMT\r\n\r\n'


In [140]:
# !pip install mlflow
# !pip install minio
# !pip install boto3
# !pip install scikit-learn==0.24.2
# !pip install openshift-client==1.0.13
# !pip show mlflow
# !pip show minio
# !pip show boto3
# !pip show scikit-learn
# !pip show openshift-client

import os
import mlflow
from minio import Minio
import openshift as oc
from jinja2 import Template

os.environ['MLFLOW_S3_ENDPOINT_URL']='http://minio-ml-workshop:9000'
os.environ['AWS_ACCESS_KEY_ID']='minio'
os.environ['AWS_SECRET_ACCESS_KEY']='minio123'
os.environ['AWS_REGION']='us-east-1'
os.environ['AWS_BUCKET_NAME']='mlflow'
# os.environ['MODEL_NAME'] = 'rossdemo'
# os.environ['MODEL_VERSION'] = '1'
# os.environ['OPENSHIFT_CLIENT_PYTHON_DEFAULT_OC_PATH'] = '/tmp/oc'

HOST = "http://mlflow:5500"

model_name = 'lstmv18'
model_version = '1'
build_name = f"seldon-model-{model_name}-v{model_version}"

def get_s3_server():
    minioClient = Minio('minio-ml-workshop:9000',
                    access_key='minio',
                    secret_key='minio123',
                    secure=False)

    return minioClient


def init():
    mlflow.set_tracking_uri(HOST)
    print(HOST)
    # Set the experiment name...
    #mlflow_client = mlflow.tracking.MlflowClient(HOST)

    
def download_artifacts():
    print("retrieving model metadata from mlflow...")
    model = mlflow.pyfunc.load_model(
        model_uri=f"models:/{model_name}/{model_version}"
    )
    print(model)
    
    run_id = model.metadata.run_id
    experiment_id = mlflow.get_run(run_id).info.experiment_id
    
    print("initializing connection to s3 server...")
    minioClient = get_s3_server()

#     artifact_location = mlflow.get_experiment_by_name('rossdemo').artifact_location
#     print("downloading artifacts from s3 bucket " + artifact_location)

    data_file_model = minioClient.fget_object("mlflow", f"/{experiment_id}/{run_id}/artifacts/model/model.h5", "model.h5")
    data_file_model2 = minioClient.fget_object("mlflow", f"/{experiment_id}/{run_id}/artifacts/model/tokenizer.pkl", "tokenizer.pkl")
    data_file_model3 = minioClient.fget_object("mlflow", f"/{experiment_id}/{run_id}/artifacts/model/labelencoder.pkl", "labelencoder.pkl")


    #Using boto3 Download the files from mlflow, the file path is in the model meta
    #write the files to the file system
    print("download successful")
    
    return run_id
    
        
init()
run_id = download_artifacts()

http://mlflow:5500
retrieving model metadata from mlflow...
mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.keras
  run_id: 29b0d88e4848478d940c38a2ec30f985

initializing connection to s3 server...
download successful


In [141]:
import tensorflow as tf
import joblib
import numpy as np
import json
import traceback
import sys
class Predictor(object):

    def __init__(self):
        self.model = tf.keras.models.load_model('model.h5', compile=False)
        self.labelencoder = joblib.load('labelencoder.pkl')



    def predict(self, X,features_names):
        # data = request.get("data", {}).get("ndarray")
        # mult_types_array = np.array(data, dtype=object)
        print ('step1......')
        print(X)
        X = tf.constant(X)
        print ('step2......')
        print(X)
#         result = self.model.predict(X)
        try:
            result = self.model.predict(X)
        except Exception as e:
            print(traceback.format_exception(*sys.exc_info()))
            raise # reraises the exception
        

                
        print ('step3......')
        result = tf.sigmoid(result)
        print ('step4......')
        print(result)
        result = tf.math.argmax(result,axis=1)
        print ('step5......')
        print(result)
        print(result.shape)
        
        print(self.labelencoder.inverse_transform(result))
        print ('step6......')
        return json.dumps(result, cls=JsonSerializer)

class JsonSerializer(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, (
        np.int_, np.intc, np.intp, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64)):
            return int(obj)
        elif isinstance(obj, (np.float_, np.float16, np.float32, np.float64)):
            return float(obj)
        elif isinstance(obj, (np.ndarray,)):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

In [142]:

import pandas as pd
import joblib
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

import json

class Transformer(object):
    def __init__(self):
        self.tokenizer = joblib.load('tokenizer.pkl')
        
    def transform_input(self, X, feature_names, meta):
        # print(request)
        X = X.get("data", {}).get("ndarray")
        print(X)
        output = self.tokenizer.texts_to_sequences(X)
        print(X)
        
        print(output)
        output = pad_sequences(output, maxlen=348,padding='post')
        print(output)
#         output = tf.constant(output)
#         print(output)
        return output


# import tensorflow as tf
# import pandas as pd
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# class Transformer(object):
#     def __init__(self):
#         self.ordinalencoder = joblib.load('ordinalencoder.pkl')
        
#     def transform_input(self, request):
#         X = request.get("data", {}).get("ndarray")
#         feature_names = request.get("data", {}).get("names")
        
#         df = pd.DataFrame(X, columns=feature_names)
#         df = ordinalencoder.transform(df)
#         df = onehotencoder.transform(df)
#         train_sequences = tokenizer.texts_to_sequences(train_x.values)


#         #df = df.drop(['customerID'], axis=1)
#         return df.to_numpy()


In [None]:
model = joblib.load('model.h5')


In [168]:
ind = 2
train_y.iloc[ind]
train_x.iloc[ind]
train_y.iloc[ind]
train_x.iloc[ind]



'trying since 2015 get judgment report sued company court granted vacated judgment 2015 notified bureau time delete judgment within last week called talked transition told take long time verify information received letter yesterday told cleared 2015 bureau put judgment report 5 day filed transition telling take 90 day get trying qualify purchase home victim incompetence'

In [143]:
sample_data = {"data":
  {


        "names":
            [
              "Debt collection"
            ],
      "ndarray": ["could longer pay enormous charge hired company nl take either nothing pay day loan company accept term get several letter week threatened take civil action get check"]

  }
}


In [144]:
ready_data = Transformer().transform_input(sample_data,sample_data,sample_data)

['could longer pay enormous charge hired company nl take either nothing pay day loan company accept term get several letter week threatened take civil action get check']
['could longer pay enormous charge hired company nl take either nothing pay day loan company accept term get several letter week threatened take civil action get check']
[[35, 263, 21, 49, 787, 14, 1549, 88, 454, 188, 21, 22, 4, 14, 469, 321, 25, 94, 18, 121, 859, 88, 1371, 201, 25, 50]]
[[  35  263   21   49  787   14 1549   88  454  188   21   22    4   14
   469  321   25   94   18  121  859   88 1371  201   25   50    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0 

In [None]:
# model2 = tf.keras.models.load_model('model.h5')
# ind =1
# x = [train_data[ind]]
# x = tf.constant(x, dtype=tf.int32)
# y=tf.math.argmax(labels_train[ind])
# print(y)
# pred = tf.math.argmax(tf.sigmoid(model(tf.constant(x))),axis=1)
# print(pred)

In [145]:
output = Predictor().predict(ready_data,ready_data)

step1......
[[  35  263   21   49  787   14 1549   88  454  188   21   22    4   14
   469  321   25   94   18  121  859   88 1371  201   25   50    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0 

TypeError: Object of type EagerTensor is not JSON serializable

In [None]:
model.predict(ready_data)

In [None]:
history = model.fit(train_data, labels_train,
 batch_size=64,
 epochs=80,
 validation_data=(test_data, labels_test))

In [None]:
fig1 = plt.figure()
plt.plot(history.history['loss'],'r',linewidth=3.0)
plt.plot(history.history['val_loss'],'b',linewidth=3.0)
plt.legend(['Training loss', 'Validation Loss'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Loss',fontsize=16)
plt.title('Loss Curves :RNN - LSTM',fontsize=16)
plt.show()

In [None]:
fig1 = plt.figure()
plt.plot(history.history['acc'],'r',linewidth=3.0)
plt.plot(history.history['val_acc'],'b',linewidth=3.0)
plt.legend(['Training acc', 'Validation acc'],fontsize=18)
plt.xlabel('Epochs ',fontsize=16)
plt.ylabel('Accuracy',fontsize=16)
plt.title('Accuracy Curves :RNN - LSTM',fontsize=16)
plt.show()

In [None]:
#predictions on test data
predicted=model.predict(test_data)
predicted

In [None]:
#model evaluation
import sklearn
from sklearn.metrics import precision_recall_fscore_support as score
precision, recall, fscore, support = score(labels_test, predicted.round())
print('precision: \n{}'.format(precision))
print('recall: \n{}'.format(recall))
print('fscore: \n{}'.format(fscore))
print('support: \n{}'.format(support))
print("############################")

In [None]:
print(classification_report(labels_test, predicted.round(),target_names=df1['product'].unique()))


After hours of training we get good results with LSTM(type of recurrent neural network) compared to CNN. From the learning curves it is clear the model needs to be tuned for overfitting by selecting hyperparameters such as no of epochs via early stopping and dropout for regularization.

We could further improve our final result by ensembling our xgboost and Neural network models by using Logistic Regression as our base model.

