In [None]:
import pandas as pd
import ast
import csv
import nltk
import re

nltk.download("punkt")

In [None]:
##################################### Feature Engineering ###################################################

In [None]:
# Load the labeled dataset; later cells read its ID, Offsets, Label and Text columns.
df = pd.read_csv('Aboutlabeled.csv')

In [None]:
# Split ID at its first '.' (n=1 keeps any later dots in the second part),
# store the two parts as 'Article' and 'Line', then drop the combined column.
# Not idempotent: re-running this cell fails because 'ID' no longer exists.
df[['Article', 'Line']] = df['ID'].str.split(pat='.', n=1, expand=True).values
del df['ID']

In [None]:
# Offsets is stored as the text form of a Python literal; parse every cell
# back into the object it describes.
df['Offsets'] = df['Offsets'].apply(ast.literal_eval)

In [None]:
# Creating Multi label in Fasttext format.
# Each Offsets entry is iterated as a sequence of dicts with a 'label' key;
# the distinct labels of a row are prefixed with '__label__' and space-joined.
# sorted() keeps the label order stable: iterating a set of strings can differ
# between interpreter runs, which made the exported files change run to run.
df['Labels'] = df['Offsets'].apply(
    lambda spans: ' '.join(sorted({'__label__' + span['label'] for span in spans}))
)
# Rows whose overall Label is 'None' get the single explicit None label.
df.loc[df['Label'] == 'None', 'Labels'] = '__label__None'

In [None]:
# Preprocess the data and tokenize
def _clean_text(raw):
    """Lowercase, NLTK-tokenize, re-join on single spaces, then remove every
    character that is not a letter, digit or whitespace."""
    spaced = ' '.join(nltk.word_tokenize(raw.lower()))
    return re.sub(r'[^a-zA-Z0-9\s]', '', spaced)


df['Text'] = df['Text'].apply(_clean_text)

In [None]:
# Combining in the Fasttext format: per row, the label string first, then the
# cleaned text, separated by a single space.
df['FastText'] = df['Labels'] + ' ' + df['Text']
# Preview the first rows of the assembled frame.
df.head(10)

In [None]:
# Exporting Files into Train and Validation
# The split is positional: the first 13625 rows go to training and the last
# 3028 rows to validation. Both files are written without header, index or
# quoting so that every line is a raw fastText record.
export_opts = dict(header=None, index=None, quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\")
df.head(13625)[['FastText']].to_csv('pb.train.txt', **export_opts)
df.tail(3028)[['FastText']].to_csv('pb.valid.txt', **export_opts)

In [None]:
##################################### Hyperopt ###################################################

In [None]:
# Dummy classifier for registering the model in MLflow.
# A tiny scaler + SVC pipeline fitted on four toy points. The Hyperopt cell
# later passes this object to mlflow.sklearn.log_model, so it is this
# placeholder, not the fastText model, that gets logged.
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X, y)

In [None]:
# Setting up MLflow: point the client at the tracking server and select the
# experiment that subsequent runs are recorded under.
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import mlflow
# NOTE(review): the tracking server address is hard-coded; confirm it is still
# the intended endpoint before running.
tracking_uri = 'http://mlflo-mlflo-kg1i011s8hid-60b8dc955cae2952.elb.us-east-1.amazonaws.com'
mlflow.set_tracking_uri(tracking_uri)
experiment_name = 'pitchbook-about'
mlflow.set_experiment(experiment_name)

In [None]:
# Define Hyperopt objective function
from sklearn.metrics import f1_score
import fasttext

# Read the exported validation file back into a single 'text' column, then
# remove every space-separated token containing '__label__' so that only the
# text to classify remains.
def _without_labels(tokens):
    return ' '.join(tok for tok in tokens if '__label__' not in tok)


valid = pd.read_csv("pb.valid.txt", header=None, sep='\t', names=['text'])
valid['text'] = valid['text'].str.split(' ').apply(_without_labels)

def objective(params):
    """Hyperopt objective: train fastText with ``params`` and score the result.

    Args:
        params: dict with the keys 'lr', 'epoch', 'wordNgrams' and 'threshold'.
            'epoch' and 'wordNgrams' are cast to int before being passed to
            fastText (the search space samples them with hp.quniform).

    Returns:
        dict with 'loss' (the negated F1 of the 'About' class, so minimizing
        the loss maximizes F1), the evaluated 'params', and 'status'.

    Reads the module-level ``valid`` frame (texts to classify) and ``df``
    (ground-truth 'Label' of the last 3028 rows), and trains on 'pb.train.txt'.
    """
    lr = params['lr']
    epoch = int(params['epoch'])
    wordNgrams = int(params['wordNgrams'])
    threshold = params['threshold']

    model = fasttext.train_supervised(input="pb.train.txt", loss='ova', lr=lr, epoch=epoch, wordNgrams=wordNgrams)

    # A row is predicted 'None' only when '__label__None' is the sole label
    # that clears the threshold; any other outcome counts as 'About'.
    y_pred = []
    for text in valid['text']:
        labels = model.predict(text, k=-1, threshold=threshold)[0]
        only_none = len(labels) == 1 and labels[0] == '__label__None'
        y_pred.append('None' if only_none else 'About')

    # Ground truth for the same rows that were exported to pb.valid.txt.
    y_true = df['Label'].tail(3028)

    # Pin the label order explicitly so index 0 is always the 'About' class
    # instead of relying on the implicit sorted order of the labels present.
    # Assumes 'Label' only holds 'About' / 'None' — TODO confirm.
    score = f1_score(y_true, y_pred, labels=['About', 'None'], average=None)[0]

    return {'loss': -score, 'params': params, 'status': STATUS_OK}

In [None]:
# Defining Hyperopt search space
# The keys must match the names read by objective(): lr, epoch, wordNgrams, threshold.
space = { 
    'lr': hp.uniform('lr', 0.1, 1),  # learning rate
    'epoch': hp.quniform('epoch', 5, 50, 1),  # quantized with step 1; cast to int by the consumer
    'wordNgrams': hp.quniform('wordNgrams', 1, 6, 1),  # quantized with step 1; cast to int by the consumer
    'threshold': hp.uniform('threshold', 0.01, 0.4)  # probability cut-off passed to model.predict
}

In [None]:
# Run Hyperopt with MLflow tracking
trials = Trials()

with mlflow.start_run(run_name='nltk-multi-label'):
    # Search the space with TPE for 50 evaluations; argmin maps each
    # search-space label to its best sampled value.
    argmin = fmin(
        fn=objective,
        space=space,
        algo=tpe.suggest,
        max_evals=50,
        trials=trials)

    lr = argmin['lr']
    epoch = int(argmin['epoch'])
    wordNgrams = int(argmin['wordNgrams'])
    threshold = argmin['threshold']

    # Retrain once with the winning configuration and score it with the same
    # rule the objective uses.
    model = fasttext.train_supervised(input="pb.train.txt", loss='ova', lr=lr, epoch=epoch, wordNgrams=wordNgrams)

    # 'None' only when '__label__None' is the sole label above the threshold.
    predictions = []
    for text in valid['text']:
        labels = model.predict(text, k=-1, threshold=threshold)[0]
        only_none = len(labels) == 1 and labels[0] == '__label__None'
        predictions.append('None' if only_none else 'About')

    y_true = df['Label'].tail(3028)
    y_pred = predictions

    # Explicit label order: index 0 is the 'About' class regardless of which
    # labels happen to be present. Assumes 'Label' only holds 'About' / 'None'
    # — TODO confirm.
    score = f1_score(y_true, y_pred, labels=['About', 'None'], average=None)[0]

    mlflow.log_param("lr", lr)
    mlflow.log_param("epoch", epoch)
    mlflow.log_param("wordNgrams", wordNgrams)
    mlflow.log_param("threshold", threshold)

    mlflow.log_metric("f1_score", score)
    # The logged artifact is the placeholder sklearn pipeline `clf`, not the
    # fastText model trained above.
    mlflow.sklearn.log_model(clf, "model")

In [None]:
################## Final Model ###################################

In [None]:
# Training the model using optimized hyperparameters.
# NOTE(review): lr, epoch and wordNgrams are literals here — presumably copied
# from an earlier Hyperopt run; confirm they match the logged best trial.
import fasttext
model = fasttext.train_supervised(input="pb.train.txt", lr=	0.7468864248632575, epoch=27, wordNgrams=4, loss='ova')

In [None]:
# Doing a single text inference on local model
import nltk
import re

nltk.download("punkt")

# Apply the same cleaning as the training data: lowercase, tokenize, re-join,
# strip everything that is not a letter, digit or whitespace.
text = "Early on, ORCO changed its name to Organic Dyestuffs Corporation and, in 2014, further changed the name to Organic Dyes and Pigments LLC, reflecting its well established heritage in both pigments and dyes."
text = re.sub(r'[^a-zA-Z0-9\s]', '', ' '.join(nltk.word_tokenize(text.lower())))

pred = model.predict(text, k=-1, threshold=0.018211034677959557)
# 'None' only when '__label__None' is the single label above the threshold.
predicted_labels = list(pred[0])
pred_bool = len(predicted_labels) == 1 and '__label__None' in predicted_labels

print('None' if pred_bool else 'About')

In [None]:
# Saving the model to deploy to Sagemaker
# NOTE(review): the deployment cell points at an S3 'final_model.tar.gz'; no
# visible cell packages or uploads this .bin — confirm where that happens.
model.save_model("final_model.bin")

In [None]:
###################### Deployment #########################

In [None]:
# Setting up Sagemaker defaults
import sagemaker
from sagemaker import get_execution_role
import boto3
import json

sess = sagemaker.Session()
role = get_execution_role()
bucket = sess.default_bucket()
prefix = "fasttext/pretrained"
region_name = 'us-east-1'
# Resolve the BlazingText container through the public image_uris module
# rather than reaching it through the amazon_estimator module's namespace.
container = sagemaker.image_uris.retrieve("blazingtext", region_name, "1")
# NOTE(review): this location is a literal and does not use `bucket`/`prefix`
# defined above — confirm it matches the account the notebook runs in.
model_location = 's3://sagemaker-us-east-1-943579580584/fasttext/pretrained/final_model.tar.gz'

In [None]:
# Deploy the model
# Wrap the model artifact and container image in a SageMaker Model, then
# create an endpoint backed by a single ml.t2.medium instance.
pb_about = sagemaker.Model(
    image_uri=container, model_data=model_location, role=role, sagemaker_session=sess
)
pb_about.deploy(initial_instance_count=1, instance_type="ml.t2.medium")

from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import JSONSerializer

# Client bound to the endpoint just created; requests and responses are JSON.
predictor = sagemaker.Predictor(
    endpoint_name=pb_about.endpoint_name,
    sagemaker_session=sess,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)

In [None]:
import nltk
import re
import pandas as pd

nltk.download("punkt")

In [None]:
# Doing a single text inference on deployed Sagemaker model using Sagemaker SDK

text = "Headquartered"
# Same cleaning as the training data.
text = re.sub(r'[^a-zA-Z0-9\s]', '', ' '.join(nltk.word_tokenize(text.lower())))

sentences = [text]

# Ask the endpoint for the top 5 labels of each instance.
payload = {"instances": sentences,
          "configuration": {"k": 5}}

predictions = predictor.predict(payload)
predictions = pd.DataFrame.from_dict(predictions[0])

# Look up the probability of '__label__None' among the returned labels. The
# emptiness check must short-circuit: with the previous bitwise `&`, the
# `.values[0]` lookup still ran when the label was missing from the top-k and
# raised IndexError.
none_probs = predictions.loc[predictions['label'] == '__label__None', 'prob']
if not none_probs.empty and none_probs.iloc[0] > 0.999:
    print('None')
else:
    print("About")

In [None]:
# Doing a single text inference on deployed Sagemaker model using Python SDK boto3
import os
import io
import boto3
import json
import csv

runtime = boto3.client('runtime.sagemaker')

text = "David founded Independent Financial Consultants (IFC Finance.com), a successful Financial Services Advisory firm which provides consultancy on Financial Planning and Wealth Management to business owners and professional firms, in Ireland and UK."
# Same cleaning as the training data.
text = re.sub(r'[^a-zA-Z0-9\s]', '', ' '.join(nltk.word_tokenize(text.lower())))
sentences = [text]
# Ask the endpoint for the top 5 labels of each instance.
payload = {"instances": sentences,
          "configuration": {"k": 5}}
payload = json.dumps(payload, indent = 4)

# NOTE(review): the endpoint name is a literal from an earlier deployment;
# confirm it still exists before running.
response = runtime.invoke_endpoint(EndpointName="blazingtext-2021-08-09-17-57-44-099",
                                   ContentType='application/JSON',
                                   Body=payload)

predictions = json.loads(response['Body'].read().decode())
predictions = pd.DataFrame.from_dict(predictions[0])

# The emptiness check must short-circuit: with the previous bitwise `&`, the
# `.values[0]` lookup still ran when '__label__None' was missing from the
# top-k and raised IndexError.
none_probs = predictions.loc[predictions['label'] == '__label__None', 'prob']
if not none_probs.empty and none_probs.iloc[0] > 0.999:
    predicted_label = {'label': 'None'}
else:
    predicted_label = {'label': 'About'}
print(predicted_label)

In [None]:
############################ Lambda and API Gateway ############################

In [None]:
import os
import io
import boto3
import json
import csv
import re
import nltk
import pandas as pd

# Download the punkt tokenizer data into /tmp and add that directory to NLTK's
# search path so word_tokenize can find it.
# NOTE(review): presumably /tmp is used because it is the writable location in
# the Lambda runtime — confirm.
nltk.data.path.append("/tmp")
nltk.download("punkt", download_dir = "/tmp")

# grab environment variables
# ENDPOINT_NAME must be set; a missing variable raises KeyError at load time.
ENDPOINT_NAME = os.environ['ENDPOINT_NAME']
# SageMaker runtime client, created once at module load and used by lambda_handler.
runtime = boto3.client('runtime.sagemaker')

def lambda_handler(event, context):
    """Classify ``event['data']`` as 'About' or 'None' via the SageMaker endpoint.

    Args:
        event: mapping with a 'data' key holding the raw text to classify.
        context: Lambda context object (unused).

    Returns:
        ``{'label': 'None'}`` when the endpoint returns '__label__None' with a
        probability above 0.999, otherwise ``{'label': 'About'}``.

    Raises:
        KeyError: if ``event`` has no 'data' key.
    """
    print("Received event: " + json.dumps(event, indent=2))

    # The event is used directly; the earlier dumps/loads round trip was a no-op.
    text = event['data']
    print(text)

    # Same cleaning as the training data: lowercase, tokenize, re-join, strip
    # everything that is not a letter, digit or whitespace.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', ' '.join(nltk.word_tokenize(text.lower())))
    sentences = [text]
    # Ask the endpoint for the top 5 labels of the single instance.
    payload = {"instances": sentences,
              "configuration": {"k": 5}}
    payload = json.dumps(payload, indent = 4)

    response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
                                       ContentType='application/JSON',
                                       Body=payload)

    predictions = json.loads(response['Body'].read().decode())
    predictions = pd.DataFrame.from_dict(predictions[0])

    # The emptiness check must short-circuit: with the previous bitwise `&`,
    # the `.values[0]` lookup still ran when '__label__None' was missing from
    # the top-k and raised IndexError.
    none_probs = predictions.loc[predictions['label'] == '__label__None', 'prob']
    if not none_probs.empty and none_probs.iloc[0] > 0.999:
        predicted_label = {'label': 'None'}
    else:
        predicted_label = {'label': 'About'}
    print(predicted_label)
    return predicted_label

In [None]:
############################### GitHub CI/CD #############################

In [None]:
import pandas as pd
import ast
import csv
import nltk
import re
import fasttext
from sklearn.metrics import f1_score

nltk.download("punkt")

def returntestscore():
    """Rebuild the fastText dataset, retrain the final model and score it.

    Reads 'Aboutlabeled.csv', writes 'pb.train.txt' and 'pb.valid.txt' to the
    working directory, trains a fastText classifier (loss='ova') with fixed
    hyperparameters, and evaluates it on the last 3028 rows.

    Returns:
        F1 score of the 'About' class on those validation rows.
    """
    df = pd.read_csv('Aboutlabeled.csv')

    # Split ID at its first '.' into Article and Line, then drop ID.
    df[['Article', 'Line']] = df['ID'].str.split(pat='.', n=1, expand=True).values
    del df['ID']

    # Offsets is stored as the text form of a Python literal.
    df['Offsets'] = df['Offsets'].apply(ast.literal_eval)

    # Distinct labels per row in fastText '__label__' form. sorted() keeps the
    # order stable; iterating a set of strings can vary between interpreter
    # runs, which made the exported files differ from run to run.
    df['Labels'] = df['Offsets'].apply(
        lambda spans: ' '.join(sorted({'__label__' + span['label'] for span in spans}))
    )
    df.loc[df['Label'] == 'None', 'Labels'] = '__label__None'

    # Lowercase, tokenize, re-join, strip non-alphanumeric characters.
    df['Text'] = df['Text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', ' '.join(nltk.word_tokenize(x.lower()))))

    df['FastText'] = df['Labels'] + ' ' + df['Text']

    # Positional split: first 13625 rows train, last 3028 rows validate.
    export_opts = dict(header=None, index=None, quoting=csv.QUOTE_NONE, quotechar="", escapechar="\\")
    df.head(13625)[['FastText']].to_csv('pb.train.txt', **export_opts)
    df.tail(3028)[['FastText']].to_csv('pb.valid.txt', **export_opts)

    ################## Final Model ###################################
    model = fasttext.train_supervised(input="pb.train.txt", lr=0.7468864248632575, epoch=27, wordNgrams=4, loss='ova')

    # Reload the validation lines and strip the label tokens from each one.
    valid = pd.read_csv("pb.valid.txt", header=None, sep='\t', names=['text'])
    valid['text'] = valid['text'].str.split(' ').apply(lambda l: ' '.join([i for i in l if '__label__' not in i]))

    # 'None' only when '__label__None' is the sole label above the threshold.
    y_pred = []
    for text in valid['text']:
        labels = model.predict(text, k=-1, threshold=0.018211034677959557)[0]
        only_none = len(labels) == 1 and labels[0] == '__label__None'
        y_pred.append('None' if only_none else 'About')

    y_true = df['Label'].tail(3028)

    # Explicit label order: index 0 is the 'About' class regardless of which
    # labels happen to be present. Assumes 'Label' only holds 'About' / 'None'
    # — TODO confirm.
    score = f1_score(y_true, y_pred, labels=['About', 'None'], average=None)[0]

    return score


In [None]:
"""Unit test file for app.py"""
from app import returntestscore
import unittest

class TestApp(unittest.TestCase):
    """Regression checks for the scoring entry point imported from app.py."""

    def test_f1_score(self):
        """The score returned by returntestscore() must exceed 0.5."""
        f1_floor = 0.5
        self.assertGreater(returntestscore(), f1_floor)

if __name__ == "__main__":
    unittest.main()