### Import Libraries

In [None]:
!pip install mlrun

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:
from mlrun import mlconf

mlconf.dbpath = '/content/drive/MyDrive/Hackathon'
mlconf.artifact_path = '/content/drive/MyDrive/Hackathon/Data'



In [2]:
from os import path
import mlrun

project_name_base = 'suicide-pred'

project_name, artifact_path = mlrun.set_environment(project=project_name_base, user_project=True)

print(f'Project name: {project_name}')
print(f'Artifact path: {artifact_path}')

Project name: suicide-pred-root
Artifact path: /content/drive/MyDrive/Hackathon/Data


<h2>MLRun Functions

In [3]:
# nuclio: start-code

In [4]:

import pickle
from pickle import dumps
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import nltk
from os import path
import os


from mlrun.execution import MLClientCtx
from mlrun.datastore import DataItem

# nltk.download()
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def fetch_data(context : MLClientCtx, data_path: DataItem):
    
    context.logger.info('Reading data from {}'.format(data_path))
    suicide_dataset = pd.read_csv(str(data_path))
    
    
    target_path = path.join(context.artifact_path, 'data')
    context.logger.info('Saving datasets to {} ...'.format(target_path))

    # Store the data sets in your artifacts database
    context.log_dataset('suicide_dataset', df=suicide_dataset, format='csv',
                        index=False, artifact_path=target_path)
    

In [6]:
def preprocess_tweet(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    text = re.sub('[\W]+', ' ', text.lower())
    text = text+' '.join(emoticons).replace('-', '') 
    return text

In [7]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

from nltk.corpus import stopwords
stop = stopwords.words('english')

In [8]:
def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\(|D|P)',text.lower())
    text = re.sub('[\W]+', ' ', text.lower())
    text += ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in tokenizer_porter(text) if w not in stop]
    return tokenized

In [9]:
from sklearn.feature_extraction.text import HashingVectorizer
vect = HashingVectorizer(decode_error='ignore', n_features=2**21, 
                         preprocessor=None,tokenizer=tokenizer)

In [10]:
def transform_dataset(context : MLClientCtx, data: DataItem):

  context.logger.info('Begin datasets transform')

  df = data.as_df()
  df['tweet'] = df['tweet'].apply(lambda x: preprocess_tweet(x))
  # print(len(df["tweet"].to_list()))
  # k = list(vect.transform(df["tweet"].values))
  # print(len(k))
  # df['tweet'] = list(vect.transform(df["tweet"].values))

  target_path = path.join(context.artifact_path, 'data')
  context.log_dataset('suicide_dataset_transformed', df=df, artifact_path=target_path, format='csv')    


In [11]:
def train_model(context: MLClientCtx, input_ds: DataItem):


  context.logger.info('Begin training')
  from sklearn.linear_model import SGDClassifier
  clf = SGDClassifier(loss='log', random_state=1)

  df = input_ds.as_df()
  X = df["tweet"].to_list()
  y = df['label']

  from sklearn.model_selection import train_test_split
  X_train,X_test,y_train,y_test = train_test_split(X,
                                                  y,
                                                  test_size=0.20,
                                                  random_state=0)  
  X_train = vect.transform(X_train)
  X_test = vect.transform(X_test)

  classes = np.array([0, 1])
  clf.partial_fit(X_train, y_train,classes=classes)

  print('Accuracy: %.3f' % clf.score(X_test, y_test))

  context.log_model('Suicide_Model',
                     body=dumps(clf),
                     artifact_path=context.artifact_subpath("models"),
                     model_file="Suicide_Model.pkl")
  

  context.logger.info('End training')

In [52]:
import numpy as np
from cloudpickle import load

class SuicideModel(mlrun.serving.V2ModelServer):
    
    def load(self):
        model_file, extra_data = self.get_model('.pkl')
        self.model = load(open(model_file, 'rb'))

    def predict(self, body):
        try:
            feats = body['inputs'][0]
            # feats = feats.decode('ISO-8859-1')
            feats = preprocess_tweet(feats)
            l = []
            l.append(feats)
            feats = vect.transform(l)
            print('hi')
            result = self.model.predict(feats)
            return result.tolist()
        except Exception as e:
            raise Exception("Failed to predict %s" % e)

In [53]:
# nuclio: end-code

<h2>MLRun Procedure 

In [13]:
suicide_func = mlrun.code_to_function(name='suicide', kind='job', filename = '/content/drive/MyDrive/Hackathon/code-MLRun.ipynb')

<h3> Fetch

In [14]:
fetch_data_run = suicide_func.run(handler='fetch_data',
                               inputs={'data_path': '/content/drive/MyDrive/Hackathon/suicidal_data.csv'},
                               local=True)

> 2021-07-03 15:34:21,849 [info] starting run suicide-fetch_data uid=ec18b6e60fd54ac2b02ca337265bd200 DB=/content/drive/MyDrive/Hackathon
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
> 2021-07-03 15:34:21,885 [info] Reading data from /content/drive/MyDrive/Hackathon/suicidal_data.csv
> 2021-07-03 15:34:21,970 [info] Saving datasets to /content/drive/MyDrive/Hackathon/Data/data ...


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
suicide-pred-root,...5bd200,0,Jul 03 15:34:21,completed,suicide-fetch_data,kind=owner=roothost=d005ddeacf7e,data_path,,,suicide_dataset


to track results use .show() or .logs() or in CLI: 
!mlrun get run ec18b6e60fd54ac2b02ca337265bd200 --project suicide-pred-root , !mlrun logs ec18b6e60fd54ac2b02ca337265bd200 --project suicide-pred-root
> 2021-07-03 15:34:22,290 [info] run executed, status=completed


In [15]:
fetch_data_run.outputs

{'suicide_dataset': 'store://artifacts/suicide-pred-root/suicide-fetch_data_suicide_dataset:ec18b6e60fd54ac2b02ca337265bd200'}

<h3> Test

In [16]:
transform_dataset_run = suicide_func.run(name='transform_dataset',
                                      handler='transform_dataset',
                                      inputs={'data': fetch_data_run.outputs['suicide_dataset']},
                                      local=True)

> 2021-07-03 15:34:22,612 [info] starting run transform_dataset uid=c3f461601bee455790cc09f1b0d6e8bf DB=/content/drive/MyDrive/Hackathon
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
> 2021-07-03 15:34:22,727 [info] Begin datasets transform


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
suicide-pred-root,...d6e8bf,0,Jul 03 15:34:22,completed,transform_dataset,kind=owner=roothost=d005ddeacf7e,data,,,suicide_dataset_transformed


to track results use .show() or .logs() or in CLI: 
!mlrun get run c3f461601bee455790cc09f1b0d6e8bf --project suicide-pred-root , !mlrun logs c3f461601bee455790cc09f1b0d6e8bf --project suicide-pred-root
> 2021-07-03 15:34:23,371 [info] run executed, status=completed


In [17]:
transform_dataset_run.outputs

{'suicide_dataset_transformed': 'store://artifacts/suicide-pred-root/transform_dataset_suicide_dataset_transformed:c3f461601bee455790cc09f1b0d6e8bf'}

<h3> Train

In [18]:
train_model_run = suicide_func.run(name='train_model',
                                handler='train_model',
                                inputs={'input_ds': transform_dataset_run.outputs['suicide_dataset_transformed']},
                                local=True)

> 2021-07-03 15:34:27,760 [info] starting run train_model uid=262bee6ad4d34289825c751cea2e22b0 DB=/content/drive/MyDrive/Hackathon
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
> 2021-07-03 15:34:27,876 [info] Begin training
Accuracy: 0.912
> 2021-07-03 15:34:43,240 [info] End training


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
suicide-pred-root,...2e22b0,0,Jul 03 15:34:27,completed,train_model,kind=owner=roothost=d005ddeacf7e,input_ds,,,Suicide_Model


to track results use .show() or .logs() or in CLI: 
!mlrun get run 262bee6ad4d34289825c751cea2e22b0 --project suicide-pred-root , !mlrun logs 262bee6ad4d34289825c751cea2e22b0 --project suicide-pred-root
> 2021-07-03 15:34:43,659 [info] run executed, status=completed


In [20]:
train_model_run.outputs

{'Suicide_Model': 'store://artifacts/suicide-pred-root/train_model_Suicide_Model:9bd09ee800ff4dd5a29753e24bf5f662'}

<h3> Serving 

In [54]:
serving = mlrun.code_to_function('seving', filename='/content/drive/MyDrive/Hackathon/code-MLRun.ipynb', kind='serving')

serving.spec.default_class = 'SuicideModel'
serving.add_model('suicide-serving', train_model_run.outputs['Suicide_Model'])
# serving_address = serving.deploy()

<mlrun.serving.states.TaskState at 0x7fb8824f6ed0>

In [57]:
my_data = '''{"inputs":["I'll kill myself am tired of living depressed and alone"]}'''

server = serving.to_mock_server()
server.test("/v2/models/suicide-serving/infer", body=my_data)

> 2021-07-03 19:38:08,672 [info] model suicide-serving was loaded
> 2021-07-03 19:38:08,676 [info] Loaded ['suicide-serving']
hi


{'id': '7e8441fa8b4e429f8a75dc7c01761372',
 'model_name': 'suicide-serving',
 'outputs': [1]}

In [58]:
my_data = '''{"inputs":["It's such a hot day, I'd like to have ice cream and visit the park"]}'''

server = serving.to_mock_server()
server.test("/v2/models/suicide-serving/infer", body=my_data)

> 2021-07-03 19:39:13,061 [info] model suicide-serving was loaded
> 2021-07-03 19:39:13,063 [info] Loaded ['suicide-serving']
hi


{'id': '9d49bf632e38457ebf790655edec0205',
 'model_name': 'suicide-serving',
 'outputs': [0]}