In [0]:
# Remove every input widget currently defined on this Databricks notebook.
dbutils.widgets.removeAll()

In [0]:
import pandas as pd


def _load_csv(path):
    """Read a comma-separated file with a header row into a Spark DataFrame.

    Schema inference is switched off, so the reader is not asked to guess
    column types from the data.
    """
    reader = spark.read.format('csv')
    reader = reader.option("inferSchema", 'false')
    reader = reader.option("header", 'true')
    reader = reader.option("sep", ',')
    return reader.load(path)


dfs_train = _load_csv('/FileStore/tables/guide/qvt_train.csv')
dfs_test = _load_csv('/FileStore/tables/guide/qvt_test.csv')

In [0]:
# Collect the Spark training frame into a pandas DataFrame, then preview
# ten randomly chosen rows as the cell output.
df_train = dfs_train.toPandas()
df_train.sample(n=10)

Unnamed: 0,queries,is_question
19731,What has been operated since 1562 at Transvaal...,1.0
128963,What do clinicians ignore the status of when t...,1.0
416630,materials system use,0.0
284825,nazi russian leaders meet discuss poland,0.0
32895,What is the spoken word of Chasidic Judiasm?,1.0
4286,How many years does the story of To Kill a Moc...,1.0
345624,join atlanta hawks,0.0
227111,percentage aeta women hunt,0.0
200635,Did he have any wins during this time?,1.0
268333,society friends also known name,0.0


In [0]:
# Collect the Spark test frame into a pandas DataFrame, then preview
# ten randomly chosen rows as the cell output.
df_test = dfs_test.toPandas()
df_test.sample(n=10)

Unnamed: 0,queries,is_question
2183,kustomer,0.0
970,calls report,0.0
2941,call bar,0.0
586,remove agent,0.0
2480,auto away,0.0
2799,call back reporting,0.0
1615,voice message,0.0
1519,call barging,0.0
1469,callbar salesforce,0.0
2915,new agent,0.0


In [0]:
# Seed for the random subsampling below. Without a fixed seed every
# Restart & Run All keeps a different half of the rows, so the reported
# scores are not reproducible.
SUBSAMPLE_SEED = 42

# Drop rows with missing values, then discard a random 50% of what remains
# (presumably to keep the embedding/training step fast -- confirm with author).
# Dropping by index (rather than keeping `sample(...)` directly) preserves the
# original row order of the surviving rows.
df_train = df_train.dropna()
df_train = df_train.drop(
    df_train.sample(frac=.5, random_state=SUBSAMPLE_SEED).index
)

df_test = df_test.dropna()
df_test = df_test.drop(
    df_test.sample(frac=.5, random_state=SUBSAMPLE_SEED).index
)

In [0]:
import mlflow
import mlflow.tensorflow

import tensorflow as tf
import tensorflow_hub as hub


In [0]:
import nltk

import re

# Compiled once at definition time instead of building a tokenizer object on
# every call. Matches maximal runs of word characters, which is exactly what
# a RegexpTokenizer(r"\w+") returns for a non-gap pattern (it calls `findall`).
_WORD_RE = re.compile(r"\w+")


def parse_query(qry):
    """Normalise a raw query string for embedding.

    The query is split into runs of word characters (letters, digits and
    underscore); everything else -- punctuation, whitespace -- acts as a
    separator and is discarded. Each token is lower-cased and the tokens are
    joined back together with single spaces.

    Args:
        qry: The raw query text (a ``str``).

    Returns:
        The normalised query as a single string; an empty string when the
        input contains no word characters.
    """
    return ' '.join(word.lower() for word in _WORD_RE.findall(qry))

def xyfromdf(df, embed):
    """Build the feature matrix and label vector from a query DataFrame.

    Args:
        df: DataFrame with a ``'queries'`` column (text) and an
            ``'is_question'`` column (labels).
        embed: Callable that takes a list of strings and returns an eager
            TensorFlow tensor of embeddings (one row per input string), e.g.
            a model loaded with ``hub.load``.

    Returns:
        A ``(X, y)`` tuple: ``X`` is the embeddings as a NumPy array and ``y``
        is the ``'is_question'`` column's values, unchanged.
    """
    qrs = [parse_query(q) for q in df['queries']]

    # Read the eager tensor directly. The former
    # tf.make_tensor_proto / tf.make_ndarray round-trip serialised the whole
    # matrix into a protobuf first, which costs an extra copy and fails for
    # tensors larger than the 2 GB TensorProto limit.
    X = embed(qrs).numpy()
    y = df['is_question'].values

    return X, y

In [0]:
import mlflow.sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score

# Record this training run under a fixed MLflow experiment.
mlflow.set_experiment('/Users/pedro.rodrigues@talkdesk.com/pr_qvt')

with mlflow.start_run() as run:
  # Sentence encoder used to turn each query into a feature vector.
  tfhub_enc = "https://tfhub.dev/google/universal-sentence-encoder/4"
  embed = hub.load(tfhub_enc)
  mlflow.log_param("encoder", tfhub_enc)

  # Embed both splits with the same encoder.
  X_train, y_train = xyfromdf(df_train, embed)
  X_test, y_test = xyfromdf(df_test, embed)

  clf_solver = 'lbfgs'
  mlflow.log_param("solver", clf_solver)
  clf = LogisticRegression(solver=clf_solver).fit(X_train, y_train)

  # `score` is the classifier's mean accuracy on the given data.
  score_train = clf.score(X_train, y_train)
  print("Train score: %s" % score_train)

  score_test = clf.score(X_test, y_test)
  print("Test score: %s" % score_test)

  mlflow.log_metric("score_train", score_train)
  mlflow.log_metric("score_test", score_test)
  mlflow.sklearn.log_model(clf, "model")

  # Use the run handle bound by the `with` statement and its `run_id`
  # attribute; `run_uuid` is deprecated in MLflow in favour of `run_id`.
  print("Model saved in run %s" % run.info.run_id)

In [0]:
# Confusion matrix on the held-out split (rows: true labels, columns:
# predicted labels). It is only printed: mlflow.log_metric takes a single
# numeric value, so the matrix cannot be logged through it as-is.
y_pred = clf.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

In [0]:
import numpy as np

def single_predict(q, clf, encoder=None):
    """Classify a single query string as question / not-question.

    Args:
        q: The raw query text.
        clf: A fitted classifier exposing ``predict``.
        encoder: Optional callable mapping a list of strings to an eager
            TensorFlow tensor of embeddings. When omitted, the notebook-level
            ``embed`` model is used, which keeps existing two-argument calls
            working while making the dependency explicit (mirroring
            ``xyfromdf``, which takes its encoder as an argument).

    Returns:
        ``True`` when the predicted label, converted with ``float``, is
        greater than 0.5; ``False`` otherwise.
    """
    if encoder is None:
        encoder = embed

    # Read the eager tensor directly instead of round-tripping it through a
    # TensorProto just to obtain a NumPy array.
    X = encoder([parse_query(q)]).numpy()

    p = clf.predict(X)[0]

    # float() also accepts numeric strings such as '1.0', so this works
    # whether the classifier was trained on numeric or string labels.
    return float(p) > .5

In [0]:
# Smoke-test the trained classifier on one hand-written query.
qry = 'am i a muppet'
is_question = single_predict(qry, clf)
print('the query is a question:', is_question)