In [1]:
import sys
# Put the parent directory first on the import path so that the `classifier`
# module (located at ../classifier.py) can be imported further down.
sys.path.insert(0, "..")

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pandas as pd

In [3]:
# Start (or reuse) a Spark session that runs locally on 4 worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName('mysession')
    .getOrCreate()
)

In [4]:
# Register with the Spark context the files the prediction tasks need:
# the model file, and the Python module that provides `make_udf`.
sc = spark.sparkContext
sc.addFile('../models/ft_tuned.ftz')
sc.addPyFile('../classifier.py')

In [5]:
from classifier import make_udf




## Build Spark DataFrame

In [6]:
# Read the parquet dataset; the cells below run predictions on its `input` column.
df_input = spark.read.parquet('../data/input.parquet')

# Approach 1: Standard UDFs

## Single prediction

In [7]:
# Build the UDF in single-prediction mode and store its result for each row
# in a new `category` column.
udf_predict = make_udf(multi_prediction=False)
df_output = df_input.withColumn(
    "category",
    udf_predict(col("input")),
)

# Preview 10 rows of a seeded ~10% sample, without truncating the cell text.
(
    df_output
    .sample(withReplacement=False, fraction=.10, seed=12345)
    .show(n=10, truncate=False)
)

+----------------------------------------------------------------------------+--------+
|input                                                                       |category|
+----------------------------------------------------------------------------+--------+
|is filestream lazy loaded in net                                            |.net    |
|programmatically launching standalone adobe flashplayer on linux/x11        |null    |
|encoding problem classic asp                                                |asp.net |
|c # winforms datagridview/sql compact negative integer in primary key column|c#      |
|suspending and notifying threads when there is work to do                   |java    |
|creating my own iterators                                                   |c#      |
|css `` see through '' background crazy navigation menu problem              |asp.net |
|sending email in net through gmail                                          |.net    |
|specify ordinals of c++ exporte

## Multiple prediction

In [8]:
# Rebuild the UDF in multi-prediction mode; `category` now holds a list of
# labels per row. Note that this rebinds `udf_predict` and `df_output`.
udf_predict = make_udf(multi_prediction=True)
df_output = df_input.withColumn(
    "category",
    udf_predict(col("input")),
)

# Preview 10 rows of a seeded ~10% sample, without truncating the cell text.
(
    df_output
    .sample(withReplacement=False, fraction=.10, seed=12345)
    .show(n=10, truncate=False)
)

+----------------------------------------------------------------------------+----------------------+
|input                                                                       |category              |
+----------------------------------------------------------------------------+----------------------+
|is filestream lazy loaded in net                                            |[.net, c#, asp.net]   |
|programmatically launching standalone adobe flashplayer on linux/x11        |null                  |
|encoding problem classic asp                                                |[asp.net, asp.net-mvc]|
|c # winforms datagridview/sql compact negative integer in primary key column|[c#, .net]            |
|suspending and notifying threads when there is work to do                   |[java, c#]            |
|creating my own iterators                                                   |[c#, .net]            |
|css `` see through '' background crazy navigation menu problem              |[asp

# Performance

In [9]:
%timeit -n 10 df_output.sample(False,.10).show(10)

+--------------------+--------------------+
|               input|            category|
+--------------------+--------------------+
|sql server and th...|   [sql-server, sql]|
|whats the best wa...|          [c#, .net]|
|how does tracerou...|          [c#, .net]|
|can sql server ex...|   [sql-server, sql]|
|how to return a p...|[sql-server, c#, ...|
|what is the aspne...|                null|
|eclipse text comp...|     [eclipse, java]|
|how would you att...|                [c#]|
|sending email in ...| [.net, c#, asp.net]|
|specify ordinals ...|               [c++]|
+--------------------+--------------------+
only showing top 10 rows

+--------------------+-----------------+
|               input|         category|
+--------------------+-----------------+
|sql server and th...|[sql-server, sql]|
|gantt chart contr...|  [windows, .net]|
|how do you manage...|             [c#]|
|how do i add cust...|             null|
|implementing and ...|            [c++]|
|why learn perl py...| [c++, c

+--------------------+--------------------+
|               input|            category|
+--------------------+--------------------+
|is filestream laz...| [.net, c#, asp.net]|
|how to effectivel...|                null|
|how does tracerou...|          [c#, .net]|
|carbide / symbian...|               [c++]|
|sql query count w...|[sql, sql-server,...|
|programmatically ...|                null|
|is there a way to...|                null|
|setting the heigh...|                [c#]|
|image archive vs ...|               [css]|
|is it true that t...|          [.net, c#]|
+--------------------+--------------------+
only showing top 10 rows

+--------------------+--------------------+
|               input|            category|
+--------------------+--------------------+
|c the definitive ...|             [c, c#]|
|how to pass an un...|                null|
|how does tracerou...|          [c#, .net]|
|how to disable vi...|     [visual-studio]|
|how can i get a l...|            [python]|
|post 

+--------------------+--------------------+
|               input|            category|
+--------------------+--------------------+
|how can i send an...|               [php]|
|are incrementers ...|               [c++]|
|while clause in t...|   [sql, sql-server]|
|vertical text wit...|[jquery, javascript]|
|how do i focus a ...|                null|
|be notified when ...|           [c#, c++]|
|casting array of ...|             [c#, c]|
|thotkey with win ...|                [c#]|
|unix socket imple...|              [java]|
|anybody know why ...|   [sql-server, sql]|
+--------------------+--------------------+
only showing top 10 rows

+--------------------+--------------------+
|               input|            category|
+--------------------+--------------------+
|how can i change ...|[jquery, javascri...|
|c the definitive ...|             [c, c#]|
|gantt chart contr...|     [windows, .net]|
|doctype rss & htm...|[html, asp.net, css]|
|why learn perl py...|    [c++, c, python]|
|how c

+--------------------+--------------------+
|               input|            category|
+--------------------+--------------------+
|vector shape on s...|                null|
|best update metho...|[mysql, sql, data...|
|programmatically ...|                null|
|implementing and ...|               [c++]|
|c # lambda expres...|                [c#]|
|c # in linux envi...|                [c#]|
|converting svg to...|                [c#]|
|encoding problem ...|[asp.net, asp.net...|
|is it possible to...|          [c#, .net]|
|eclipse text comp...|     [eclipse, java]|
+--------------------+--------------------+
only showing top 10 rows

+--------------------+-----------------+
|               input|         category|
+--------------------+-----------------+
|how to effectivel...|             null|
|mac iwork/pages a...|             null|
|carbide / symbian...|            [c++]|
|how to implement ...|             [c#]|
|ms sql 2000 turn ...|[sql-server, sql]|
|why learn perl py...| [c++, c

+--------------------+------------+
|               input|    category|
+--------------------+------------+
|making an image g...|        null|
|how can i send an...|       [php]|
|how do you manage...|        [c#]|
|game programming ...|        null|
|how do you quickl...|        null|
|c # in linux envi...|        [c#]|
|how can i create ...|  [c#, .net]|
|creating my own i...|  [c#, .net]|
|actionscript3 to ...|[javascript]|
|what techniques c...|        [c#]|
+--------------------+------------+
only showing top 10 rows

+--------------------+------------+
|               input|    category|
+--------------------+------------+
|c the definitive ...|     [c, c#]|
|build tar file fr...|       [php]|
|c # lambda expres...|        [c#]|
|eclipse hide path...|        null|
|how can i determi...|      [java]|
|should i provide ...|      [java]|
|class methods as ...|[javascript]|
|css `` see throug...|   [asp.net]|
|how to serialize ...|        [c#]|
|can you set or wh...|        [c#]|
+-

+--------------------+---------------+
|               input|       category|
+--------------------+---------------+
|build tar file fr...|          [php]|
|how do you deal w...|      [asp.net]|
|building flex pro...|         [flex]|
|post from one con...|           null|
|is it true that t...|     [.net, c#]|
|what s the term f...|           null|
|how to serialize ...|           [c#]|
|can you set or wh...|           [c#]|
|which css tag cre...|           null|
|why is visual stu...|[visual-studio]|
+--------------------+---------------+
only showing top 10 rows

+--------------------+------------+
|               input|    category|
+--------------------+------------+
|making an image g...|        null|
|prevent long word...|   [asp.net]|
|how do you deal w...|   [asp.net]|
|building flex pro...|      [flex]|
|how can i get a l...|    [python]|
|how to generate u...|  [java, c#]|
|how can i create ...|  [c#, .net]|
|is it possible to...|  [c#, .net]|
|class methods as ...|[javascrip