In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from classifier import make_udf
import pandas as pd




In [2]:
# Obtain the notebook's Spark session: getOrCreate() reuses an already
# active session if there is one, otherwise it starts a new one.
spark = (
    SparkSession.builder
    .appName('mysession')
    .getOrCreate()
)

## Build Spark DataFrame

In [3]:
# Load the input data from the parquet file; the previews further down show
# that it carries a text column named `input`.
df_input = spark.read.load('data/input.parquet', format='parquet')

# Approach 1: Standard UDFs

## Single prediction

In [4]:
# Build the UDF in single-prediction mode and use it to derive a `category`
# column from the `input` column.
# NOTE(review): make_udf comes from the local `classifier` module; the
# recorded output suggests it yields one label (or null) per row -- confirm.
udf_predict = make_udf(multi_prediction=False)
df_output = df_input.withColumn(
    "category",
    udf_predict(col("input")),
)

# Preview: seeded 10% sample without replacement, at most 10 rows,
# with column truncation disabled so full values are visible.
(
    df_output
    .sample(withReplacement=False, fraction=0.10, seed=12345)
    .show(n=10, truncate=False)
)

+----------------------------------------------------------------------------+--------+
|input                                                                       |category|
+----------------------------------------------------------------------------+--------+
|is filestream lazy loaded in net                                            |.net    |
|programmatically launching standalone adobe flashplayer on linux/x11        |null    |
|encoding problem classic asp                                                |asp.net |
|c # winforms datagridview/sql compact negative integer in primary key column|c#      |
|suspending and notifying threads when there is work to do                   |java    |
|creating my own iterators                                                   |c#      |
|css `` see through '' background crazy navigation menu problem              |asp.net |
|sending email in net through gmail                                          |.net    |
|specify ordinals of c++ exporte

## Multiple predictions

In [5]:
# Rebuild the UDF in multi-prediction mode and recompute `category`.
# This rebinds `udf_predict` and `df_output`; cells below that reference
# `df_output` therefore see the multi-prediction result.
# NOTE(review): the recorded output suggests `category` becomes a list of
# labels (or null) per row in this mode -- confirm against `classifier`.
udf_predict = make_udf(multi_prediction=True)
df_output = df_input.withColumn(
    "category",
    udf_predict(col("input")),
)

# Preview: same seeded 10% sample as used for the single-prediction preview,
# at most 10 rows, untruncated.
(
    df_output
    .sample(withReplacement=False, fraction=0.10, seed=12345)
    .show(n=10, truncate=False)
)

+----------------------------------------------------------------------------+----------------------+
|input                                                                       |category              |
+----------------------------------------------------------------------------+----------------------+
|is filestream lazy loaded in net                                            |[.net, c#, asp.net]   |
|programmatically launching standalone adobe flashplayer on linux/x11        |null                  |
|encoding problem classic asp                                                |[asp.net, asp.net-mvc]|
|c # winforms datagridview/sql compact negative integer in primary key column|[c#, .net]            |
|suspending and notifying threads when there is work to do                   |[java, c#]            |
|creating my own iterators                                                   |[c#, .net]            |
|css `` see through '' background crazy navigation menu problem              |[asp

# Performance

In [6]:
%timeit -n 10 df_output.sample(False,.10).show(10)

+--------------------+--------------------+
|               input|            category|
+--------------------+--------------------+
|how can i change ...|[jquery, javascri...|
|can sql server ex...|   [sql-server, sql]|
|sql query count w...|[sql, sql-server,...|
|authoritative sou...|     [.net, php, c#]|
|is it possible to...|          [c#, .net]|
|what design patte...|              [java]|
|setting the heigh...|                [c#]|
|stopping msi from...|          [.net, c#]|
|sending email in ...| [.net, c#, asp.net]|
|what is the simpl...|            [python]|
+--------------------+--------------------+
only showing top 10 rows

+--------------------+--------------------+
|               input|            category|
+--------------------+--------------------+
|how can i change ...|[jquery, javascri...|
|c the definitive ...|             [c, c#]|
|gantt chart contr...|     [windows, .net]|
|whats the best wa...|          [c#, .net]|
|carbide / symbian...|               [c++]|
|what 

+--------------------+----------------+
|               input|        category|
+--------------------+----------------+
|transforming sele...|            null|
|why learn perl py...|[c++, c, python]|
|c # lambda expres...|            [c#]|
|best way to use a...|          [java]|
|what is the aspne...|            null|
|what is the best ...|    [javascript]|
|setting the heigh...|            [c#]|
|numbering regex s...|           [php]|
|casting array of ...|         [c#, c]|
|actionscript3 to ...|    [javascript]|
+--------------------+----------------+
only showing top 10 rows

+--------------------+-----------------+
|               input|         category|
+--------------------+-----------------+
|whats the best wa...|       [c#, .net]|
|should i have one...|           [java]|
|why learn perl py...| [c++, c, python]|
|db side encryptio...|             null|
|suspending and no...|       [java, c#]|
|what design patte...|           [java]|
|creating my own i...|       [c#, .net]|
|par

+--------------------+--------------------+
|               input|            category|
+--------------------+--------------------+
|how do you manage...|                [c#]|
|vector shape on s...|                null|
|are incrementers ...|               [c++]|
|sql query count w...|[sql, sql-server,...|
|post from one con...|                null|
|is it possible to...|          [c#, .net]|
|c # winforms data...|          [c#, .net]|
|daemon threads ex...|                null|
|image archive vs ...|               [css]|
|stopping msi from...|          [.net, c#]|
+--------------------+--------------------+
only showing top 10 rows

+--------------------+--------------------+
|               input|            category|
+--------------------+--------------------+
|how can i change ...|[jquery, javascri...|
|whats the best wa...|          [c#, .net]|
|implementing and ...|               [c++]|
|how can i get a l...|            [python]|
|how do you quickl...|                null|
|is th

+--------------------+--------------------+
|               input|            category|
+--------------------+--------------------+
|while clause in t...|   [sql, sql-server]|
|encoding problem ...|[asp.net, asp.net...|
|eclipse hide path...|                null|
|ant and the avail...|              [java]|
|class methods as ...|        [javascript]|
|what is the simpl...|            [python]|
|  using lists in c #|                [c#]|
|rendered pixel wi...|                null|
|why is app_offlin...|          [.net, c#]|
|unicode vs str de...|            [python]|
+--------------------+--------------------+
only showing top 10 rows

+--------------------+-------------------+
|               input|           category|
+--------------------+-------------------+
|c the definitive ...|            [c, c#]|
|the necessity of ...|              [c++]|
|eclipse text comp...|    [eclipse, java]|
|how can i determi...|             [java]|
|what s the term f...|               null|
|stopping msi 

+--------------------+--------------------+
|               input|            category|
+--------------------+--------------------+
|how can i change ...|[jquery, javascri...|
|how to consume js...|              [.net]|
|mac iwork/pages a...|                null|
|are incrementers ...|               [c++]|
|how to disable vi...|     [visual-studio]|
|while clause in t...|   [sql, sql-server]|
|why learn perl py...|    [c++, c, python]|
|db side encryptio...|                null|
|daemon threads ex...|                null|
|how do i extract ...|                null|
+--------------------+--------------------+
only showing top 10 rows

+--------------------+-----------------+
|               input|         category|
+--------------------+-----------------+
|python beyond the...|         [python]|
|how to effectivel...|             null|
|game programming ...|             null|
|ms sql 2000 turn ...|[sql-server, sql]|
|c # lambda expres...|             [c#]|
|how do you quickl...|        

+--------------------+---------------+
|               input|       category|
+--------------------+---------------+
|build tar file fr...|          [php]|
|getting odd error...|     [.net, c#]|
|are incrementers ...|          [c++]|
|what is the aspne...|           null|
|daemon threads ex...|           null|
|eclipse text comp...|[eclipse, java]|
|be notified when ...|      [c#, c++]|
|numbering regex s...|          [php]|
|how do i calculat...|           null|
|how to add a `` d...|         [java]|
+--------------------+---------------+
only showing top 10 rows

406 ms ± 31.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
