In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from classifier import make_udf
import pandas as pd




In [2]:
# Obtain the Spark entry point for this notebook: getOrCreate() hands back an
# already-running session when one exists, otherwise it starts a new one
# registered under the application name 'mysession'.
spark = (
    SparkSession.builder
    .appName('mysession')
    .getOrCreate()
)

## Build Spark DataFrame

In [3]:
# Read the Parquet dataset into a Spark DataFrame. The path is relative to the
# notebook's working directory; later cells read its `input` column.
df_input = spark.read.format('parquet').load('data/input.parquet')

# Approach 1: Standard UDFs

## Single prediction

In [4]:
# Ask the classifier module for its UDF with multi_prediction switched off.
udf_predict = make_udf(multi_prediction=False)

# Apply the UDF to the `input` column and store the result as `category`.
df_output = df_input.withColumn("category", udf_predict(col("input")))

# Preview up to 10 rows of a ~10% sample drawn without replacement. The fixed
# seed keeps the sample repeatable; truncate=False prints full cell contents.
(
    df_output
    .sample(withReplacement=False, fraction=.10, seed=12345)
    .show(n=10, truncate=False)
)

+----------------------------------------------------------------------------+--------+
|input                                                                       |category|
+----------------------------------------------------------------------------+--------+
|is filestream lazy loaded in net                                            |.net    |
|programmatically launching standalone adobe flashplayer on linux/x11        |null    |
|encoding problem classic asp                                                |asp.net |
|c # winforms datagridview/sql compact negative integer in primary key column|c#      |
|suspending and notifying threads when there is work to do                   |java    |
|creating my own iterators                                                   |c#      |
|css `` see through '' background crazy navigation menu problem              |asp.net |
|sending email in net through gmail                                          |.net    |
|specify ordinals of c++ exporte

## Multiple predictions

In [5]:
# Ask the classifier module for its UDF with multi_prediction switched on.
udf_predict = make_udf(multi_prediction=True)

# Apply the UDF to the `input` column and store the result as `category`.
# NOTE: `df_output` is read again by the timing cell further down.
df_output = df_input.withColumn("category", udf_predict(col("input")))

# Preview up to 10 rows of a ~10% sample drawn without replacement. The fixed
# seed keeps the sample repeatable; truncate=False prints full cell contents.
(
    df_output
    .sample(withReplacement=False, fraction=.10, seed=12345)
    .show(n=10, truncate=False)
)

+----------------------------------------------------------------------------+----------------------+
|input                                                                       |category              |
+----------------------------------------------------------------------------+----------------------+
|is filestream lazy loaded in net                                            |[.net, c#, asp.net]   |
|programmatically launching standalone adobe flashplayer on linux/x11        |null                  |
|encoding problem classic asp                                                |[asp.net, asp.net-mvc]|
|c # winforms datagridview/sql compact negative integer in primary key column|[c#, .net]            |
|suspending and notifying threads when there is work to do                   |[java, c#]            |
|creating my own iterators                                                   |[c#, .net]            |
|css `` see through '' background crazy navigation menu problem              |[asp

# Performance

In [6]:
%timeit -n 10 df_output.sample(False,.10).show(10,False)

+--------------------------------------------------------------------------------------+---------------------+
|input                                                                                 |category             |
+--------------------------------------------------------------------------------------+---------------------+
|build tar file from directory in php without exec/passthru                            |[php]                |
|how to consume json web services from a windows client                                |[.net]               |
|python beyond the basics                                                              |[python]             |
|how to pass an unpersisted modified object from view back to controller without a form|null                 |
|how to return a page of results from sql                                              |[sql-server, c#, sql]|
|how do you quickly find the url for a win32 api on msdn                               |null                 |
|

+-----------------------------------------------------------------------------------+----------------------------------+
|input                                                                              |category                          |
+-----------------------------------------------------------------------------------+----------------------------------+
|how does traceroute work                                                           |[c#, .net]                        |
|sql query count with 0 count                                                       |[sql, sql-server, sql-server-2005]|
|how to disable visual studio macro `` tip '' balloon                               |[visual-studio]                   |
|how do i extract the version and path from an svn working copy into a nant variable|null                              |
|be notified when visual/logical child added/removed                                |[c#, c++]                         |
|css `` see through '' backgroun

+------------------------------------------------------------------------------------------+--------------------+
|input                                                                                     |category            |
+------------------------------------------------------------------------------------------+--------------------+
|how to disable visual studio macro `` tip '' balloon                                      |[visual-studio]     |
|implementing and enforcing coding standards                                               |[c++]               |
|how can i get a list of available wireless networks on linux                              |[python]            |
|what is the best way to see what files are locked in subversion                           |null                |
|be notified when visual/logical child added/removed                                       |[c#, c++]           |
|css `` see through '' background crazy navigation menu problem                         

+---------------------------------------------------------------------+-----------------+
|input                                                                |category         |
+---------------------------------------------------------------------+-----------------+
|how do you manage infragistics webgrid data from javascript/ajax code|[c#]             |
|prevent long word to add horizontal scroll to html view              |[asp.net]        |
|how do i add custom column to existing wss list template             |null             |
|how to effectively implement sessions in gae                         |null             |
|building flex projects in ant/nant                                   |[flex]           |
|how to disable visual studio macro `` tip '' balloon                 |[visual-studio]  |
|what is the aspnet_client folder for under the iis structure         |null             |
|numbering regex submatches                                           |[php]            |
|parsing t

+-------------------------------------------------------------------------------+----------------------+
|input                                                                          |category              |
+-------------------------------------------------------------------------------+----------------------+
|sql server and the guest account what is this for                              |[sql-server, sql]     |
|whats the best way to start using mylyn                                        |[c#, .net]            |
|are incrementers / decrementers var++ var etc thread safe                      |[c++]                 |
|best update method for mysql db                                                |[mysql, sql, database]|
|c # winforms datagridview/sql compact negative integer in primary key column   |[c#, .net]            |
|can you use an alias in the where clause in mysql                              |[mysql]               |
|what is the best way to determine the number of days i

+-------------------------------------------------------------------------------+-----------------+
|input                                                                          |category         |
+-------------------------------------------------------------------------------+-----------------+
|vector shape on stage appears over dynamic textfield                           |null             |
|mac iwork/pages automation                                                     |null             |
|building flex projects in ant/nant                                             |[flex]           |
|while clause in t sql that loops forever                                       |[sql, sql-server]|
|how can i create prototype methods like javascript in c # net                  |[c#, .net]       |
|what design pattern to use for user authentication in java                     |[java]           |
|what is the best way to determine the number of days in a month with javascript|[javascript]     |


+----------------------------------------------------------------------------+--------------------------+
|input                                                                       |category                  |
+----------------------------------------------------------------------------+--------------------------+
|how can i change html attribute names with jquery                           |[jquery, javascript, html]|
|should i have one class for every database i use                            |[java]                    |
|how to return a page of results from sql                                    |[sql-server, c#, sql]     |
|post from one controller action to another not redirect                     |null                      |
|how do i call net code c # /vb net from vbscript                            |[c#, .net]                |
|vertical text with jquery                                                   |[jquery, javascript]      |
|how can i determine the ip of my router/gatew

+----------------------------------------------------------------------------+----------------------------------+
|input                                                                       |category                          |
+----------------------------------------------------------------------------+----------------------------------+
|gantt chart controls on windows forms                                       |[windows, .net]                   |
|c # lambda expressions or delegates as a properties or arguments            |[c#]                              |
|post from one controller action to another not redirect                     |null                              |
|how to generate unit test code for methods                                  |[java, c#]                        |
|c # winforms datagridview/sql compact negative integer in primary key column|[c#, .net]                        |
|what design pattern to use for user authentication in java                  |[java]    

+--------------------------------------------------------------------------------------------------------+----------------------+
|input                                                                                                   |category              |
+--------------------------------------------------------------------------------------------------------+----------------------+
|c the definitive truth about rand random and arc4random                                                 |[c, c#]               |
|should i have one class for every database i use                                                        |[java]                |
|how does traceroute work                                                                                |[c#, .net]            |
|can sql server express be used to effectively administrate a sql server standard/enterprise installation|[sql-server, sql]     |
|best update method for mysql db                                                          

+----------------------------------------------------------------------------------------------+--------------------------+
|input                                                                                         |category                  |
+----------------------------------------------------------------------------------------------+--------------------------+
|making an image greyscale with gd library                                                     |null                      |
|how can i change html attribute names with jquery                                             |[jquery, javascript, html]|
|getting odd error on net executenonquery                                                      |[.net, c#]                |
|why learn perl python ruby if the company is using c++ c # or java as the application language|[c++, c, python]          |
|c # lambda expressions or delegates as a properties or arguments                              |[c#]                      |
|how do 

+-------------------------------------------------------------+----------------------------------+
|input                                                        |category                          |
+-------------------------------------------------------------+----------------------------------+
|how can i change html attribute names with jquery            |[jquery, javascript, html]        |
|game programming and event handlers                          |null                              |
|sql query count with 0 count                                 |[sql, sql-server, sql-server-2005]|
|ms sql 2000 turn off logging during stored procedure         |[sql-server, sql]                 |
|how can i get a list of available wireless networks on linux |[python]                          |
|how do you quickly find the url for a win32 api on msdn      |null                              |
|how can i create prototype methods like javascript in c # net|[c#, .net]                        |
|convertin

+--------------------------------------------------------------------+--------------------+
|input                                                               |category            |
+--------------------------------------------------------------------+--------------------+
|transforming selected text with a hotkey                            |null                |
|vector shape on stage appears over dynamic textfield                |null                |
|carbide / symbian c++ change application icon                       |[c++]               |
|how to disable visual studio macro `` tip '' balloon                |[visual-studio]     |
|programmatically launching standalone adobe flashplayer on linux/x11|null                |
|ms sql 2000 turn off logging during stored procedure                |[sql-server, sql]   |
|doctype rss & html entities                                         |[html, asp.net, css]|
|what is the aspnet_client folder for under the iis structure        |null      

+------------------------------------------------------------------------------------+-----------------+
|input                                                                               |category         |
+------------------------------------------------------------------------------------+-----------------+
|how do you deal with connection strings when deploying an asp net site              |[asp.net]        |
|vector shape on stage appears over dynamic textfield                                |null             |
|how do you quickly find the url for a win32 api on msdn                             |null             |
|post from one controller action to another not redirect                             |null             |
|suspending and notifying threads when there is work to do                           |[java, c#]       |
|how do you maintain large t sql procedures                                          |[sql, sql-server]|
|how can i ban a whole company from my web site        