In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from classifier import predict_serie, make_pandas_udf
import pandas as pd




In [2]:
# Reuse the active Spark session if there is one; otherwise start one named 'mysession'.
spark = (
    SparkSession.builder
    .appName('mysession')
    .getOrCreate()
)

# Enable Arrow for Spark <-> pandas data exchange.
# NOTE(review): this key is deprecated on Spark 3.x in favour of
# "spark.sql.execution.arrow.pyspark.enabled" -- confirm the target Spark version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

## Build Spark DataFrame

In [3]:
# Load the input table from Parquet into a Spark DataFrame.
df_input = spark.read.parquet(
    'data/input.parquet',
)

### Take Pandas Sample

In [4]:
# Draw a reproducible ~10% sample (no replacement, fixed seed) and bring it to the driver as pandas.
pdf_sample = (
    df_input
    .sample(withReplacement=False, fraction=0.10, seed=12345)
    .toPandas()
)

# Preview the first five rows (head() defaults to 5).
pdf_sample.head()

Unnamed: 0,input
0,is filestream lazy loaded in net
1,programmatically launching standalone adobe fl...
2,encoding problem classic asp
3,c # winforms datagridview/sql compact negative...
4,suspending and notifying threads when there is...


### Test the `predict_serie` function

**Single prediction, native vectorized inference**

In [5]:
# Single prediction (2nd arg False), native vectorized inference (3rd arg False);
# show the first five results as a one-column frame.
(
    predict_serie(pdf_sample["input"], False, False)
    .head()
    .to_frame()
)

Unnamed: 0,0
0,.net
1,
2,asp.net
3,c#
4,java


**Multiple prediction, native vectorized inference**

In [6]:
# Multiple prediction (2nd arg True), native vectorized inference (3rd arg False);
# show the first five results as a one-column frame.
(
    predict_serie(pdf_sample["input"], True, False)
    .head()
    .to_frame()
)

Unnamed: 0,0
0,"[.net, c#, asp.net]"
1,
2,"[asp.net, asp.net-mvc]"
3,"[c#, .net]"
4,"[java, c#]"


**Single prediction, rowwise inference**

In [7]:
# Single prediction (2nd arg False), rowwise inference (3rd arg True);
# show the first five results as a one-column frame.
(
    predict_serie(pdf_sample["input"], False, True)
    .head()
    .to_frame()
)

Unnamed: 0,input
0,.net
1,
2,asp.net
3,c#
4,java


**Multiple prediction, rowwise inference**

In [8]:
# Multiple prediction (2nd arg True), rowwise inference (3rd arg True);
# show the first five results as a one-column frame.
(
    predict_serie(pdf_sample["input"], True, True)
    .head()
    .to_frame()
)

Unnamed: 0,input
0,"[.net, c#, asp.net]"
1,
2,"[asp.net, asp.net-mvc]"
3,"[c#, .net]"
4,"[java, c#]"


**Performance comparison: Native vectorized vs. Rowwise**

In [9]:
# Time multi-prediction with native vectorized inference (3rd arg False) on the pandas sample, 20 loops per run.
%timeit -n 20 predict_serie(pdf_sample.input,True,False)

1.73 ms ± 151 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


In [10]:
# Time multi-prediction with rowwise inference (3rd arg True) on the pandas sample, 20 loops per run.
%timeit -n 20 predict_serie(pdf_sample.input,True,True)

2.19 ms ± 102 µs per loop (mean ± std. dev. of 7 runs, 20 loops each)


# Approach 2.1: Pandas UDFs with (native) vectorized inference

## Single prediction

In [11]:
# Pandas UDF: one predicted category per row, native vectorized inference.
udf_predict = make_pandas_udf(rowwise=False, multi_prediction=False)

# Append the prediction as a new "category" column.
df_output = df_input.withColumn("category", udf_predict(col("input")))

# Print 10 untruncated rows from a reproducible ~10% sample.
df_output.sample(withReplacement=False, fraction=0.10, seed=12345).show(n=10, truncate=False)

+----------------------------------------------------------------------------+--------+
|input                                                                       |category|
+----------------------------------------------------------------------------+--------+
|is filestream lazy loaded in net                                            |.net    |
|programmatically launching standalone adobe flashplayer on linux/x11        |null    |
|encoding problem classic asp                                                |asp.net |
|c # winforms datagridview/sql compact negative integer in primary key column|c#      |
|suspending and notifying threads when there is work to do                   |java    |
|creating my own iterators                                                   |c#      |
|css `` see through '' background crazy navigation menu problem              |asp.net |
|sending email in net through gmail                                          |.net    |
|specify ordinals of c++ exporte

## Multiple prediction

In [12]:
# Pandas UDF: multiple predicted categories per row, native vectorized inference.
udf_predict = make_pandas_udf(rowwise=False, multi_prediction=True)

# Append the predictions as a new "category" column.
df_output = df_input.withColumn("category", udf_predict(col("input")))

# Print 10 untruncated rows from a reproducible ~10% sample.
df_output.sample(withReplacement=False, fraction=0.10, seed=12345).show(n=10, truncate=False)

+----------------------------------------------------------------------------+----------------------+
|input                                                                       |category              |
+----------------------------------------------------------------------------+----------------------+
|is filestream lazy loaded in net                                            |[.net, c#, asp.net]   |
|programmatically launching standalone adobe flashplayer on linux/x11        |null                  |
|encoding problem classic asp                                                |[asp.net, asp.net-mvc]|
|c # winforms datagridview/sql compact negative integer in primary key column|[c#, .net]            |
|suspending and notifying threads when there is work to do                   |[java, c#]            |
|creating my own iterators                                                   |[c#, .net]            |
|css `` see through '' background crazy navigation menu problem              |[asp

# Approach 2.2: Pandas UDFs with rowwise inference (using Pandas' `apply` method)

## Single prediction

In [13]:
# Pandas UDF: one predicted category per row, rowwise inference.
udf_predict = make_pandas_udf(rowwise=True, multi_prediction=False)

# Append the prediction as a new "category" column.
df_output = df_input.withColumn("category", udf_predict(col("input")))

# Print 10 untruncated rows from a reproducible ~10% sample.
df_output.sample(withReplacement=False, fraction=0.10, seed=12345).show(n=10, truncate=False)

+----------------------------------------------------------------------------+--------+
|input                                                                       |category|
+----------------------------------------------------------------------------+--------+
|is filestream lazy loaded in net                                            |.net    |
|programmatically launching standalone adobe flashplayer on linux/x11        |null    |
|encoding problem classic asp                                                |asp.net |
|c # winforms datagridview/sql compact negative integer in primary key column|c#      |
|suspending and notifying threads when there is work to do                   |java    |
|creating my own iterators                                                   |c#      |
|css `` see through '' background crazy navigation menu problem              |asp.net |
|sending email in net through gmail                                          |.net    |
|specify ordinals of c++ exporte

## Multiple prediction

In [14]:
# Pandas UDF: multiple predicted categories per row, rowwise inference.
udf_predict = make_pandas_udf(rowwise=True, multi_prediction=True)

# Append the predictions as a new "category" column.
df_output = df_input.withColumn("category", udf_predict(col("input")))

# Print 10 untruncated rows from a reproducible ~10% sample.
df_output.sample(withReplacement=False, fraction=0.10, seed=12345).show(n=10, truncate=False)

+----------------------------------------------------------------------------+----------------------+
|input                                                                       |category              |
+----------------------------------------------------------------------------+----------------------+
|is filestream lazy loaded in net                                            |[.net, c#, asp.net]   |
|programmatically launching standalone adobe flashplayer on linux/x11        |null                  |
|encoding problem classic asp                                                |[asp.net, asp.net-mvc]|
|c # winforms datagridview/sql compact negative integer in primary key column|[c#, .net]            |
|suspending and notifying threads when there is work to do                   |[java, c#]            |
|creating my own iterators                                                   |[c#, .net]            |
|css `` see through '' background crazy navigation menu problem              |[asp

# Performance comparison of the Pandas UDFs: native vectorized vs. rowwise

In [15]:
%%timeit -n 10 
udf_predict = make_pandas_udf(multi_prediction=True,rowwise=False)
df_output = df_input.withColumn("category",udf_predict(col("input")))
df_output.sample(False,.10).show(10,False)

+--------------------------------------------------------------------------------------------------------+----------------------------------+
|input                                                                                                   |category                          |
+--------------------------------------------------------------------------------------------------------+----------------------------------+
|how to consume json web services from a windows client                                                  |[.net]                            |
|whats the best way to start using mylyn                                                                 |[c#, .net]                        |
|how to pass an unpersisted modified object from view back to controller without a form                  |null                              |
|can sql server express be used to effectively administrate a sql server standard/enterprise installation|[sql-server, sql]                 |
|sql q

+----------------------------------------------------------------------------+---------------------+
|input                                                                       |category             |
+----------------------------------------------------------------------------+---------------------+
|making an image greyscale with gd library                                   |null                 |
|whats the best way to start using mylyn                                     |[c#, .net]           |
|carbide / symbian c++ change application icon                               |[c++]                |
|how to return a page of results from sql                                    |[sql-server, c#, sql]|
|is it possible to define in a dependent dll s application config            |[c#, .net]           |
|c # winforms datagridview/sql compact negative integer in primary key column|[c#, .net]           |
|eclipse hide paths in the `` open resource '' dialog                        |null         

+---------------------------------------------------------------------+----------------------------------+
|input                                                                |category                          |
+---------------------------------------------------------------------+----------------------------------+
|making an image greyscale with gd library                            |null                              |
|how do you manage infragistics webgrid data from javascript/ajax code|[c#]                              |
|how to consume json web services from a windows client               |[.net]                            |
|prevent long word to add horizontal scroll to html view              |[asp.net]                         |
|sql query count with 0 count                                         |[sql, sql-server, sql-server-2005]|
|encoding problem classic asp                                         |[asp.net, asp.net-mvc]            |
|what is the best way to see what fil

+----------------------------------------------------------------------------------------------------------------+----------------------+
|input                                                                                                           |category              |
+----------------------------------------------------------------------------------------------------------------+----------------------+
|should i have one class for every database i use                                                                |[java]                |
|how to implement a singleton in c #                                                                             |[c#]                  |
|eclipse hide paths in the `` open resource '' dialog                                                            |null                  |
|daemon threads explanation                                                                                      |null                  |
|which css tag creates a box like 

+--------------------------------------------------------------------------------------------------------+---------------------+
|input                                                                                                   |category             |
+--------------------------------------------------------------------------------------------------------+---------------------+
|transforming selected text with a hotkey                                                                |null                 |
|build tar file from directory in php without exec/passthru                                              |[php]                |
|how do i add custom column to existing wss list template                                                |null                 |
|can sql server express be used to effectively administrate a sql server standard/enterprise installation|[sql-server, sql]    |
|building flex projects in ant/nant                                                              

+--------------------------------------------------------------------------------------------------------+-------------------+
|input                                                                                                   |category           |
+--------------------------------------------------------------------------------------------------------+-------------------+
|is filestream lazy loaded in net                                                                        |[.net, c#, asp.net]|
|can sql server express be used to effectively administrate a sql server standard/enterprise installation|[sql-server, sql]  |
|why learn perl python ruby if the company is using c++ c # or java as the application language          |[c++, c, python]   |
|percentages of subtotal in a report                                                                     |[c++, java]        |
|authenticating against active directory with java on linux                                              |[java

+--------------------------------------------------------------------------------------------------------+---------------------+
|input                                                                                                   |category             |
+--------------------------------------------------------------------------------------------------------+---------------------+
|build tar file from directory in php without exec/passthru                                              |[php]                |
|game programming and event handlers                                                                     |null                 |
|can sql server express be used to effectively administrate a sql server standard/enterprise installation|[sql-server, sql]    |
|while clause in t sql that loops forever                                                                |[sql, sql-server]    |
|c # lambda expressions or delegates as a properties or arguments                                

+--------------------------------------------------------------+-------------------+
|input                                                         |category           |
+--------------------------------------------------------------+-------------------+
|sql server and the guest account what is this for             |[sql-server, sql]  |
|build tar file from directory in php without exec/passthru    |[php]              |
|how to consume json web services from a windows client        |[.net]             |
|is filestream lazy loaded in net                              |[.net, c#, asp.net]|
|how do i add custom column to existing wss list template      |null               |
|should i have one class for every database i use              |[java]             |
|db side encryption via nhibernate                             |null               |
|is there a way to asynchronously filter an ilist              |null               |
|class methods as event handlers in javascript                 |[

+--------------------------------------------------------------------------------------------------------+----------------------+
|input                                                                                                   |category              |
+--------------------------------------------------------------------------------------------------------+----------------------+
|how do i add custom column to existing wss list template                                                |null                  |
|can sql server express be used to effectively administrate a sql server standard/enterprise installation|[sql-server, sql]     |
|best update method for mysql db                                                                         |[mysql, sql, database]|
|the necessity of hiding the salt for a hash                                                             |[c++]                 |
|eclipse hide paths in the `` open resource '' dialog                                     

+----------------------------------------------------------------------------------------------+-----------------+
|input                                                                                         |category         |
+----------------------------------------------------------------------------------------------+-----------------+
|sql server and the guest account what is this for                                             |[sql-server, sql]|
|build tar file from directory in php without exec/passthru                                    |[php]            |
|how to effectively implement sessions in gae                                                  |null             |
|getting odd error on net executenonquery                                                      |[.net, c#]       |
|why learn perl python ruby if the company is using c++ c # or java as the application language|[c++, c, python] |
|c # lambda expressions or delegates as a properties or arguments               

+-------------------------------------------------------------------------------+---------------------+
|input                                                                          |category             |
+-------------------------------------------------------------------------------+---------------------+
|python beyond the basics                                                       |[python]             |
|are incrementers / decrementers var++ var etc thread safe                      |[c++]                |
|programmatically launching standalone adobe flashplayer on linux/x11           |null                 |
|ms sql 2000 turn off logging during stored procedure                           |[sql-server, sql]    |
|how to return a page of results from sql                                       |[sql-server, c#, sql]|
|is it possible to define in a dependent dll s application config               |[c#, .net]           |
|what is the best way to determine the number of days in a month

+------------------------------------------------------------------------+--------------------+
|input                                                                   |category            |
+------------------------------------------------------------------------+--------------------+
|making an image greyscale with gd library                               |null                |
|transforming selected text with a hotkey                                |null                |
|build tar file from directory in php without exec/passthru              |[php]               |
|python beyond the basics                                                |[python]            |
|doctype rss & html entities                                             |[html, asp.net, css]|
|how to generate unit test code for methods                              |[java, c#]          |
|what is the best way to see what files are locked in subversion         |null                |
|creating my own iterators              

In [16]:
%%timeit -n 10 
udf_predict = make_pandas_udf(multi_prediction=True,rowwise=True)
df_output = df_input.withColumn("category",udf_predict(col("input")))
df_output.sample(False,.10).show(10,False)

+--------------------------------------------------------------------------------------+--------------------+
|input                                                                                 |category            |
+--------------------------------------------------------------------------------------+--------------------+
|how to consume json web services from a windows client                                |[.net]              |
|prevent long word to add horizontal scroll to html view                               |[asp.net]           |
|how do you deal with connection strings when deploying an asp net site                |[asp.net]           |
|vector shape on stage appears over dynamic textfield                                  |null                |
|how to pass an unpersisted modified object from view back to controller without a form|null                |
|c # in linux environment                                                              |[c#]                |
|what is t

+--------------------------------------------------------------------+---------------------+
|input                                                               |category             |
+--------------------------------------------------------------------+---------------------+
|transforming selected text with a hotkey                            |null                 |
|is filestream lazy loaded in net                                    |[.net, c#, asp.net]  |
|programmatically launching standalone adobe flashplayer on linux/x11|null                 |
|how to return a page of results from sql                            |[sql-server, c#, sql]|
|c # in linux environment                                            |[c#]                 |
|how can i create prototype methods like javascript in c # net       |[c#, .net]           |
|suspending and notifying threads when there is work to do           |[java, c#]           |
|can you use an alias in the where clause in mysql                   |

+---------------------------------------------------------------------------------------------------------------------------+-----------------+
|input                                                                                                                      |category         |
+---------------------------------------------------------------------------------------------------------------------------+-----------------+
|authoritative source on xml sig                                                                                            |[.net, php, c#]  |
|programmatically launching standalone adobe flashplayer on linux/x11                                                       |null             |
|converting svg to png using c #                                                                                            |[c#]             |
|is there a way to asynchronously filter an ilist                                                                           |null       

+------------------------------------------------------------------------------------------+----------------------+
|input                                                                                     |category              |
+------------------------------------------------------------------------------------------+----------------------+
|python beyond the basics                                                                  |[python]              |
|how does traceroute work                                                                  |[c#, .net]            |
|implementing and enforcing coding standards                                               |[c++]                 |
|how to generate unit test code for methods                                                |[java, c#]            |
|encoding problem classic asp                                                              |[asp.net, asp.net-mvc]|
|is it possible to define in a dependent dll s application config       

+--------------------------------------------------------------+---------------+
|input                                                         |category       |
+--------------------------------------------------------------+---------------+
|getting odd error on net executenonquery                      |[.net, c#]     |
|authoritative source on xml sig                               |[.net, php, c#]|
|post from one controller action to another not redirect       |null           |
|c # in linux environment                                      |[c#]           |
|suspending and notifying threads when there is work to do     |[java, c#]     |
|setting the height of a div dynamically                       |[c#]           |
|how would you attack this polymorphism string building problem|[c#]           |
|numbering regex submatches                                    |[php]          |
|using lists in c #                                            |[c#]           |
|decoding chunked http with 

+-------------------------------------------------------------------------------+----------------------------------+
|input                                                                          |category                          |
+-------------------------------------------------------------------------------+----------------------------------+
|sql server and the guest account what is this for                              |[sql-server, sql]                 |
|prevent long word to add horizontal scroll to html view                        |[asp.net]                         |
|is filestream lazy loaded in net                                               |[.net, c#, asp.net]               |
|carbide / symbian c++ change application icon                                  |[c++]                             |
|sql query count with 0 count                                                   |[sql, sql-server, sql-server-2005]|
|suspending and notifying threads when there is work to do      

+----------------------------------------------------------------------------------------------+--------------------------+
|input                                                                                         |category                  |
+----------------------------------------------------------------------------------------------+--------------------------+
|how can i change html attribute names with jquery                                             |[jquery, javascript, html]|
|game programming and event handlers                                                           |null                      |
|best update method for mysql db                                                               |[mysql, sql, database]    |
|why learn perl python ruby if the company is using c++ c # or java as the application language|[c++, c, python]          |
|suspending and notifying threads when there is work to do                                     |[java, c#]                |
|image a

+-------------------------------------------------------------+---------------------+
|input                                                        |category             |
+-------------------------------------------------------------+---------------------+
|getting odd error on net executenonquery                     |[.net, c#]           |
|carbide / symbian c++ change application icon                |[c++]                |
|are incrementers / decrementers var++ var etc thread safe    |[c++]                |
|building flex projects in ant/nant                           |[flex]               |
|how to return a page of results from sql                     |[sql-server, c#, sql]|
|how can i create prototype methods like javascript in c # net|[c#, .net]           |
|daemon threads explanation                                   |null                 |
|is there a way to asynchronously filter an ilist             |null                 |
|ant and the available task what if something is not a

+-------------------------------------------------------------------------------+---------------+
|input                                                                          |category       |
+-------------------------------------------------------------------------------+---------------+
|how to consume json web services from a windows client                         |[.net]         |
|should i have one class for every database i use                               |[java]         |
|how to disable visual studio macro `` tip '' balloon                           |[visual-studio]|
|what is the best way to determine the number of days in a month with javascript|[javascript]   |
|casting array of objects which implement interface ifoo to ifoo in c #         |[c#, c]        |
|why is app_offline failing to work as soon as you it starts loading dlls       |[.net, c#]     |
|unix socket implementation for java                                            |[java]         |
|custom properties i

+--------------------------------------------------------------------------------------------------------+--------------------+
|input                                                                                                   |category            |
+--------------------------------------------------------------------------------------------------------+--------------------+
|c the definitive truth about rand random and arc4random                                                 |[c, c#]             |
|python beyond the basics                                                                                |[python]            |
|can sql server express be used to effectively administrate a sql server standard/enterprise installation|[sql-server, sql]   |
|while clause in t sql that loops forever                                                                |[sql, sql-server]   |
|post from one controller action to another not redirect                                                

+-----------------------------------------------------------------------------------+-----------+
|input                                                                              |category   |
+-----------------------------------------------------------------------------------+-----------+
|making an image greyscale with gd library                                          |null       |
|python beyond the basics                                                           |[python]   |
|how do you deal with connection strings when deploying an asp net site             |[asp.net]  |
|are incrementers / decrementers var++ var etc thread safe                          |[c++]      |
|best way to use a db table as a message/job queue                                  |[java]     |
|eclipse hide paths in the `` open resource '' dialog                               |null       |
|how do i extract the version and path from an svn working copy into a nant variable|null       |
|percentages of subt

+---------------------------------------------------------------------------------------------------------------------------+--------------------+
|input                                                                                                                      |category            |
+---------------------------------------------------------------------------------------------------------------------------+--------------------+
|the necessity of hiding the salt for a hash                                                                                |[c++]               |
|c # lambda expressions or delegates as a properties or arguments                                                           |[c#]                |
|best way to use a db table as a message/job queue                                                                          |[java]              |
|is there a way to asynchronously filter an ilist                                                                     