In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import Row
import fasttext

In [2]:
# getOrCreate() hands back the already-running session when there is one,
# so re-running this cell does not spawn a second session.
spark = (
    SparkSession.builder
    .appName('mysession')
    .getOrCreate()
)

## Build Spark DataFrame

In [3]:
# Read the input sentences and spread them over 20 partitions; each partition
# is later handed to fn_partition as one unit of work.
df_input = (
    spark.read
    .parquet('data/input.parquet')
    .repartition(20)
)

# Approach 3: RDD's mapPartitions

In [4]:
def fn_partition(iterator,multi_prediction=False):
    '''
    Classify every record of one partition with the fastText model.

    Meant to be passed to ``rdd.mapPartitions``: the model file is loaded once
    per call (i.e. once per partition) instead of once per record.

    Parameters
    ----------
    iterator : iterable of records exposing ``record['input']`` (the sentence).
    multi_prediction : bool, default False
        False -> ask for the single best label (k=1);
        True  -> ask for the top 3 labels (k=3).

    Yields
    ------
    Row(category, input)
        ``category`` is a label string when ``multi_prediction`` is False and a
        list of label strings when it is True. It is None when the input is
        null or when no label reaches the probability threshold.
    '''
    # Loaded before the helper is defined so the closure below reads top-down.
    model = fasttext.load_model('models/ft_tuned.ftz')

    def get_predictions(sentence,threshold=0.10,k=3):
        '''
        Return the predicted label(s) for a sentence with the "__label__"
        prefix stripped.

        The model is asked for its top-k labels; when `threshold` is truthy
        only labels whose probability is >= threshold are kept. Returns None
        when nothing is left (or the sentence is null), a single string when
        k == 1, otherwise a list of strings.
        '''
        # A null sentence cannot be lower-cased; report "no prediction"
        # instead of failing the whole partition.
        if sentence is None:
            return None
        # fastText's predict() rejects text containing a newline (ValueError),
        # so fold embedded line breaks into spaces first.
        text = sentence.lower().replace('\n', ' ')
        labels, probs = model.predict(text,k=k)
        if threshold:
            labels = [label for label, prob in zip(labels, probs) if prob >= threshold]
        output = [label.replace("__label__","") for label in labels]
        if not output:
            return None
        return output[0] if k==1 else output

    # The two modes differ only in how many labels are requested.
    k = 3 if multi_prediction else 1
    for record in iterator:
        sentence = record['input']
        yield Row(category=get_predictions(sentence,k=k),input=sentence)

## Single prediction

In [5]:
# Single-label mode: every partition is scored by fn_partition with k=1 and the
# resulting Rows are turned back into a DataFrame (schema inferred by toDF()).
df_output = (
    df_input.rdd
    .mapPartitions(lambda rows: fn_partition(rows, multi_prediction=False))
    .toDF()
)
# Fixed seed so the displayed 10% sample is the same on every run.
df_output.sample(withReplacement=False, fraction=.10, seed=12345).show(n=10, truncate=False)

+----------+----------------------------------------------------------------------------------------------+
|category  |input                                                                                         |
+----------+----------------------------------------------------------------------------------------------+
|sql-server|how do i set a sql server script s timeout from within the script                             |
|.net      |embedding intellisense xml documentation in assembly                                          |
|asp.net   |css `` see through '' background crazy navigation menu problem                                |
|c#        |add custom textboxcell to a datagridview control that contains a button to open the filedialog|
|c#        |how do you bind in xaml to a dynamic xpath                                                    |
|java      |using noweb on a large java project                                                           |
|null      |how do i slice a

## Multiple prediction

In [6]:
# Multi-label mode: same pipeline, but fn_partition asks for the top 3 labels,
# so `category` becomes a list column.
df_output = (
    df_input.rdd
    .mapPartitions(lambda rows: fn_partition(rows, multi_prediction=True))
    .toDF()
)
# Fixed seed so the displayed 10% sample is the same on every run.
df_output.sample(withReplacement=False, fraction=.10, seed=12345).show(n=10, truncate=False)

+----------------------+----------------------------------------------------------------------------------------------+
|category              |input                                                                                         |
+----------------------+----------------------------------------------------------------------------------------------+
|[sql-server, sql]     |how do i set a sql server script s timeout from within the script                             |
|[.net, c#]            |embedding intellisense xml documentation in assembly                                          |
|[asp.net]             |css `` see through '' background crazy navigation menu problem                                |
|[c#, .net]            |add custom textboxcell to a datagridview control that contains a button to open the filedialog|
|[c#]                  |how do you bind in xaml to a dynamic xpath                                                    |
|[java]                |using noweb on a

# Performance

In [7]:
%timeit -n 10 df_output.sample(False,.10).show(10,False)

+-------------------------+--------------------------------------------------------------------+
|category                 |input                                                               |
+-------------------------+--------------------------------------------------------------------+
|[c#]                     |c # 4 0 feedback                                                    |
|[asp.net, c#, .net]      |how do i update a gridview from a page method from code behind      |
|null                     |game programming and event handlers                                 |
|null                     |regex to match against something that is not a specific substring   |
|[css, html, asp.net]     |css and lack of constants/variables issue                           |
|null                     |std wcout to console window in xcode                                |
|[.net, c#, visual-studio]|c # net 3 0/3 5 features in 2 0 using visual studio 2008            |
|null                     |wha

+-----------------------------+----------------------------------------------------------------------------------------------+
|category                     |input                                                                                         |
+-----------------------------+----------------------------------------------------------------------------------------------+
|[c++]                        |using boost shared_ptr in a library s public interface                                        |
|[java]                       |how do i skip items when tabbing without using tabindex                                       |
|[c++]                        |why does the bitconverter return bytes and how can i get the bits then                        |
|[c++]                        |can not execute program if using boost c++ libraries in debug version on winxp                |
|[sql-server-2005, sql, mysql]|database schema design                                                          

+---------------------+----------------------------------------------------------------------------+
|category             |input                                                                       |
+---------------------+----------------------------------------------------------------------------+
|[.net, c#, asp.net]  |any tool that can upgrade net 1 1 winform code to net 2 0 `` style ''       |
|[jquery, javascript] |vertical text with jquery                                                   |
|[asp.net]            |css `` see through '' background crazy navigation menu problem              |
|[javascript, php]    |passing a value from php to javascript                                      |
|[visual-studio, .net]|how to get t4 files to build in visual studio                               |
|[silverlight]        |silverlight databinding error works in wpf though                           |
|[c#]                 |in c # or any language what is/are your favourite way of removing re

+----------------------+----------------------------------------------------------------------------------------------------------------+
|category              |input                                                                                                           |
+----------------------+----------------------------------------------------------------------------------------------------------------+
|[c#]                  |use type of object in hql where clause                                                                          |
|[c#, .net]            |is there a fast way to transfer all the variables of one identical object into another in c #                   |
|null                  |async method call                                                                                               |
|[c#, .net]            |how can i programmatically determine if my workstation is locked                                                |
|[c#]                  |how to imp

+--------------------+--------------------------------------------------------------------------------+
|category            |input                                                                           |
+--------------------+--------------------------------------------------------------------------------+
|[asp.net]           |asp net ajax textbox readonly state                                             |
|[asp.net]           |virtual directory in asp net project                                            |
|[c#, c]             |is there an alternative for sleep in c                                          |
|[java]              |when using ant how can i define a task only if i have some specific java version|
|[jquery, javascript]|looping over elements in jquery                                                 |
|[sql-server, sql]   |custom aggregate functions in ms sql server                                     |
|[silverlight]       |silverlight databinding error works in wpf

+-------------------+-------------------------------------------------------------------------------+
|category           |input                                                                          |
+-------------------+-------------------------------------------------------------------------------+
|[.net, c#, asp.net]|any tool that can upgrade net 1 1 winform code to net 2 0 `` style ''          |
|[asp.net]          |asp net ajax textbox readonly state                                            |
|[sql-server, sql]  |how do i set a sql server script s timeout from within the script              |
|[c#, asp.net]      |sensitive data in viewstate                                                    |
|[asp.net]          |virtual directory in asp net project                                           |
|[javascript]       |what is the best way to determine the number of days in a month with javascript|
|[php]              |a delphi/freepascal lib or function that emulates the php s f

+-----------------------------+--------------------------------------------------------------------------+
|category                     |input                                                                     |
+-----------------------------+--------------------------------------------------------------------------+
|[ruby, ruby-on-rails]        |what are the limits of ruby on rails                                      |
|null                         |what is the overhead cost associated with ioc containers like structuremap|
|[c#, java]                   |equivalent of java s concurrenthashmap in c #                             |
|[java]                       |saving java object graphs as xml file                                     |
|[sql-server-2005, sql, mysql]|database schema design                                                    |
|[c#, c]                      |is there an alternative for sleep in c                                    |
|null                         |std wc

+----------------------------------+-----------------------------------------------------------------------------+
|category                          |input                                                                        |
+----------------------------------+-----------------------------------------------------------------------------+
|null                              |what is the best way to implement a singleton pattern class in actionscript 3|
|[php]                             |how can i send an array to php through ajax                                  |
|[asp.net]                         |why does asp net webforms need the runat `` server '' attribute              |
|[.net, asp.net]                   |vb net importing classes                                                     |
|null                              |insert dates in the return from a query where there is none                  |
|[c#]                              |in c # or any language what is/are your favo

+------------------------+-------------------------------------------------------------------------------+
|category                |input                                                                          |
+------------------------+-------------------------------------------------------------------------------+
|null                    |what is the overhead cost associated with ioc containers like structuremap     |
|null                    |mssqlserver 2008 in virtual pc                                                 |
|[.net]                  |how does one add a custom build step to an automake based project in kdevelop  |
|[c#, .net]              |how can i programmatically determine if my workstation is locked               |
|[c#, .net]              |whats the best way to start using mylyn                                        |
|[javascript]            |what is the best way to determine the number of days in a month with javascript|
|[asp.net, c#, .net]     |how do i up

+--------------------+----------------------------------------------------------------------------------------------+
|category            |input                                                                                         |
+--------------------+----------------------------------------------------------------------------------------------+
|[c#, c++]           |how do i find the type of the object instance of the caller of the current function           |
|[c#, .net]          |whats the best way to start using mylyn                                                       |
|[javascript]        |what is the best way to determine the number of days in a month with javascript               |
|[java]              |how do i skip items when tabbing without using tabindex                                       |
|[css, html, asp.net]|css and lack of constants/variables issue                                                     |
|[sql-server, sql]   |custom aggregate functions in ms s

+-----------------------------+--------------------------------------------------------------------------------+
|category                     |input                                                                           |
+-----------------------------+--------------------------------------------------------------------------------+
|null                         |what is the best way to implement a singleton pattern class in actionscript 3   |
|[.net]                       |how does one add a custom build step to an automake based project in kdevelop   |
|[c++]                        |can not execute program if using boost c++ libraries in debug version on winxp  |
|[sql-server-2005, sql, mysql]|database schema design                                                          |
|[c#, c]                      |is there an alternative for sleep in c                                          |
|[java]                       |when using ant how can i define a task only if i have some specif

+----------------------+----------------------------------------------------------------------------------------+
|category              |input                                                                                   |
+----------------------+----------------------------------------------------------------------------------------+
|[c++]                 |how to check if file is ascii or binary in c++                                          |
|null                  |clean document roles in a doc library                                                   |
|null                  |perform token replacements using vs post build event command                            |
|[python]              |need help variable creation in python continuation                                      |
|[asp.net, asp.net-mvc]|classic asp intranet and new asp net applications                                       |
|null                  |release configuration management                                

In [8]:
# Number of partitions behind the scored frame (df_input was repartitioned to 20).
df_output.rdd.getNumPartitions()

20