In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql import Row
import fasttext

In [2]:
# Create (or reuse) a Spark session; the "local[4]" master runs Spark
# locally with 4 worker threads.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName('mysession')
    .getOrCreate()
)

## Build Spark DataFrame

In [3]:
# Load the input parquet file and redistribute it over 8 partitions.
df_input = (
    spark.read
    .parquet('../data/input.parquet')
    .repartition(8)
)

# Approach 3: RDD's mapPartitions

In [4]:
def fn_partition(iterator, multi_prediction=False, model_path='../models/ft_tuned.ftz'):
    """Classify every record of one partition with a fastText model.

    Written for ``rdd.mapPartitions``: the model is loaded once per call
    (i.e. once per partition) and then reused for every record, instead of
    being loaded once per record.

    Args:
        iterator: Iterable of records supporting ``record['input']``; the
            value is the text to classify (or ``None``).
        multi_prediction: When falsy, ask the model for the single best
            label (k=1) and emit it as a string. When truthy, ask for the
            top 3 labels (k=3) and emit a list.
        model_path: Path of the fastText model file, resolved by the process
            that executes this function.

    Yields:
        ``Row(category=..., input=...)`` for each record. ``category`` is
        ``None`` when the input text is ``None`` or when no label reaches
        the probability threshold. ``input`` is passed through unchanged.
    """

    def get_predictions(sentence, threshold=0.10, k=3):
        """Return the labels whose probability is at least ``threshold``.

        Per the original author's note, this mirrors the function in the
        classifier.py module.

        The sentence is lower-cased before prediction and the
        ``__label__`` prefix is stripped from every label. Returns ``None``
        when nothing passes the threshold, the first surviving label when
        ``k == 1``, and the list of surviving labels otherwise.
        """
        labels, probs = model.predict(sentence.lower(), k=k)
        output = [
            label.replace("__label__", "")
            for label, prob in zip(labels, probs)
            if prob >= threshold
        ]
        if not output:
            return None
        return output[0] if k == 1 else output

    model = fasttext.load_model(model_path)

    # The two modes differ only in how many labels are requested.
    k = 3 if multi_prediction else 1

    for record in iterator:
        text = record['input']
        if text is None:
            # A null input cannot be lower-cased; emit a null category
            # instead of failing the whole partition.
            category = None
        else:
            # fastText's predict() raises ValueError on text containing
            # '\n', so newlines are replaced for prediction only.
            category = get_predictions(text.replace('\n', ' '), k=k)
        yield Row(category=category, input=text)

## Single prediction

In [5]:
# Single-label mode: run fn_partition over each partition of the input RDD
# and convert the yielded Rows back into a DataFrame.
df_output = (
    df_input.rdd
    .mapPartitions(lambda records: fn_partition(records, multi_prediction=False))
    .toDF()
)
# Show up to 10 rows of a seeded ~10% sample, without truncating columns.
df_output.sample(withReplacement=False, fraction=.10, seed=12345).show(10, truncate=False)

+--------+---------------------------------------------------------------------------------------------+
|category|input                                                                                        |
+--------+---------------------------------------------------------------------------------------------+
|null    |what deployment directories do you use for rails applications deploying to a debian box      |
|sql     |sql query order by                                                                           |
|c++     |c++ reading from a file blocks any further writing why                                       |
|null    |how much does it cost to develop an iphone application                                       |
|null    |what are some excellent examples of user sign up forms on the web                            |
|.net    |why doesn t backcolor work for tabcontrols in net                                            |
|c#      |c # compiler and caching of local variables  

## Multiple prediction

In [6]:
# Multi-label mode: same pipeline, but fn_partition requests the top 3
# labels, so `category` holds a list of labels (or null).
df_output = (
    df_input.rdd
    .mapPartitions(lambda records: fn_partition(records, multi_prediction=True))
    .toDF()
)
# Show up to 10 rows of a seeded ~10% sample, without truncating columns.
df_output.sample(withReplacement=False, fraction=.10, seed=12345).show(10, truncate=False)

+-------------------+---------------------------------------------------------------------------------------------+
|category           |input                                                                                        |
+-------------------+---------------------------------------------------------------------------------------------+
|null               |what deployment directories do you use for rails applications deploying to a debian box      |
|[sql, sql-server]  |sql query order by                                                                           |
|[c++]              |c++ reading from a file blocks any further writing why                                       |
|null               |how much does it cost to develop an iphone application                                       |
|null               |what are some excellent examples of user sign up forms on the web                            |
|[.net, c#, asp.net]|why doesn t backcolor work for tabcontrols in net  

# Performance

In [7]:
# Time sampling ~10% of df_output and showing 10 rows, 10 loops per run.
# df_output here is whatever the most recently executed cell assigned
# (the multi-prediction frame when the notebook is run top to bottom).
# No seed is passed to sample(), so each loop draws a different sample.
# NOTE(review): df_output is never cached in this notebook, so presumably
# every loop re-runs the mapPartitions stage (including the model load) -
# confirm that this is what the benchmark is meant to measure.
%timeit -n 10 df_output.sample(False,.10).show(10)

+------------------+--------------------+
|          category|               input|
+------------------+--------------------+
|            [.net]|what s the best w...|
|              null|daemon threads ex...|
|              null|compact framework...|
|              null|is there an idiom...|
|              [c#]|how can i convert...|
|[java, javascript]|which javascript ...|
|              [c#]|how to walk the m...|
|             [php]|numbering regex s...|
|   [java, eclipse]|how do you run an...|
|        [c#, .net]|whats the best wa...|
+------------------+--------------------+
only showing top 10 rows

+--------------------+--------------------+
|            category|               input|
+--------------------+--------------------+
|                [c#]|how would you att...|
|                null|cross platform ed...|
|     [visual-studio]|how to script vis...|
|           [asp.net]|prevent long word...|
| [c#, .net, asp.net]|add multiple user...|
| [.net, c#, asp.net]|implementing

+--------------------+--------------------+
|            category|               input|
+--------------------+--------------------+
|                [c#]|how would you att...|
|                null|what deployment d...|
|                null|compact framework...|
|                [c#]|find a private fi...|
|                null|what is the aspne...|
|                null|is there an idiom...|
|                null|what are some exc...|
|                [c#]|how can i convert...|
|                null|windows domain ch...|
|[css, asp.net, html]|abstraction away ...|
+--------------------+--------------------+
only showing top 10 rows

+----------+--------------------+
|  category|               input|
+----------+--------------------+
|     [c++]|are incrementers ...|
|[java, c#]|deterministic dis...|
|      null|daemon threads ex...|
|  [c++, c]|initialize a cons...|
|      null|is there an idiom...|
|     [c++]|c++ reading from ...|
|      [c#]|  using lists in c #|
|    [java]|embedd

+--------------------+--------------------+
|            category|               input|
+--------------------+--------------------+
|               [c++]|are incrementers ...|
|        [c, c#, c++]|wrapping visual c...|
|             [mysql]|how do you connec...|
|                null|how do i open off...|
|                null|running multiple ...|
|                [c#]|deployment of cus...|
|              [java]|why do people use...|
|[ruby, ruby-on-ra...|what are the limi...|
|                null|how do you quickl...|
|               [php]|virtual 360Ã¢Âº s...|
+--------------------+--------------------+
only showing top 10 rows

+--------------------+--------------------+
|            category|               input|
+--------------------+--------------------+
|          [java, c#]|deterministic dis...|
|                null|ddd and asynchron...|
|[mysql, sql, data...|best update metho...|
|               [c++]|the necessity of ...|
|[sql, sql-server,...|sql query count w...|
|     

+--------------------+--------------------+
|            category|               input|
+--------------------+--------------------+
|             [mysql]|how do you connec...|
|                [c#]|find a private fi...|
|                null|running multiple ...|
|           [asp.net]|get performance c...|
|                null|how to effectivel...|
|    [c++, c, python]|why learn perl py...|
|[sql, sql-server,...|sql query count w...|
|     [visual-studio]|visual studio fai...|
|            [python]|how can i execute...|
|                null|any good collecti...|
+--------------------+--------------------+
only showing top 10 rows

+--------------------+--------------------+
|            category|               input|
+--------------------+--------------------+
|              [.net]|what s the best w...|
| [python, c++, java]|extracting text f...|
|   [sql-server, sql]|sql server profil...|
|                null|what are some exc...|
|     [visual-studio]|visual studio fai...|
|[ruby

+---------------+--------------------+
|       category|               input|
+---------------+--------------------+
|   [c, c#, c++]|wrapping visual c...|
|         [java]|trim whitespace f...|
|      [asp.net]|get performance c...|
|[.net, php, c#]|authoritative sou...|
|           null|windows domain ch...|
|           null|what can cause in...|
|          [php]|virtual 360Ã¢Âº s...|
|     [c#, .net]|is it possible to...|
|           null|can i use css in ...|
|          [sql]|normalizing a tab...|
+---------------+--------------------+
only showing top 10 rows

+--------------------+--------------------+
|            category|               input|
+--------------------+--------------------+
|                [c#]|how do you manage...|
|     [visual-studio]|how to disable vi...|
|                null|what deployment d...|
|              [java]|trim whitespace f...|
|                null|what is the aspne...|
|                [c#]|download files to...|
|                null|apache/tom

In [8]:
# Report how many partitions back df_output; the input was repartitioned
# to 8 when it was loaded.
df_output.rdd.getNumPartitions()

8