In [10]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, SparkSession, Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql import functions as func
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import pyspark.ml.pipeline

In [3]:
### Initialize streaming context
conf = SparkConf()\
                .setMaster("local[2]")\
                .setAppName("MobileAnalyticsNaiveBayes")\
                .set("spark.executor.memory", "2g")\
                .set("spark.driver.memory", "2g")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
sqlContext = SQLContext(sc)
spark = SparkSession.builder.appName("spark play").getOrCreate()

ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=MobileAnalytics, master=local[2]) created by __init__ at <ipython-input-2-3798fda7b925>:3 

In [11]:
# Vector Assembler
- 

In [None]:
assembler = VectorAssembler(
                inputCols=[
                            "events_per_device_count",
                            "min_hour",
                            "max_hour",
                            "mon_count",
                            "tue_count",
                            "wed_count",
                            "thu_count",
                            "fri_count",
                            "sat_count",
                            "sun_count",
                            "weekend_count",
                            "weekday_count",
                            "am_count",
                            "pm_count",
                            "h0_count",
                            "h1_count",
                            "h2_count",
                            "h3_count",
                            "h4_count",
                            "h5_count",
                            "h6_count",
                            "h7_count",
                            "h8_count",
                            "h9_count",
                            "h10_count",
                            "h11_count",
                            "h12_count",
                            "h13_count",
                            "h14_count",
                            "h15_count",
                            "h16_count",
                            "h17_count",
                            "h18_count",
                            "h19_count",
                            "h20_count",
                            "h21_count",
                            "h22_count",
                            "h23_count",
                            "apps_per_device_count",
                            "apps_per_event_avg",
                            "apps_active_per_device_count",
                            "appsActivePerEvent_avg",
                            "brand_model_key"
                ],
                outputCol="features"
            )

In [None]:
# naive-bayes
- https://spark.apache.org/docs/2.2.0/ml-classification-regression.html#naive-bayes

In [None]:
nb = NaiveBayes(smoothing=1.0, modelType="multinomial")
        .setFeaturesCol("features")
        .setLabelCol("label")
        .setPredictionCol("prediction")
        .setProbabilityCol("probability")
        .setRawPredictionCol("confidence")

In [None]:
# Pipeline
- https://spark.apache.org/docs/2.2.0/ml-pipeline.html#pipeline

In [None]:
pipeline = Pipeline(stages=[assembler, nb])

In [None]:
# We use a ParamGridBuilder to construct a grid of parameters to search over.
# With 3 values for hashingTF.numFeatures and 2 values for lr.regParam,
# this grid will have 3 x 2 = 6 parameter settings for CrossValidator to choose from.
    
params = ParamGridBuilder()
            .addGrid(nb.modelType, ["multinomial"])
            .addGrid(nb.smoothing, [smoothing])
            .build()    

In [None]:
# Iterate and choose the best fit model  
cross_validator = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid, evaluator=MulticlassClassificationEvaluator)
                     .setNumFolds(5)               
                
cross_validator_model =  cross_validator.fit(train)

In [1]:
# Get and Write Output

In [None]:
output = assembler.transform(test)\
                  .select("device_id","probability")