In [1]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

In [2]:
def preprocess(df, stringCols, catCols, featureCols, labelCol):
    """Turn a raw DataFrame into a two-column ("features", "label") DataFrame.

    Steps, in order:

    1. Every column in ``stringCols`` is indexed with ``StringIndexer``; the
       indexed values replace the original column under the same name.
    2. Every column in ``catCols`` is one-hot encoded with ``dropLast=False``
       (the last category is kept in the encoded vector); the encoded vector
       replaces the original column under the same name.
    3. The columns in ``featureCols`` are merged into one vector column named
       "features" with ``VectorAssembler``.

    Args:
        df: Input DataFrame containing every column named in the other
            arguments. It is not modified; a new DataFrame is returned.
        stringCols: Names of the columns to convert to numeric indices.
        catCols: Names of the columns to one-hot encode.
            NOTE(review): presumably these already hold non-negative
            category indices -- confirm against the data.
        featureCols: Names of the columns (after the replacements above)
            to assemble into the "features" vector.
        labelCol: Name of the label column; it is renamed to "label".

    Returns:
        A DataFrame with exactly two columns: "features" and "label".
    """
    newdf = df

    # Converting strings to numeric values.
    for c in stringCols:
        indexed = c + "-num"
        # Fit a StringIndexerModel for this column, write the indexed values
        # to a temporary "<col>-num" column, then drop the original column
        # and give the indexed one the original name.
        model = StringIndexer(inputCol=c, outputCol=indexed).fit(newdf)
        newdf = model.transform(newdf).drop(c).withColumnRenamed(indexed, c)

    # One-hot encoding categorical data.
    for c in catCols:
        encoded = c + "-onehot"
        # dropLast: whether to drop the last category in the encoded vector
        # (default: true). It is disabled here.
        encoder = OneHotEncoder(inputCol=c, outputCol=encoded, dropLast=False)
        # In Spark 2.x OneHotEncoder is a plain Transformer; from Spark 3.0 on
        # it is an Estimator that must be fitted before it can transform.
        # Fit only when the object offers fit(), so both versions work.
        if hasattr(encoder, "fit"):
            encoder = encoder.fit(newdf)
        # Same replace-in-place dance as above, with a "<col>-onehot" temp column.
        newdf = encoder.transform(newdf).drop(c).withColumnRenamed(encoded, c)

    # Merging the feature columns into a single vector with VectorAssembler,
    # then keeping only the assembled features and the (renamed) label.
    va = VectorAssembler(outputCol="features", inputCols=featureCols)
    newdf = va.transform(newdf).select("features", labelCol).withColumnRenamed(labelCol, "label")

    return newdf

In [3]:
# Columns that are converted to numeric indices by preprocess().
stringCols = [
    'app_category', 'app_domain', 'app_id',
    'device_id', 'device_ip', 'device_model',
    'site_category', 'site_domain', 'site_id',
]

# Columns that are one-hot encoded by preprocess().
catCols = [
    'C1', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21',
    'banner_pos', 'device_connect_type', 'device_type',
]

# Feature vector inputs: every indexed column, every one-hot column, then 'hour'.
featureCols = stringCols + catCols + ['hour']

# Column holding the target value.
labelCol = 'click'

In [None]:
# Load the source data from MongoDB through the Spark data source API.
data = (
    spark.read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("uri", "mongodb://34.210.118.215/gameofspark.train")
    .load()
)

In [None]:
# Run the preprocessing step to obtain the "features"/"label" DataFrame.
trainDF = preprocess(
    df=data,
    stringCols=stringCols,
    catCols=catCols,
    featureCols=featureCols,
    labelCol=labelCol,
)

In [None]:
# Divide the dataset into training (weight 0.8) and testing (weight 0.2) sets.
# A fixed seed is passed so the split is not re-drawn differently on every run.
# NOTE(review): repeatability also assumes the input DataFrame is read and
# partitioned the same way each time -- confirm for the MongoDB source.
SPLIT_SEED = 42
splits = trainDF.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# cache(): the algorithm is iterative, so the training and testing sets are
# going to be reused many times.
train = splits[0].cache()
test = splits[1].cache()