In [1]:
# import libs

# import necessary libs
import numpy  as np
import pandas as pd

from __future__ import division

# general spark modules
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.functions import lit


# spark ml modules 
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import StandardScaler
from pyspark.ml.regression import LinearRegression, DecisionTreeRegressor, RandomForestRegressor

from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorIndexer

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

# classification 
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

from pyspark.ml.evaluation import BinaryClassificationEvaluator


import time
import itertools

## Load Data

In [19]:
# load data as dataframe
train_df = spark.read.csv('/projects/hue_kdd/derived/shared/talking_data/train.csv', header=True)
test_df  = spark.read.csv('/projects/hue_kdd/derived/shared/talking_data/test.csv',  header=True)

# Drop not used columns 
# train_df = train_df.drop('attributed_time')

In [20]:
# train and validation split
train_df = train_df.withColumn('day',   substring(train_df.click_time, 9, 2))\
                    .withColumn('hour', substring(train_df.click_time, 12, 2))\
                    .withColumn('min',  substring(train_df.click_time, 15, 2))\
                    .withColumn('sec',  substring(train_df.click_time, 18, 2))\
            
            
test_df  = test_df.withColumn('hour', substring(test_df.click_time, 12, 2))\
                  .withColumn('min',  substring(test_df.click_time, 15, 2))\
                  .withColumn('sec',  substring(test_df.click_time, 18, 2))\
        
# split data to train and validation properly
# first three days for training, 1 day for validation for same hours as test
tr_df  = train_df.where((train_df.day != '09'))
val_df = train_df.where((train_df.day == '09') & (train_df.hour.isin(['15', '11', '09', '05', 
                                                                      '06', '10', '04', '13', '14'])))

In [4]:
# print number of rows in each dataset
print tr_df.count()
print val_df.count()
print test_df.count()

131886953
30840433
18790469


In [21]:
main_df = tr_df

## Feature engineering 

In [None]:
## session time
## downsampling
## add MLP
## add GBT

In [22]:
def get_features_df(var_1, var_2, main_df, train_df, val_df, test_df):
    
    start_time = time.time()
    prefix = var_1 + '_' + var_2
    
    # first group by the two variables and pivot on clicked vs not clicked for their count
    feature_count_df = main_df.groupby([var_1, var_2])\
                               .pivot('is_attributed').count()\
                               .alias(prefix+'_count').fillna(0)\
    
    # rename the columns of clicked vs not clicked
    feature_count_df = feature_count_df.withColumnRenamed('1', prefix + '_attributed')\
                                       .withColumnRenamed('0', prefix + '_not_attributed')
    
    # calculate ratio of click vs not clicked
    feature_count_df = feature_count_df.withColumn(prefix+'_click_ratio', col(prefix+'_attributed') \
                                        / (col(prefix+'_attributed')+col(prefix+'_not_attributed')))
    
    # create a join column
    feature_count_df = feature_count_df.withColumn('separator', lit('_')) \
                                       .withColumn(prefix+'_join_id', 
                                                   concat(var_1, 'separator', var_2))
    feature_count_df = feature_count_df.drop('separator')
    
    # drop columns
    feature_count_df = feature_count_df.select(prefix+'_join_id', 
                                               prefix+'_attributed', 
                                               prefix+'_not_attributed',
                                               prefix+'_click_ratio')
    
    # join with main df
    train_df_featured = train_df.withColumn('separator', lit('_'))\
                                .withColumn(prefix+'_join_id', concat(var_1, 'separator', var_2))\
                                .join(feature_count_df, on = prefix+'_join_id', how='leftouter')
            
    val_df_featured   = val_df.withColumn('separator', lit('_'))\
                                .withColumn(prefix+'_join_id', concat(var_1, 'separator', var_2))\
                                .join(feature_count_df, on = prefix+'_join_id', how='leftouter')

    test_df_featured  = test_df.withColumn('separator', lit('_'))\
                               .withColumn(prefix+'_join_id', concat(var_1, 'separator', var_2))\
                               .join(feature_count_df, on = prefix+'_join_id', how='leftouter')
            
    # drop join column
    train_df_featured = train_df_featured.drop(prefix+'_join_id')
    train_df_featured = train_df_featured.drop('separator')
    
    val_df_featured   = val_df_featured.drop(prefix+'_join_id')
    val_df_featured   = val_df_featured.drop('separator')
    
    test_df_featured  = test_df_featured.drop(prefix+'_join_id')
    test_df_featured  = test_df_featured.drop('separator')
    
    # fill missing values with 0
    train_df_featured   = train_df_featured.fillna(0)
    val_df_featured     = val_df_featured.fillna(0)
    test_df_featured    = test_df_featured.fillna(0)
    
    print "--- %s seconds ---" % (time.time() - start_time)
    print 
    
    return train_df_featured, val_df_featured, test_df_featured

In [7]:
def get_unique_features_per_ip(main_df, train_df, val_df, test_df):
    
    start_time = time.time()
    features_df = main_df.groupBy(col('ip')).agg(collect_set('os').alias('os_unique'),
                                collect_set('app').alias('app_unique'),
                                collect_set('channel').alias('channel_unique'),
                                collect_set('device').alias('device_unique'),
                                collect_list('click_time').alias('clicks'),
                                collect_list('attributed_time').alias('download_clicks'))
    
    features_df = features_df.withColumn('os_unique_count',       size('os_unique'))\
                             .withColumn('app_unique_count',      size('app_unique'))\
                             .withColumn('channel_unique_count',  size('channel_unique'))\
                             .withColumn('device_unique_count',   size('device_unique'))\
                             .withColumn('clicks_count',          size('clicks'))\
                             .withColumn('download_clicks',       size('download_clicks'))\
                             .withColumn('download_rate',        col('download_clicks')/col('clicks_count'))
     
    features_df = features_df.select('ip', 'os_unique_count', 'app_unique_count', 'channel_unique_count',
                                     'device_unique_count', 'clicks_count', 'download_clicks', 'download_rate') 
                            
    # join with main df
    train_df_featured = train_df.join(features_df, on = 'ip', how='leftouter')
    val_df_featured   = val_df.join(features_df,   on = 'ip', how='leftouter')
    test_df_featured  = test_df.join(features_df,  on = 'ip', how='leftouter')
    
    # fill missing values with 0
    train_df_featured   = train_df_featured.fillna(0)
    val_df_featured     = val_df_featured.fillna(0)
    test_df_featured    = test_df_featured.fillna(0)
    
    print "--- %s seconds ---" % (time.time() - start_time)
    print 
    
    return train_df_featured, val_df_featured, test_df_featured


In [23]:
# get unique features for ip
tr_df, val_df, test_df = get_unique_features_per_ip(main_df, tr_df, val_df, test_df)

--- 0.189445018768 seconds ---



In [24]:
# learning with counts

# chanell
var_1      = 'channel'
var_2_list = ['app', 'device', 'os', 'ip', 'hour']

for var_2 in var_2_list:
    tr_df, val_df, test_df = get_features_df(var_1, var_2, main_df, tr_df, val_df, test_df)
    
# ip
var_1      = 'ip'
var_2_list = ['app', 'device', 'os', 'hour']

for var_2 in var_2_list:
    tr_df, val_df, test_df = get_features_df(var_1, var_2, main_df, tr_df, val_df, test_df)
    
# app
var_1      = 'app'
var_2_list = ['device', 'os', 'hour']

for var_2 in var_2_list:
    tr_df, val_df, test_df = get_features_df(var_1, var_2, main_df, tr_df, val_df, test_df)

--- 8.97741413116 seconds ---

--- 11.644203186 seconds ---

--- 11.7373659611 seconds ---

--- 6.65888690948 seconds ---

--- 10.8854689598 seconds ---

--- 10.0641140938 seconds ---

--- 7.20223808289 seconds ---

--- 7.70186305046 seconds ---

--- 7.24928689003 seconds ---

--- 10.2056500912 seconds ---

--- 7.22589707375 seconds ---

--- 10.9927330017 seconds ---



In [25]:
tr_df    = tr_df.drop(  'ip',    'click_time', 'attributed_time', 'day', 'min', 'sec')
val_df   = val_df.drop( 'ip',   'click_time', 'attributed_time', 'day', 'min', 'sec')
test_df  = test_df.drop('ip',  'click_time', 'attributed_time', 'day', 'min', 'sec')

In [26]:
# prepare data for models

# Write a custom function to convert the data type of DataFrame columns# Write 
def convertColumn(df, names, newType):
    for name in names: 
        df = df.withColumn(name, df[name].cast(newType))
    return df 

In [27]:
# cast numerical columns to float
numerical_cols = tr_df.columns[6:]
label_col = ['is_attributed']
tr_df   = convertColumn(tr_df,   numerical_cols + label_col, FloatType())
val_df  = convertColumn(val_df,  numerical_cols + label_col, FloatType())
test_df = convertColumn(test_df, numerical_cols, FloatType())

In [28]:
# fill missing values with 0
tr_df   = tr_df.fillna(0)
val_df  = val_df.fillna(0)
test_df = test_df.fillna(0)

In [29]:
stages = []

numerical_cols   = tr_df.columns[6:]
categorical_cols = ['hour'] 
#                     'app', 'device', 'os', 'channel']

for categorical_col in categorical_cols:
    string_indexer = StringIndexer(inputCol=categorical_col, outputCol=categorical_col + "_index")
    stages += [string_indexer]

assembler_inputs = numerical_cols + [c + "_index" for c in categorical_cols]
assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

stages  += [assembler]

## Model Selection

In [30]:
# Create a Pipeline.
pipeline = Pipeline(stages=stages)
# Run the feature transformations.
#  - fit() computes feature statistics as needed.
#  - transform() actually transforms the features.
pipelineModel = pipeline.fit(tr_df)
train         = pipelineModel.transform(tr_df)
val           = pipelineModel.transform(val_df)
test          = pipelineModel.transform(test_df)

In [33]:
train = train.repartition(100)
train = train.persist()

### Logistic regression 

In [36]:
# define parameters
regParam        = [0.1, 0.5, 2.0]
elasticNetParam = [0.0,  0.5, 1.0]
maxIter         = [10, 50, 100]
experiments     = list(itertools.product(regParam, elasticNetParam, maxIter))
print len(experiments)

27


In [50]:
for ind, experiment in enumerate(experiments):
    regParam        = experiment[0]
    elasticNetParam = experiment[1]
    maxIter         = experiment[2]
    

    start_time = time.time()
    print ind
    print 'params: ', regParam, elasticNetParam, maxIter
    
    lr = LogisticRegression(labelCol="is_attributed", 
                            featuresCol="features", 
                            regParam=regParam,
                            elasticNetParam=elasticNetParam,
                            maxIter=maxIter
                            )
    
    # Train model with Training Data
    lrModel     = lr.fit(train)
    
    # Make predictions on validation data using the transform() method.
    # LogisticRegression.transform() will only use the 'features' column.
    predictions = lrModel.transform(val)
    
    # evaluate predictions
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='is_attributed')
    auc       = evaluator.evaluate(predictions)
    print 'AUC: ', auc
    print "--- %s seconds ---" % (time.time() - start_time)
    print 
    

In [None]:
0
params:  0.1 0.0 10
AUC:  0.954670766323
--- 441.857987881 seconds ---

1
params:  0.1 0.0 50
AUC:  0.955339694913
--- 299.4097929 seconds ---

2
params:  0.1 0.0 100
AUC:  0.955339694913
--- 297.301779032 seconds ---

3
params:  0.1 0.5 10
AUC:  0.5
--- 245.735084057 seconds ---

4
params:  0.1 0.5 50
AUC:  0.5
--- 233.575064182 seconds ---

5
params:  0.1 0.5 100
AUC:  0.5
--- 241.358119011 seconds ---

6
params:  0.1 1.0 10
AUC:  0.5
--- 223.717324018 seconds ---

7
params:  0.1 1.0 50
AUC:  0.5
--- 224.282985926 seconds ---

In [54]:
lr = LogisticRegression(labelCol="is_attributed", 
                        featuresCol="features", 
                        regParam=0.1,
                        elasticNetParam=0,
                        maxIter=100
                        )

In [56]:
# Train model with Training Data
lrModel     = lr.fit(train)

In [57]:
predictions = lrModel.transform(val)

In [58]:
# evaluate predictions
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='is_attributed')
auc       = evaluator.evaluate(predictions)
print 'AUC: ', auc
print "--- %s seconds ---" % (time.time() - start_time)
print 

AUC:  0.955339694913
--- 791.741017103 seconds ---



In [59]:
predictions = lrModel.transform(test)

In [61]:
predictions.select('click_id', 'rawPrediction', 'probability', 'prediction').show(4)

+--------+--------------------+--------------------+----------+
|click_id|       rawPrediction|         probability|prediction|
+--------+--------------------+--------------------+----------+
|13184596|[6.56511485417077...|[0.99859331938684...|       0.0|
|15512127|[6.60834346139872...|[0.99865275244681...|       0.0|
|13958723|[6.44647258453791...|[0.99841640458045...|       0.0|
|14430045|[6.54686916884697...|[0.99856745506389...|       0.0|
+--------+--------------------+--------------------+----------+
only showing top 4 rows



In [66]:
predictions.select('click_id', 'prediction').withColumn('is_attributed', col('prediction')).coalesce(1).write.csv('mycsv2.csv')

## Decision Tree Classifier

In [46]:
# define parameters
maxDepth = [15, 30]
maxBins  = [60, 80]
experiments     = list(itertools.product(maxDepth, maxBins))
print len(experiments)

4


In [47]:
for ind, experiment in enumerate(experiments):
    maxDepth = experiment[0]
    maxBins  = experiment[1]

    start_time = time.time()
    print ind
    print 'params: ', maxDepth, maxBins
    
    # Create initial Decision Tree Model
    dt = DecisionTreeClassifier(labelCol="is_attributed", featuresCol="features", maxDepth=maxDepth, maxBins=maxBins)
    
    # Train model with Training Data
    dtModel = dt.fit(train)
    
    # Make predictions on validation data using the transform() method.
    # LogisticRegression.transform() will only use the 'features' column.
    predictions = dtModel.transform(val)
    
    # evaluate predictions
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='is_attributed')
    auc       = evaluator.evaluate(predictions)
    print 'AUC: ', auc
    print "--- %s seconds ---" % (time.time() - start_time)
    print 

In [None]:
0
params:  15 60
AUC:  0.513338548522
--- 269.140621185 seconds ---

1
params:  15 80
AUC:  0.515574417785
--- 227.470299959 seconds ---

2
params:  30 60
AUC:  0.515309944869
--- 397.263484001 seconds ---

### Random Forest 

In [51]:
# define parameters
numTrees         = [5, 10, 50, 100]
subsamplingRate  = [0.8]
maxDepth         = [10, 15]
experiments      = list(itertools.product(numTrees, maxDepth, subsamplingRate))
print len(experiments)

8


In [53]:
for ind, experiment in enumerate(experiments):
    numTrees = experiment[0]
    maxDepth = experiment[1]
    subsamplingRate = experiment[2]

    start_time = time.time()
    print ind
    print 'params: ', numTrees, maxDepth, subsamplingRate
    
    # Create an initial RandomForest model.
    rf = RandomForestClassifier(labelCol="is_attributed", featuresCol="features", 
                                numTrees=numTrees, 
                                maxDepth=maxDepth,
                                subsamplingRate=subsamplingRate)
    
    # Train model with Training Data
    rfModel = rf.fit(train)
    
    # Make predictions on validation data using the transform() method.
    # LogisticRegression.transform() will only use the 'features' column.
    predictions = rfModel.transform(val)
    
    # evaluate predictions
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol='is_attributed')
    auc       = evaluator.evaluate(predictions)
    print 'AUC: ', auc
    print "--- %s seconds ---" % (time.time() - start_time)
    print 

In [None]:
0
params:  5 10 0.8
AUC:  0.524119692289
--- 278.628679991 seconds ---

1
params:  5 15 0.8
AUC:  0.513582253066
--- 434.034000158 seconds ---

2
params:  10 10 0.8
AUC:  0.509167695992
--- 450.640409946 seconds ---