<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#GBTRegressor" data-toc-modified-id="GBTRegressor-1">GBTRegressor</a></span><ul class="toc-item"><li><span><a href="#Load-Data" data-toc-modified-id="Load-Data-1.1">Load Data</a></span><ul class="toc-item"><li><span><a href="#GBT" data-toc-modified-id="GBT-1.1.1">GBT</a></span></li><li><span><a href="#LOGISTIC" data-toc-modified-id="LOGISTIC-1.1.2">LOGISTIC</a></span></li></ul></li></ul></li></ul></div>

In [10]:
from pyspark import SparkContext, SparkConf

# SparkML
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression, GBTClassifier, MultilayerPerceptronClassifier
from pyspark.mllib.evaluation import MulticlassMetrics


# SparkSQL
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import time
import os

In [11]:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:2.7.4"  --driver-memory "4g" --executor-memory "4g" pyspark-shell'

In [12]:
#from pyspark import SparkConf
# conf = SparkConf().set("spark.driver.memory", "4g")

In [16]:
# Get the ball rolling
sc = SparkContext().getOrCreate()
ss = SparkSession.builder.getOrCreate()

In [17]:
sc._conf.set("spark.driver.memory", "6g")

<pyspark.conf.SparkConf at 0x10a137780>

In [18]:
sc._conf.get("spark.driver.memory")

'6g'

In [19]:
sc._jsc.hadoopConfiguration().set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
sc._jsc.hadoopConfiguration().set('fs.s3a.access.key', 'AKIAW7CYB6L5SISYOIU7')    # Access Key
sc._jsc.hadoopConfiguration().set('fs.s3a.secret.key', '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') # Secret Key

------------------

# GBTRegressor

## Load Data

In [20]:
data_path = "s3a://msds-sparkle/data/lite/lite_619.csv"
df_pills = ss.read.csv(data_path, header=True, inferSchema=True)

In [21]:
# df_pills = ss.read.csv('./lite_629_w10.csv', header=True, inferSchema=True)

In [22]:
df_pills.groupBy("label").count().show(2)

+-----+-----+
|label|count|
+-----+-----+
|    1|  399|
|    0|  220|
+-----+-----+



In [23]:
# n_observations, n_features
df_pills.count(), len(df_pills.columns)

(619, 103)

In [26]:
# Encode pill counts less than or equal to 15 as 1, else 0
# convert_int = udf(lambda x : float(x <= 10), DoubleType())
# df_pills = df_pills.withColumn('label', convert_int('label'))

In [27]:
import pandas as pd

def cross_val(df_pills, k, pipeline, model_name):
    # get the k splits
    set_of_k = set(i for i in range(k))
    data_splits = df_pills.randomSplit(weights=[1/k for _ in set_of_k])
    # instantiate to hold f1s and accuracies
    scores = defaultdict(list)
    # outer loop to train each of the k models
    for i in set_of_k:
        # getting test dataset
        df_test = data_splits[i]
        # splits going into train dataset
        train_splits = list(set_of_k - {i})
        print(train_splits)
        # defining first train set to union rest with
        df_train = data_splits[train_splits[0]]
        # inner loop to concat the rest of the splits into the train set
        for s in train_splits[1:]:
                df_train = df_train.union(data_splits[s])
        ####  build pipeline and define models
        # train model pipeline
        start = time.time()
        pipeline_model = pipeline.fit(df_train)
        print(f"{model_name} Model {i} training time:", time.time() - start)
        #### predictions and metric
        predict = pipeline_model.transform(df_test)
        metrics = MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction")
        metrics.setMetricName("accuracy")
        accuracy = metrics.evaluate(predict)
        print(f"{model_name} Accuracy: {metrics.evaluate(predict):.4f}")
        scores["accuracies"].append(accuracy)
        metrics.setMetricName("f1")
        f1 = metrics.evaluate(predict)
        print(f"{model_name} F1: {metrics.evaluate(predict):.3f}")
        scores["f1s"].append(f1)
        
        #### Feature importances
        fitted_model = pipeline_model.stages[1]
        feat_imps_gbt = fitted_model.featureImportances
        top_tups_gbt = sorted(list(zip(feat_imps_gbt.indices, feat_imps_gbt.values)), key=lambda x: x[1], reverse=True)[:5]
        top_indices = [x for x in top_tups_gbt[:25]]
        top_features = [(df_pills.columns[index[0]], f'{index[1]:.1%}') for index in top_indices]
        top_features
        print(f"{model_name} Feature importances:\n{pd.DataFrame(top_features)}")
    return scores

In [28]:
from collections import defaultdict
from pyspark.ml import Pipeline

# Drop target="label" col from training data
train_cols = df_pills.drop("label").columns[1:-1]

# Transformer; excludes "label" col
va = VectorAssembler(outputCol="features", inputCols=train_cols)

### GBT

In [29]:
from collections import defaultdict
from pyspark.ml import Pipeline

# Estimator
gbt = GBTClassifier(maxIter=200, maxDepth=3, stepSize=0.3)

# Fit the pipeline to training documents.
gbt_pipeline = Pipeline(stages=[va, gbt])
scores = cross_val(df_pills, 5, gbt_pipeline, 'GBT')

[1, 2, 3, 4]
GBT Model 0 training time: 79.24049520492554
GBT Accuracy: 0.7863
GBT F1: 0.786
GBT Feature importances:
                           0     1
0               4_min_gyro_x  4.5%
1  2_avg_audio_average_power  3.3%
2               4_max_gyro_x  3.0%
3               1_max_gyro_x  3.0%
4     4_avg_audio_peak_power  2.8%
[0, 2, 3, 4]
GBT Model 1 training time: 58.23546814918518
GBT Accuracy: 0.7752
GBT F1: 0.775
GBT Feature importances:
                           0     1
0               1_max_gyro_x  5.7%
1               4_min_gyro_x  3.6%
2     4_min_audio_peak_power  3.6%
3               0_max_gyro_x  3.2%
4  2_avg_audio_average_power  2.6%
[0, 1, 3, 4]
GBT Model 2 training time: 57.443233013153076
GBT Accuracy: 0.8447
GBT F1: 0.844
GBT Feature importances:
                           0     1
0  2_avg_audio_average_power  3.8%
1               1_max_gyro_x  3.3%
2               4_min_gyro_x  3.2%
3     3_min_audio_peak_power  2.9%
4               4_max_gyro_x  2.9%
[0, 1, 2, 4]
GB

### LOGISTIC

In [30]:
# Estimator
lr = LogisticRegression(maxIter=1500, fitIntercept=True)

# Fit the pipeline to training documents.
lr_pipeline = Pipeline(stages=[va, lr])
scores = cross_val(df_pills, 5, lr_pipeline, 'LR')

[1, 2, 3, 4]
LR Model 0 training time: 22.835731983184814
LR Accuracy: 0.6949
LR F1: 0.693


AttributeError: 'LogisticRegressionModel' object has no attribute 'featureImportances'