In [1]:
from pydataset import data # gets the 'tips' dataset
import pyspark
import pyspark.ml # the machine learning module within pyspark
from pyspark.sql.functions import *

# Get the active SparkSession, or create one if none exists yet.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Load the 'tips' dataset through pydataset and turn it into a Spark DataFrame.
df = spark.createDataFrame(data('tips'))

# Random split with weights 0.8 / 0.2; the seed is fixed so the split can be
# repeated from run to run.
train, test = df.randomSplit([0.8, 0.2], seed=123)

In [2]:
def shape(df: pyspark.sql.DataFrame):
    """Return ``(number_of_rows, number_of_columns)`` for a Spark DataFrame.

    This mirrors the ``.shape`` attribute of a pandas DataFrame. Spark has no
    built-in equivalent: it is lazy and keeps the data broken up rather than
    assembled in one place, so the row count has to be requested with
    ``count()``.
    """
    n_rows = df.count()
    n_cols = len(df.columns)
    return n_rows, n_cols

In [3]:
shape(train)

(193, 7)

In [4]:
shape(test)

(51, 7)

## Regression

- We'll first demonstrate a regression problem: predicting the tip amount.

- Spark also has functions for time-series and NLP

In [5]:
train.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     10.29| 2.6|Female|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     12.54| 2.5|  Male|    No|Sun|Dinner|   2|
|     13.37| 2.0|  Male|    No|Sat|Dinner|   2|
|     13.94|3.06|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     15.77|2.23|Female|    No|Sat|Dinner|   2|
|     16.04|2.24|  Male|    No|Sat|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinner|   3|
|     16.31| 2.0|  Male|    No|Sat|Dinner|   3|
|     16.93|3.07|Female|    No|Sat|Dinner|   3|
|     16.97| 3.5|Female|    No|Sun|Dinne

pyspark.ml.feature.RFormula

- RFormula is a feature transformer in pyspark.ml that describes what we want to predict and which columns to predict it from, using the model-formula syntax borrowed from the R programming language

- tip ~ total_bill: predict tip based on total bill
    
- tip ~ total_bill + size: predict tip based on total bill and size
    
- tip ~ .: predict tip based on all the other features in the dataset
    

In [6]:
# NB: Spark's RFormula also does the encoding for us.
# Read the formula as: "tip is a function of total_bill and size".
rf = pyspark.ml.feature.RFormula(formula="tip ~ total_bill + size").fit(train)

# transform() appends the 'features' and 'label' columns built from the formula.
rf.transform(train).show(5)

+----------+----+------+------+---+------+----+-----------+-----+
|total_bill| tip|   sex|smoker|day|  time|size|   features|label|
+----------+----+------+------+---+------+----+-----------+-----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2| [8.77,2.0]|  2.0|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|[10.27,2.0]| 1.71|
|     10.29| 2.6|Female|    No|Sun|Dinner|   2|[10.29,2.0]|  2.6|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|[10.33,3.0]| 1.67|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|[10.34,3.0]| 1.66|
+----------+----+------+------+---+------+----+-----------+-----+
only showing top 5 rows



- 'features' (a single vector column) and 'label' are the column names and layout that pyspark.ml models expect by default

- 'features' and 'label' were columns added by the RFormula

In [7]:
train_input = rf.transform(train).select('features', 'label')
train_input.show(5)

+-----------+-----+
|   features|label|
+-----------+-----+
| [8.77,2.0]|  2.0|
|[10.27,2.0]| 1.71|
|[10.29,2.0]|  2.6|
|[10.33,3.0]| 1.67|
|[10.34,3.0]| 1.66|
+-----------+-----+
only showing top 5 rows



**Create, fit, and use the model**

**NB:
Unlike sklearn, each step produces a new object!**

In [8]:
# Unfitted estimator with default settings.
lr = pyspark.ml.regression.LinearRegression()  # hyperparameters would be passed to this constructor
lr

LinearRegression_b55671c562a0

In [9]:
# print(lr.explainParams())

In [10]:
# fit() hands back a new fitted-model object (lr_fit), separate from the estimator lr.
lr_fit = lr.fit(train_input)
# transform() appends a 'prediction' column (the model's y-hat) to its input.
lr_fit.transform(train_input).show(5)

# Unlike working in pandas, where we would compute these ourselves, Spark does
# the OLS fit and produces the predictions (y-hat) for us.

+-----------+-----+------------------+
|   features|label|        prediction|
+-----------+-----+------------------+
| [8.77,2.0]|  2.0|1.8431748565237749|
|[10.27,2.0]| 1.71|1.9846902983830235|
|[10.29,2.0]|  2.6|1.9865771709411468|
|[10.33,3.0]| 1.67|2.1519321354884804|
|[10.34,3.0]| 1.66|2.1528755717675416|
+-----------+-----+------------------+
only showing top 5 rows



**Training Results:**

In [11]:
lr_fit.summary

<pyspark.ml.regression.LinearRegressionTrainingSummary at 0x11c0595d0>

In [12]:
lr_fit.summary.r2, lr_fit.summary.rootMeanSquaredError

(0.5083641431094578, 0.9254138907986432)

In [13]:
[x for x in dir(lr_fit.summary) if not x.startswith('_')] # shows me all the stuff I can use

['coefficientStandardErrors',
 'degreesOfFreedom',
 'devianceResiduals',
 'explainedVariance',
 'featuresCol',
 'labelCol',
 'meanAbsoluteError',
 'meanSquaredError',
 'numInstances',
 'objectiveHistory',
 'pValues',
 'predictionCol',
 'predictions',
 'r2',
 'r2adj',
 'residuals',
 'rootMeanSquaredError',
 'tValues',
 'totalIterations']

### How did we do on the test data?

In [14]:
# Reuse the RFormula model that was fit on train, so the test split gets the
# same 'features' / 'label' columns.
test_input = rf.transform(test)
lr_fit.transform(test_input).show(4)

+----------+----+----+------+---+------+----+-----------+-----+------------------+
|total_bill| tip| sex|smoker|day|  time|size|   features|label|        prediction|
+----------+----+----+------+---+------+----+-----------+-----+------------------+
|      9.55|1.45|Male|    No|Sat|Dinner|   2| [9.55,2.0]| 1.45|1.9167628862905841|
|      9.68|1.32|Male|    No|Sun|Dinner|   2| [9.68,2.0]| 1.32|1.9290275579183855|
|      9.94|1.56|Male|    No|Sun|Dinner|   2| [9.94,2.0]| 1.56|1.9535569011739888|
|     11.24|1.76|Male|   Yes|Sat|Dinner|   2|[11.24,2.0]| 1.76| 2.076203617452004|
+----------+----+----+------+---+------+----+-----------+-----+------------------+
only showing top 4 rows



In [15]:
# Score the model's predictions on the test split.
# The metric is named explicitly so that the value stored in `rmse` is RMSE by
# construction rather than by relying on the evaluator's default metric.
evaluator = pyspark.ml.evaluation.RegressionEvaluator(metricName='rmse')
rmse = evaluator.evaluate(lr_fit.transform(test_input))
rmse

# this is the root mean squared error on the test dataset

1.272225453071824

### Classification

- we're trying to predict 'time'

- Preprocess the training data

In [16]:
# NOTE: this rebinds `rf` and `train_input`, which held the regression formula
# model and its training input earlier in the notebook. From this cell on both
# names refer to the classification setup.
rf = pyspark.ml.feature.RFormula(formula='time ~ total_bill + size').fit(train)
train_input = rf.transform(train)
train_input.show(50)

# 'time' holds strings, so RFormula encodes it into a numeric 'label':
#   Dinner -> 0.0
#   Lunch  -> 1.0

+----------+----+------+------+---+------+----+-----------+-----+
|total_bill| tip|   sex|smoker|day|  time|size|   features|label|
+----------+----+------+------+---+------+----+-----------+-----+
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2| [8.77,2.0]|  0.0|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|[10.27,2.0]|  0.0|
|     10.29| 2.6|Female|    No|Sun|Dinner|   2|[10.29,2.0]|  0.0|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|[10.33,3.0]|  0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|[10.34,3.0]|  0.0|
|     12.54| 2.5|  Male|    No|Sun|Dinner|   2|[12.54,2.0]|  0.0|
|     13.37| 2.0|  Male|    No|Sat|Dinner|   2|[13.37,2.0]|  0.0|
|     13.94|3.06|  Male|    No|Sun|Dinner|   2|[13.94,2.0]|  0.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|[14.78,2.0]|  0.0|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|[14.83,2.0]|  0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|[15.04,2.0]|  0.0|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|[15.42,2.0]|  0.0|
|     15.7

In [17]:
# Rebinds `lr`: it was the LinearRegression estimator earlier in the notebook and
# is now an unfitted LogisticRegression estimator with default settings.
lr = pyspark.ml.classification.LogisticRegression()

In [18]:
# print(lr.explainParams())

In [19]:
lr_fit = lr.fit(train_input)

**Model Evaluation**

In [20]:
[x for x in dir(lr_fit.summary) if not x.startswith('_')]

['accuracy',
 'areaUnderROC',
 'fMeasureByLabel',
 'fMeasureByThreshold',
 'falsePositiveRateByLabel',
 'featuresCol',
 'labelCol',
 'labels',
 'objectiveHistory',
 'pr',
 'precisionByLabel',
 'precisionByThreshold',
 'predictionCol',
 'predictions',
 'probabilityCol',
 'recallByLabel',
 'recallByThreshold',
 'roc',
 'totalIterations',
 'truePositiveRateByLabel',
 'weightedFMeasure',
 'weightedFalsePositiveRate',
 'weightedPrecision',
 'weightedRecall',
 'weightedTruePositiveRate']

**Notes on area under the ROC Curve:**
    
- produce a curve where each point is the True Positive Rate versus the False Positive Rate at one decision threshold; the different points are found by adjusting the threshold for the model

- Works for classification models and other models that predict probability in addition to yes / no

- Gives us a number between 0 and 1; the closer to 1, the better (0.5 is no better than random guessing)

- When I hear 'AUC,' that's short for the 'Area Under the Curve.'

In [21]:
# area under TPR (recall) vs FPR (FP / (FP + TN)) curve
# https://en.wikipedia.org/wiki/Receiver_operating_characteristic
lr_fit.summary.areaUnderROC

# NOTE: lr_fit was fit on train_input and .summary reports on that fit, so this
# is the AUC on the *training* data, not the test data. The test-set AUC is
# computed separately with an evaluator.

0.6430830039525691

In [22]:
# AUC on the held-out test split: encode test with the fitted RFormula model,
# predict with the fitted classifier, then score the predictions.
# The metric is named explicitly so that `test_auc` is area-under-ROC by
# construction rather than by relying on the evaluator's default metric.
evaluator = pyspark.ml.evaluation.BinaryClassificationEvaluator(metricName='areaUnderROC')
test_auc = evaluator.evaluate(lr_fit.transform(rf.transform(test)))
test_auc

0.5809716599190283

In [23]:
# Encode the test split with the classification RFormula model (`rf` was refit
# with 'time' as the target). This rebinds `test_input`, which earlier held the
# regression-encoded test data.
test_input = rf.transform(test)
test_input.show()

+----------+----+------+------+----+------+----+-----------+-----+
|total_bill| tip|   sex|smoker| day|  time|size|   features|label|
+----------+----+------+------+----+------+----+-----------+-----+
|      9.55|1.45|  Male|    No| Sat|Dinner|   2| [9.55,2.0]|  0.0|
|      9.68|1.32|  Male|    No| Sun|Dinner|   2| [9.68,2.0]|  0.0|
|      9.94|1.56|  Male|    No| Sun|Dinner|   2| [9.94,2.0]|  0.0|
|     11.24|1.76|  Male|   Yes| Sat|Dinner|   2|[11.24,2.0]|  0.0|
|     12.69| 2.0|  Male|    No| Sat|Dinner|   2|[12.69,2.0]|  0.0|
|     15.06| 3.0|Female|    No| Sat|Dinner|   2|[15.06,2.0]|  0.0|
|     16.99|1.01|Female|    No| Sun|Dinner|   2|[16.99,2.0]|  0.0|
|     17.81|2.34|  Male|    No| Sat|Dinner|   4|[17.81,4.0]|  0.0|
|     18.04| 3.0|  Male|    No| Sun|Dinner|   2|[18.04,2.0]|  0.0|
|     18.43| 3.0|  Male|    No| Sun|Dinner|   4|[18.43,4.0]|  0.0|
|     21.58|3.92|  Male|    No| Sun|Dinner|   2|[21.58,2.0]|  0.0|
|     25.29|4.71|  Male|    No| Sun|Dinner|   4|[25.29,4.0]|  

In [24]:
# Confusion matrix for the test data.
# Rows are the predicted class; columns are the actual class ('label').
(lr_fit.transform(test_input)
 .groupby('prediction')        # predicted values are the rows
 .pivot('label', [0.0, 1.0])   # actual values are the columns; listing both classes
                               # keeps a column even if a class has no rows
 .count()
 .na.fill(0)                   # a (prediction, label) pair that never occurs is a count of 0, not null
 .orderBy('prediction')        # stable row order
 .show())

+----------+---+---+
|prediction|0.0|1.0|
+----------+---+---+
|       0.0| 38| 13|
+----------+---+---+



In [25]:
# Many other preprocessing steps are available in pyspark.ml.feature.
# List only its public names, skipping private/dunder entries (the same filter
# used for the model-summary listings earlier in this notebook).
[name for name in dir(pyspark.ml.feature) if not name.startswith('_')]

['Binarizer',
 'BucketedRandomProjectionLSH',
 'BucketedRandomProjectionLSHModel',
 'Bucketizer',
 'ChiSqSelector',
 'ChiSqSelectorModel',
 'CountVectorizer',
 'CountVectorizerModel',
 'DCT',
 'DecisionTreeParams',
 'ElementwiseProduct',
 'FeatureHasher',
 'HasAggregationDepth',
 'HasCheckpointInterval',
 'HasCollectSubModels',
 'HasDistanceMeasure',
 'HasElasticNetParam',
 'HasFeaturesCol',
 'HasFitIntercept',
 'HasHandleInvalid',
 'HasInputCol',
 'HasInputCols',
 'HasLabelCol',
 'HasLoss',
 'HasMaxIter',
 'HasNumFeatures',
 'HasOutputCol',
 'HasOutputCols',
 'HasParallelism',
 'HasPredictionCol',
 'HasProbabilityCol',
 'HasRawPredictionCol',
 'HasRegParam',
 'HasSeed',
 'HasSolver',
 'HasStandardization',
 'HasStepSize',
 'HasThreshold',
 'HasThresholds',
 'HasTol',
 'HasVarianceCol',
 'HasWeightCol',
 'HashingTF',
 'IDF',
 'IDFModel',
 'Imputer',
 'ImputerModel',
 'IndexToString',
 'JavaEstimator',
 'JavaMLReadable',
 'JavaMLWritable',
 'JavaModel',
 'JavaParams',
 'JavaTransformer'

# Curriculum - 