# Group 3 NHL Data Analytics Project

# Predicting event based on puck location and rink side

### Load the dataset and select the appropriate fields

In [0]:
# Read the play-by-play CSV (header row, inferred column types) and discard
# every row that has a null in any column.
plays = (
    spark.read.format('csv')
    .options(header='true', inferSchema='true')
    .load('/FileStore/tables/team_3/nhl/playss.csv')
    .dropna()
)
# Keep only the puck coordinates, the event name, and the rink side.
plays_side = plays.select(['x', 'y', 'event', 'rink_side'])
display(plays_side)

x,y,event,rink_side
0,0,Faceoff,right
28,24,Giveaway,right
52,28,Blocked Shot,right
80,-3,Shot,left
-30,-38,Hit,left
-60,39,Giveaway,right
0,0,Faceoff,right
88,-4,Shot,left
81,-27,Shot,left
-64,-35,Hit,right


### Create rink side as an integer

In [0]:
from pyspark.sql.functions import *

# Encode rink_side as an integer column 's': 'left' -> 0, 'right' -> 1.
# No otherwise() branch is given, so any other value yields a null in 's'.
plays_selected = plays_side.withColumn(
    's',
    when(col('rink_side') == 'left', 0)
    .when(col('rink_side') == 'right', 1),
)
display(plays_selected)

x,y,event,rink_side,s
0,0,Faceoff,right,1
28,24,Giveaway,right,1
52,28,Blocked Shot,right,1
80,-3,Shot,left,0
-30,-38,Hit,left,0
-60,39,Giveaway,right,1
0,0,Faceoff,right,1
88,-4,Shot,left,0
81,-27,Shot,left,0
-64,-35,Hit,right,1


### drop nulls

In [0]:
# Remove any row that still contains a null (for example a null 's'),
# then preview the first two rows.
playsDF = plays_selected.na.drop()
playsDF.show(n=2)

### Count and show the number of events

In [0]:
# Tally the rows for each event type and print the resulting table.
event_counts = playsDF.groupBy('event').count()
event_counts.show()

In [0]:
# Total number of rows in playsDF, i.e. after rows with nulls were dropped.
playsDF.count()

### show the schema

In [0]:

# Mark playsDF for caching, then print its column names and types.
playsDF.cache().printSchema()

### vector assembly, create a features column with X, Y, and the integer for rink side

In [0]:
from pyspark.ml.feature import VectorAssembler

# Pack x, y and the rink-side integer into one vector column named 'features'.
vectorAssembler = (
    VectorAssembler()
    .setInputCols(['x', 'y', 's'])
    .setOutputCol('features')
)
playside_df = vectorAssembler.transform(playsDF)

# Preview the assembled vectors next to the event names, without truncation.
playside_df.select('features', 'event').show(n=10, truncate=False)

### create a label column as event

In [0]:
from pyspark.ml.feature import StringIndexer

# Indexer that maps each 'event' string to a numeric 'label' column.
labelIndex = StringIndexer(inputCol='event', outputCol='label')

### fit the dataset and drop any nulls (again, just in case)

In [0]:
# Fit the indexer on the assembled data, append the 'label' column, and drop
# any row that contains a null.
label_model = labelIndex.fit(playside_df)
label_DF = label_model.transform(playside_df).na.drop()

In [0]:
# Render the labelled DataFrame (original columns plus 'features' and 'label').
display(label_DF)

x,y,event,rink_side,s,features,label
0,0,Faceoff,right,1,"List(1, 3, List(), List(0.0, 0.0, 1.0))",0.0
28,24,Giveaway,right,1,"List(1, 3, List(), List(28.0, 24.0, 1.0))",5.0
52,28,Blocked Shot,right,1,"List(1, 3, List(), List(52.0, 28.0, 1.0))",3.0
80,-3,Shot,left,0,"List(1, 3, List(), List(80.0, -3.0, 0.0))",1.0
-30,-38,Hit,left,0,"List(1, 3, List(), List(-30.0, -38.0, 0.0))",2.0
-60,39,Giveaway,right,1,"List(1, 3, List(), List(-60.0, 39.0, 1.0))",5.0
0,0,Faceoff,right,1,"List(1, 3, List(), List(0.0, 0.0, 1.0))",0.0
88,-4,Shot,left,0,"List(1, 3, List(), List(88.0, -4.0, 0.0))",1.0
81,-27,Shot,left,0,"List(1, 3, List(), List(81.0, -27.0, 0.0))",1.0
-64,-35,Hit,right,1,"List(1, 3, List(), List(-64.0, -35.0, 1.0))",2.0


### split the new data into testing and training datasets

In [0]:
# 70/30 train/test split, using a fixed seed for the random sampling.
atrain, atest = label_DF.randomSplit(weights=[0.7, 0.3], seed=100)

### run logistic regression

In [0]:
from pyspark.ml.classification import *

# Logistic regression on the assembled features with an elastic-net penalty
# (regParam=0.3, elasticNetParam=0.8), capped at 10 iterations.
log = LogisticRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
# Train on the training split only.
log_model = log.fit(atrain)

### show the prediction

In [0]:
# Score the held-out test split and show the first ten label/prediction pairs.
logPrediction = log_model.transform(atest)
logPrediction.select(["label", "prediction"]).show(n=10, truncate=False)

### calculate the prediction accuracy

In [0]:
# Accuracy = test rows whose predicted index equals the label / all test rows.
log_correct = logPrediction.filter("prediction==label").count()
log_total = logPrediction.count()
print("prediction accuracy is: ", log_correct / log_total)

# The prediction accuracy is very low at 23.2%
## This suggests that puck location and rink side are not indicative of the event that will happen

# Prediction accuracy increased from 23.07% to 23.2% when rink side was added in

# Cross Validation

In [0]:
# Instantiate the logistic regression model that the grid search will tune.
# The column names are spelled out to match the single-model fit earlier in
# the notebook (they are also the library defaults).
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(featuresCol='features', labelCol='label')

# Create ParamGrid for Cross Validation: 3 x 3 x 2 = 18 parameter combinations.
# regParam values are written as floats because regParam is a float parameter.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.1, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [5, 10])
             .build())

# Define the evaluator.
# 'label' is a StringIndexer index over all event types, so this is a
# multiclass problem. A binary evaluator computing areaUnderROC on the hard
# 'prediction' column does not measure multiclass quality, so cross-validation
# would pick its "best" model from a meaningless score. Multiclass accuracy is
# used instead, which is also the metric reported elsewhere in this notebook.
# The BinaryClassificationEvaluator import is retained only in case other
# cells reference the name.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="accuracy",
)

# Create 2-fold CrossValidator. The seed fixes the fold assignment so repeated
# runs select the same model (same seed value as the train/test split).
cv_lr = CrossValidator(
    estimator=lr,
    estimatorParamMaps=paramGrid,
    evaluator=evaluator,
    numFolds=2,
    seed=100,
)

# Run cross validation.
# This takes a fair amount of time because every parameter combination is
# fitted on every fold; the original run took about 3 minutes.
cv_lrModel = cv_lr.fit(atrain)

In [0]:
# Apply the cross-validated model to the held-out test split; the fitted
# cross-validator model predicts with the best model it found.
cv_lrPrediction = cv_lrModel.transform(atest)

# Score those predictions with the evaluator that was used for cross-validation.
evaluator.evaluate(dataset=cv_lrPrediction)

In [0]:
# Fraction of test rows whose predicted index matches the true label.
cv_correct = cv_lrPrediction.filter("prediction==label").count()
cv_total = cv_lrPrediction.count()
print("prediction accuracy is: ", cv_correct / cv_total)

# Cross Validation did NOT improve the model at all.