# In [None]:
## model

from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import *
import numpy as np

# Binary target: 1.0 when the day's temperature exceeds that month's average
# temperature, 0.0 otherwise.
fdf = final.withColumn(
    "label",
    when(col("DailyTemperature") > col("AverageTemperaturePerMonth"), 1.0).otherwise(0.0),
)

# Map the Location column to a numeric index so it can be assembled.
indexer = StringIndexer(inputCol='Location', outputCol='LocationIndex')

# 'label' must NOT be part of the feature vector: feeding the target to the
# model as an input is target leakage and makes every metric meaningless.
# NOTE(review): the label is derived from DailyTemperature and
# AverageTemperaturePerMonth, which are both still features, so the classes
# remain separable from those two columns alone -- confirm this is intended.
# NOTE(review): LocationIndex enters the model as a single numeric column
# (OneHotEncoder is imported but unused) -- confirm an ordinal encoding is wanted.
assembler = VectorAssembler(
    inputCols=[
        'DailyTemperature',
        'NumberOfCrimesPerDay',
        'AverageCrimesPerMonth',
        'AverageTemperaturePerMonth',
        'LocationIndex',
    ],
    outputCol='features',
)

# Fit the indexer on the full dataset and materialize the 'features' column.
pipe = Pipeline(stages=[indexer, assembler])
transformed = pipe.fit(fdf).transform(fdf)


# 70/30 train/test split. A fixed seed keeps the split -- and therefore every
# coefficient and metric reported below -- reproducible between runs.
train, test = transformed.randomSplit([0.7, 0.3], seed=42)

# Baseline logistic regression; relies on the estimator's default column
# names ('features' and 'label'), which the transformed DataFrame provides.
lr = LogisticRegression()

model = lr.fit(train)
results = model.transform(test)

print("Coefficients: ", model.coefficients)
print("Intercept: ", model.intercept)

# Baseline: 3-fold cross-validation of the default LogisticRegression, scored
# by area under the ROC curve. No addGrid() calls are made, so no
# hyperparameters are varied in this run.
evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")
grid = ParamGridBuilder().build()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator, numFolds=3)
# NOTE: from here on `cv` refers to the fitted CrossValidatorModel, not the
# CrossValidator estimator (the name is kept for compatibility).
cv = cv.fit(train)
# A bare `cv.avgMetrics` expression in the middle of a script is evaluated and
# thrown away; print it so the baseline score is actually reported.
print("Baseline CV average metrics: ", cv.avgMetrics)

import pandas as pd

# Search space: six regularization strengths crossed with the two extreme
# elastic-net mixing values (0 and 1), i.e. 12 parameter combinations.
grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.0, 0.2, 0.4, 0.6, 0.8, 1.0])
    .addGrid(lr.elasticNetParam, [0, 1])
    .build()
)
print('Number of models to be tested: ', len(grid))

# Cross-validate every combination in the grid with the shared evaluator;
# numFolds is left at the library default.
cv = CrossValidator(evaluator=evaluator, estimatorParamMaps=grid, estimator=lr)

# Fitting trains one model per (fold, parameter map) pair and records the
# average metric for each parameter map.
all_models = cv.fit(train)
print("Average Metrics for Each model: ", all_models.avgMetrics)

# Pick the parameter map whose cross-validated average metric is the highest.
best_index = np.argmax(all_models.avgMetrics)
hyperparams = all_models.getEstimatorParamMaps()[best_index]

# Print each (Param, value) pair of the winning configuration. Iterating the
# items directly avoids rebuilding a list on every pass just to index into it.
for param_and_value in hyperparams.items():
    print(param_and_value)

bestModel = all_models.bestModel
# NOTE(review): per the Spark docs the model summary is computed on the
# training data, so this is presumably the training AUC -- the held-out
# score is the evaluator result printed at the end.
print("Area under ROC curve:", bestModel.summary.areaUnderROC)

# Score the held-out test split with the best model and report its AUC.
test_results = bestModel.transform(test)
test_results.show()
print(evaluator.evaluate(test_results))