# In [None]:
import pyspark
import os
import sys
from pyspark import SparkContext

# Make both the PySpark workers and the driver use the interpreter that is
# running this script.
os.environ.update(
    {
        "PYSPARK_PYTHON": sys.executable,
        "PYSPARK_DRIVER_PYTHON": sys.executable,
    }
)

from pyspark.sql import SparkSession

# Create (or reuse) the Spark session for this chapter, requesting 16g of driver memory.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "16g")
    .appName('chapter_4')
    .getOrCreate()
)

# Load the header-less CSV and let Spark infer each column's type.
csv_reader = spark.read.option("inferSchema", True).option("header", False)
data_without_header = csv_reader.csv("data/covtype.data")
data_without_header.printSchema()

from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col

# Column names for the dataset: 10 numeric features, 4 one-hot wilderness-area
# flags, 40 one-hot soil-type flags, and the "Cover_Type" label in last position.
colnames = (
    [
        "Elevation",
        "Aspect",
        "Slope",
        "Horizontal_Distance_To_Hydrology",
        "Vertical_Distance_To_Hydrology",
        "Horizontal_Distance_To_Roadways",
        "Hillshade_9am",
        "Hillshade_Noon",
        "Hillshade_3pm",
        "Horizontal_Distance_To_Fire_Points",
    ]
    + [f"Wilderness_Area_{i}" for i in range(4)]
    + [f"Soil_Type_{i}" for i in range(40)]
    + ["Cover_Type"]
)

# Apply the names and cast the label to double.
data = data_without_header.toDF(*colnames).withColumn(
    "Cover_Type", col("Cover_Type").cast(DoubleType())
)
data.head()

# Split 90/10 into training and test sets. The seed matches the one used for
# the classifiers and validators in this file, so reruns produce the same split
# and metrics stay comparable.
(train_data, test_data) = data.randomSplit([0.9, 0.1], seed=1234)

# Both splits are reused many times below, so keep them cached.
train_data.cache()
test_data.cache()

from pyspark.ml.feature import VectorAssembler

# Every column except the trailing "Cover_Type" label is a model input; pack
# those columns into a single "featureVector" column.
input_cols = colnames[:-1]
vector_assembler = (
    VectorAssembler()
    .setInputCols(input_cols)
    .setOutputCol("featureVector")
)
assembled_train_data = vector_assembler.transform(train_data)

# Preview the assembled vectors without truncating them.
assembled_train_data.select("featureVector").show(truncate=False)

from pyspark.ml.classification import DecisionTreeClassifier

# Decision tree that reads "featureVector", learns "Cover_Type", and writes
# its output to "prediction"; the seed is fixed at 1234.
classifier = (
    DecisionTreeClassifier(seed=1234)
    .setLabelCol("Cover_Type")
    .setFeaturesCol("featureVector")
    .setPredictionCol("prediction")
)
model = classifier.fit(assembled_train_data)

# Dump the learned tree structure as text.
print(model.toDebugString)

import pandas as pd

# Feature importances as a one-column pandas frame indexed by feature name,
# most important first.
pd.DataFrame(
    {"importance": model.featureImportances.toArray()},
    index=input_cols,
).sort_values(by="importance", ascending=False)

# Score the training rows with the fitted tree and show the first ten labels,
# predictions and class-probability vectors.
predictions = model.transform(assembled_train_data)
predictions.select(["Cover_Type", "prediction", "probability"]).show(10, truncate=False)

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Evaluator comparing "prediction" against "Cover_Type"; the metric is switched
# between accuracy and F1 for the two evaluations.
evaluator = (
    MulticlassClassificationEvaluator()
    .setLabelCol("Cover_Type")
    .setPredictionCol("prediction")
)
evaluator.setMetricName("accuracy").evaluate(predictions)
evaluator.setMetricName("f1").evaluate(predictions)

# Confusion matrix: one row per true label, one column per predicted label 1-7,
# with missing combinations filled in as zero.
counts_by_label = predictions.groupBy("Cover_Type").pivot("prediction", range(1,8)).count()
confusion_matrix = counts_by_label.na.fill(0.0).orderBy("Cover_Type")
confusion_matrix.show()

from pyspark.sql import DataFrame

def class_probabilities(data):
    """Return the share of rows belonging to each ``Cover_Type`` value.

    Args:
        data: Spark DataFrame with a ``Cover_Type`` column.

    Returns:
        A list of single-field rows (field ``count_proportion``), ordered by
        ``Cover_Type``. Classes that do not occur in ``data`` produce no row.
    """
    row_total = data.count()
    # Per-class row counts, ordered by label and cast to double.
    per_class = (
        data.groupBy("Cover_Type")
        .count()
        .orderBy("Cover_Type")
        .select(col("count").cast(DoubleType()))
    )
    proportions = per_class.withColumn("count_proportion", col("count")/row_total)
    return proportions.select("count_proportion").collect()

# Per-class proportions of the training and test sets, unwrapped from their
# single-field rows into plain floats.
train_prior_probabilities = [row[0] for row in class_probabilities(train_data)]
test_prior_probabilities = [row[0] for row in class_probabilities(test_data)]

# Sum over classes of (train proportion * test proportion): the accuracy
# expected from guessing labels at random according to the training priors.
sum(
    train_p * test_p
    for train_p, test_p in zip(train_prior_probabilities, test_prior_probabilities)
)

from pyspark.ml import Pipeline

# Two-stage pipeline: assemble the raw columns into "featureVector", then fit
# a decision tree on it.
assembler = VectorAssembler().setInputCols(input_cols).setOutputCol("featureVector")
classifier = (
    DecisionTreeClassifier(seed=1234)
    .setLabelCol("Cover_Type")
    .setFeaturesCol("featureVector")
    .setPredictionCol("prediction")
)
pipeline = Pipeline().setStages([assembler, classifier])

from pyspark.ml.tuning import ParamGridBuilder

# Hyperparameter grid: 2 impurities x 2 depths x 2 bin counts x 2 minimum
# information gains = 16 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.impurity, ["gini", "entropy"])
    .addGrid(classifier.maxDepth, [1, 20])
    .addGrid(classifier.maxBins, [40, 300])
    .addGrid(classifier.minInfoGain, [0.0, 0.05])
    .build()
)

# Candidates are ranked by accuracy.
multiclassEval = MulticlassClassificationEvaluator(
    labelCol="Cover_Type",
    predictionCol="prediction",
    metricName="accuracy",
)

from pyspark.ml.tuning import TrainValidationSplit

# Train/validation split over the grid, with 90% of the input used for training.
validator = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=multiclassEval,
    trainRatio=0.9,
    seed=1234,
)

# Fit the train-validation split once. Every candidate in the parameter grid is
# trained here, so this is the expensive step and is not repeated below.
validator_model = validator.fit(train_data)

from pprint import pprint

# Best pipeline found; stage 1 is the decision tree (stage 0 is the assembler).
best_model = validator_model.bestModel
pprint(best_model.stages[1].extractParamMap())

# Read the validation metrics from the model fitted above instead of fitting
# again. Pair each metric with its parameter map, best first.
params = validator_model.getEstimatorParamMaps()
metrics_and_params = sorted(
    zip(validator_model.validationMetrics, params),
    key=lambda pair: pair[0],
    reverse=True,
)
metrics_and_params

# sorted() returns a copy, so the model's own validationMetrics list keeps its
# original order and stays aligned with getEstimatorParamMaps().
metrics = sorted(validator_model.validationMetrics, reverse=True)
print(metrics[0])

# Accuracy of the best model on the held-out test set.
multiclassEval.evaluate(best_model.transform(test_data))

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def _collapse_one_hot(data, one_hot_cols, output_col):
    """Replace ``one_hot_cols`` with one integer column named ``output_col``.

    The value is the 0-based position, within ``one_hot_cols``, of the first
    column equal to 1. If a row has no such column, ``list.index`` raises
    ``ValueError`` inside the UDF when the result is computed.
    """
    # Declare the return type explicitly: udf() defaults to StringType, which
    # would force a string -> integer cast afterwards.
    unhot_udf = udf(lambda v: v.toArray().tolist().index(1), IntegerType())
    one_hot_assembler = VectorAssembler().\
        setInputCols(one_hot_cols).\
        setOutputCol(output_col)
    return one_hot_assembler.transform(data).\
        drop(*one_hot_cols).\
        withColumn(output_col, unhot_udf(col(output_col)))


def unencode_one_hot(data):
    """Collapse the one-hot wilderness and soil columns into integer columns.

    Args:
        data: Spark DataFrame containing ``Wilderness_Area_0..3`` and
            ``Soil_Type_0..39``.

    Returns:
        A DataFrame in which those 44 columns are replaced by two integer
        columns, ``wilderness`` and ``soil``, appended in that order. All
        other columns are left untouched.
    """
    wilderness_cols = [f"Wilderness_Area_{i}" for i in range(4)]
    soil_cols = [f"Soil_Type_{i}" for i in range(40)]

    with_wilderness = _collapse_one_hot(data, wilderness_cols, "wilderness")
    return _collapse_one_hot(with_wilderness, soil_cols, "soil")

# Collapse the one-hot columns of the training data and inspect the result.
unenc_train_data = unencode_one_hot(train_data)
unenc_train_data.printSchema()
unenc_train_data.groupBy('wilderness').count().show()

from pyspark.ml.feature import VectorIndexer

# Every remaining column except the label is a model input.
cols = unenc_train_data.columns
input_cols = [name for name in cols if name != 'Cover_Type']

# Assemble the inputs, then index the vector with maxCategories=40 (the
# "soil" column takes up to 40 distinct values).
assembler = VectorAssembler(inputCols=input_cols, outputCol="featureVector")
indexer = VectorIndexer(
    maxCategories=40,
    inputCol="featureVector",
    outputCol="indexedVector",
)

# Decision tree trained on the indexed vector rather than the raw one.
classifier = DecisionTreeClassifier(
    labelCol="Cover_Type",
    featuresCol="indexedVector",
    predictionCol="prediction",
)
pipeline = Pipeline(stages=[assembler, indexer, classifier])

# 0.0.5 Random Forests
# Note: the random forest grid search in this section takes a long time to run.

from pyspark.ml.classification import RandomForestClassifier

# Random forest reading the indexed feature vector; the seed is fixed at 1234.
classifier = (
    RandomForestClassifier(seed=1234)
    .setLabelCol("Cover_Type")
    .setFeaturesCol("indexedVector")
    .setPredictionCol("prediction")
)

# Same assembler and indexer stages as before, now ending in the random forest.
pipeline = Pipeline(stages=[assembler, indexer, classifier])

# Hyperparameter grid (2 x 2 x 2 x 2 = 16 combinations).
paramGrid = (
    ParamGridBuilder()
    .addGrid(classifier.impurity, ["gini", "entropy"])
    .addGrid(classifier.maxDepth, [1, 20])
    .addGrid(classifier.maxBins, [40, 300])
    .addGrid(classifier.minInfoGain, [0.0, 0.05])
    .build()
)

# Candidates are ranked by accuracy.
multiclassEval = MulticlassClassificationEvaluator(
    labelCol="Cover_Type",
    predictionCol="prediction",
    metricName="accuracy",
)

# Train/validation split over the grid, with 90% of the input used for training.
validator = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=multiclassEval,
    trainRatio=0.9,
    seed=1234,
)
validator_model = validator.fit(unenc_train_data)

# Best pipeline; stage 2 is the forest (stages 0 and 1 are assembler and indexer).
best_model = validator_model.bestModel
forest_model = best_model.stages[2]

# Feature importances paired with their column names, most important first.
feature_importance_list = sorted(
    zip(input_cols, forest_model.featureImportances.toArray()),
    key=lambda pair: pair[1],
    reverse=True,
)
pprint(feature_importance_list)

# Predict on the un-encoded test data with the label removed; show one prediction.
unenc_test_data = unencode_one_hot(test_data)
unlabeled_test_data = unenc_test_data.drop("Cover_Type")
best_model.transform(unlabeled_test_data).select("prediction").show(1)

# The random forest grid search has already been fitted on unenc_train_data
# earlier in this file, with this same assembler/indexer/classifier pipeline,
# parameter grid, accuracy evaluator and seed. Rebuilding those objects and
# fitting again would only repeat the slowest step, so the fitted
# validator_model is reused here.
best_model = validator_model.bestModel
forest_model = best_model.stages[2]  # stages: assembler, indexer, forest

# Feature importances paired with their column names, most important first.
feature_importance_list = sorted(
    zip(input_cols, forest_model.featureImportances.toArray()),
    key=lambda pair: pair[1],
    reverse=True,
)
pprint(feature_importance_list)

# Predict on the un-encoded test data with the label removed; show the first prediction.
unenc_test_data = unencode_one_hot(test_data)
best_model.transform(unenc_test_data.drop("Cover_Type")).select("prediction").show(1)
