In [1]:
import findspark
findspark.init("C:/spark") 

from pyspark import SparkContext
sc= SparkContext("local","pyspark week6")

In [2]:
from pyspark.sql import SparkSession
spark= SparkSession.builder.getOrCreate()

In [3]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

### Dataset description

- target variable is categorical variable : forest cover type (7 types)
- has geological features
- soil type is categorical having 40 types, wilderness area has 4 categorical values (one hot encoded)

### impl
- load csv file without header.
- we use the colnames
- using data in calling df, append all rows with that column name using `toDF` method.
    - convert target column to DoubleType() as pyspark.ml needs target values need to be of double datatype.
- train and test split is 90% and 10% . 
    * any transformation on rdd is recomputed everytime. We use .cache() which is used to store data (similar to views in sql). `train.cache()`, `test.cache()`
- Need tyo create feature vector out of all features present. 54 columns combined into one is feature vector. `VectorAssemler` creates the feature vector. then we call `transfrom ` method to construct the vector.

In [4]:
data_without_header= spark.read.option('header',False).option("InferSchema",True).csv('covtype.data')

In [5]:
data_without_header.show(3)

+----+---+---+---+---+----+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
| _c0|_c1|_c2|_c3|_c4| _c5|_c6|_c7|_c8| _c9|_c10|_c11|_c12|_c13|_c14|_c15|_c16|_c17|_c18|_c19|_c20|_c21|_c22|_c23|_c24|_c25|_c26|_c27|_c28|_c29|_c30|_c31|_c32|_c33|_c34|_c35|_c36|_c37|_c38|_c39|_c40|_c41|_c42|_c43|_c44|_c45|_c46|_c47|_c48|_c49|_c50|_c51|_c52|_c53|_c54|
+----+---+---+---+---+----+---+---+---+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+----+
|2596| 51|  3|258|  0| 510|221|232|148|6279|   1|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0|   0| 

##### numeriuc columns :  
* Elevation, Aspect, Slope, Horizontal_Distance_To_Hydrology, Vertical_Distance_To_Hydrology,   
Horizontal_Distance_To_Roadways, Hillshade_9am, Hillshade_Noon, Hillshade_3pm, Horizontal_Distance_To_Fire_Points

##### categorical data:
- Wilderness_Area-----------Categorical with 4 distinct values
- Soil_Type-----------------Categorical with 40 distinct values
- Cover_Type----------------Categorical with 7 distinct values (`target variable`)


In [6]:
columns= [
    "Elevation", "Aspect", "Slope","Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology",
    "Horizontal_Distance_To_Roadways", "Hillshade_9am", "Hillshade_Noon", "Hillshade_3pm"," Horizontal_Distance_To_Fire_Points",
    *[f"Wilderness_Area{i+1}" for i in range(4)],
    *[f"Soil_Type{i+1}" for i in range(40)],
    "Cover_Type"
]
print(len(columns))


55


In [7]:
df_data= data_without_header.toDF(*columns).withColumn("Cover_Type", col("Cover_Type").cast(DoubleType()))
df_data.show(3)

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+-----------------------------------+----------------+----------------+----------------+----------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+----------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_Noon|Hillshade_3pm| Horizontal_Distance_To_Fire_Points|Wilderness_Area1|Wilderness_Area2|Wilderness_Area3|

In [8]:
from pyspark.ml.feature import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [9]:
vector_cols= columns[:-1]
len(vector_cols)

54

In [10]:
featureVector= VectorAssembler(
    inputCols=vector_cols, outputCol="features"
)
featureVector

VectorAssembler_046076a4e634

In [13]:
# fin_out= featureVector.transform(df_data)
# fin_out.show(5)

In [11]:
(train_data, test_data)= df_data.randomSplit([0.9,0.1])
train_data.cache()
test_data.cache()

DataFrame[Elevation: int, Aspect: int, Slope: int, Horizontal_Distance_To_Hydrology: int, Vertical_Distance_To_Hydrology: int, Horizontal_Distance_To_Roadways: int, Hillshade_9am: int, Hillshade_Noon: int, Hillshade_3pm: int,  Horizontal_Distance_To_Fire_Points: int, Wilderness_Area1: int, Wilderness_Area2: int, Wilderness_Area3: int, Wilderness_Area4: int, Soil_Type1: int, Soil_Type2: int, Soil_Type3: int, Soil_Type4: int, Soil_Type5: int, Soil_Type6: int, Soil_Type7: int, Soil_Type8: int, Soil_Type9: int, Soil_Type10: int, Soil_Type11: int, Soil_Type12: int, Soil_Type13: int, Soil_Type14: int, Soil_Type15: int, Soil_Type16: int, Soil_Type17: int, Soil_Type18: int, Soil_Type19: int, Soil_Type20: int, Soil_Type21: int, Soil_Type22: int, Soil_Type23: int, Soil_Type24: int, Soil_Type25: int, Soil_Type26: int, Soil_Type27: int, Soil_Type28: int, Soil_Type29: int, Soil_Type30: int, Soil_Type31: int, Soil_Type32: int, Soil_Type33: int, Soil_Type34: int, Soil_Type35: int, Soil_Type36: int, S

In [12]:
fin_out= featureVector.transform(train_data)
fin_out.show(5)

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+-----------------------------------+----------------+----------------+----------------+----------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+----------+--------------------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_Noon|Hillshade_3pm| Horizontal_Distance_To_Fire_Points|Wilderness_Area1|Wilderness_Ar

In [15]:
fin_out.select('features').show(2, truncate=False)

+--------------------------------------------------------------------------------------------------+
|features                                                                                          |
+--------------------------------------------------------------------------------------------------+
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1859.0,18.0,12.0,67.0,11.0,90.0,211.0,215.0,139.0,792.0,1.0,1.0])|
|(54,[0,1,2,3,4,5,6,7,8,9,13,15],[1861.0,35.0,14.0,60.0,11.0,85.0,218.0,209.0,124.0,832.0,1.0,1.0])|
+--------------------------------------------------------------------------------------------------+
only showing top 2 rows



In [16]:
tree= DecisionTreeClassifier(
    labelCol='Cover_Type',
    featuresCol='features',
    maxDepth=15
)

model= tree.fit(fin_out)

In [17]:
preds= model.transform(fin_out)

preds.show(5)

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+-----------------------------------+----------------+----------------+----------------+----------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+----------+--------------------+--------------------+--------------------+----------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_Noon|Hillshade_3pm| Horizontal_D

In [18]:
#print(model.toDebugString)

In [19]:

conf_matrix = preds.groupBy("Cover_Type") \
    .pivot("prediction") \
    .count() \
    .fillna(0)  # fill missing combinations with 0
conf_matrix.show()

+----------+------+------+-----+----+----+-----+-----+
|Cover_Type|   1.0|   2.0|  3.0| 4.0| 5.0|  6.0|  7.0|
+----------+------+------+-----+----+----+-----+-----+
|       7.0|  2536|   111|    0|   0|   0|    0|15775|
|       1.0|160954| 28735|   17|   0| 102|   25|  849|
|       4.0|     0|     0|  299|1985|   0|  191|    0|
|       3.0|    85|  2341|27121|  99|   2| 2572|    0|
|       2.0| 18942|233849|  708|   1| 421|  903|  221|
|       6.0|   118|  2816| 2099|  20|   2|10578|    0|
|       5.0|   420|  4413|   47|   0|3615|   18|    0|
+----------+------+------+-----+----+----+-----+-----+



In [20]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="Cover_Type",      
    predictionCol="prediction", 
    metricName="accuracy"       
)

accuracy = evaluator.evaluate(preds)
print(f"train Accuracy = {accuracy:.4f}")


train Accuracy = 0.8679


In [21]:
test_out = featureVector.transform(test_data)

preds_test = model.transform(test_out)

test_acc = evaluator.evaluate(preds_test)

In [22]:
print(f"Test Accuracy  = {test_acc:.4f}")

Test Accuracy  = 0.8524


In [23]:
cf= preds_test.groupBy('Cover_Type').pivot('prediction').count().fillna(0)
cf.show()

+----------+-----+-----+----+---+---+----+----+
|Cover_Type|  1.0|  2.0| 3.0|4.0|5.0| 6.0| 7.0|
+----------+-----+-----+----+---+---+----+----+
|       7.0|  333|   16|   0|  0|  0|   0|1739|
|       1.0|17598| 3412|   6|  0| 13|   1| 128|
|       4.0|    0|    0|  33|206|  0|  33|   0|
|       3.0|    8|  287|2846| 24|  3| 366|   0|
|       2.0| 2282|25626| 103|  2| 79| 114|  50|
|       6.0|   14|  320| 311|  8|  0|1081|   0|
|       5.0|   52|  550|  13|  0|362|   3|   0|
+----------+-----+-----+----+---+---+----+----+



In [24]:
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [None]:
dt = DecisionTreeClassifier(
    labelCol="Cover_Type",
    featuresCol="features"
)

In [26]:
pipeline = Pipeline(stages=[featureVector, dt])

pargrid = (ParamGridBuilder()
    .addGrid(dt.maxDepth, [5, 10, 15])          #depth of tree
    .addGrid(dt.minInstancesPerNode, [1, 4, 5]) #minimum samples per leaf
    .addGrid(dt.impurity, ["gini", "entropy"])   # splitting based on
    .build()
)

evaluator = MulticlassClassificationEvaluator(
    labelCol="Cover_Type",
    predictionCol="prediction",
    metricName="accuracy"
)

In [27]:
tvs = TrainValidationSplit(
    estimator=pipeline,
    estimatorParamMaps=pargrid,
    evaluator=evaluator,
    trainRatio=0.8   # 80% train split (on training set only its splitted)
)

In [28]:
tvsModel = tvs.fit(train_data)

In [29]:
preds = tvsModel.transform(test_data)

# Accuracy
accuracy = evaluator.evaluate(preds)
print(f"Test Accuracy after tuning = {accuracy:.4f}")


Test Accuracy after tuning = 0.8524


In [30]:
# Best model params
bestModel = tvsModel.bestModel.stages[-1]  # last stage is DecisionTree
print("Best maxDepth:", bestModel.getOrDefault("maxDepth"))
print("Best minInstancesPerNode:", bestModel.getOrDefault("minInstancesPerNode"))
print("Best impurity:", bestModel.getOrDefault("impurity"))

Best maxDepth: 15
Best minInstancesPerNode: 1
Best impurity: entropy


In [31]:
preds.show(5)

+---------+------+-----+--------------------------------+------------------------------+-------------------------------+-------------+--------------+-------------+-----------------------------------+----------------+----------------+----------------+----------------+----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+-----------+----------+--------------------+--------------------+--------------------+----------+
|Elevation|Aspect|Slope|Horizontal_Distance_To_Hydrology|Vertical_Distance_To_Hydrology|Horizontal_Distance_To_Roadways|Hillshade_9am|Hillshade_Noon|Hillshade_3pm| Horizontal_D

In [32]:
cf= preds.groupBy('Cover_Type').pivot('prediction').count().fillna(0)
cf.show()

+----------+-----+-----+----+---+---+----+----+
|Cover_Type|  1.0|  2.0| 3.0|4.0|5.0| 6.0| 7.0|
+----------+-----+-----+----+---+---+----+----+
|       7.0|  284|   40|   0|  0|  0|   0|1764|
|       1.0|17479| 3554|   1|  0| 12|   1| 111|
|       4.0|    0|    0|  40|214|  0|  18|   0|
|       3.0|    0|  178|3080| 37|  6| 233|   0|
|       2.0| 2798|25129|  81|  0|110| 128|  10|
|       6.0|    3|  248| 223|  8|  1|1251|   0|
|       5.0|   10|  396|  27|  0|542|   5|   0|
+----------+-----+-----+----+---+---+----+----+

