In [1]:
from pyspark.sql import SparkSession;
from pyspark.context import SparkContext
# Create the notebook's Spark session, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Logistic_Regression")
    .getOrCreate()
)

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [2]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark

In [3]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.feature import VectorAssembler

In [4]:
import pandas as pd

# Read the Excel sheet with pandas and hand the result straight to Spark, so
# `df` only ever refers to the Spark DataFrame.
df = spark.createDataFrame(pd.read_excel('./data/LR.xlsx'))
df.show(5)

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


+-------+-------+-----+--------------+
|Attempt|Success| Bait|         Place|
+-------+-------+-----+--------------+
|      1|      0|Bread|Grandma's Pond|
|      2|      1|Bread|Grandma's Pond|
|      3|      1|Bread|Grandma's Pond|
|      4|      1|Bread|Grandma's Pond|
|      5|      1|Bread|Grandma's Pond|
+-------+-------+-----+--------------+
only showing top 5 rows



- Before building the model, we need to assemble the input features into a single feature vector using the VectorAssembler class. Then, we will split the dataset into a training set (80%) and a testing set (20%).

* Indexing the categorical variables

In [5]:
from pyspark.ml.feature import OneHotEncoder, StringIndexer

# For every categorical column, fit a StringIndexer on the running frame and
# append a numeric "<column>_indexed" companion column.
indexed_df = df
for column_name in ['Bait', 'Place']:
    label_indexer = StringIndexer(inputCol=column_name, outputCol=column_name + '_indexed')
    index_model = label_indexer.fit(indexed_df)
    indexed_df = index_model.transform(indexed_df)
indexed_df.show()

+-------+-------+--------+-----------------+------------+-------------+
|Attempt|Success|    Bait|            Place|Bait_indexed|Place_indexed|
+-------+-------+--------+-----------------+------------+-------------+
|      1|      0|   Bread|   Grandma's Pond|         0.0|          0.0|
|      2|      1|   Bread|   Grandma's Pond|         0.0|          0.0|
|      3|      1|   Bread|   Grandma's Pond|         0.0|          0.0|
|      4|      1|   Bread|   Grandma's Pond|         0.0|          0.0|
|      5|      1|   Bread|   Grandma's Pond|         0.0|          0.0|
|      6|      0|   Bread|   Grandma's Pond|         0.0|          0.0|
|      7|      1|Hot Dogs|   Grandma's Pond|         1.0|          0.0|
|      8|      0|Hot Dogs|   Grandma's Pond|         1.0|          0.0|
|      9|      1|Hot Dogs|   Grandma's Pond|         1.0|          0.0|
|     10|      1|Hot Dogs|   Grandma's Pond|         1.0|          0.0|
|     11|      1|Hot Dogs|   Grandma's Pond|         1.0|       

In [6]:
# Turn each indexed column into a one-hot vector column named
# "<column>_onehot", fitting one encoder per column on the running frame.
encoded_df = indexed_df
for indexed_column in ['Bait_indexed', 'Place_indexed']:
    onehot_stage = OneHotEncoder(inputCol=indexed_column, outputCol=indexed_column + '_onehot')
    onehot_model = onehot_stage.fit(encoded_df)
    encoded_df = onehot_model.transform(encoded_df)
encoded_df.show()

+-------+-------+--------+-----------------+------------+-------------+-------------------+--------------------+
|Attempt|Success|    Bait|            Place|Bait_indexed|Place_indexed|Bait_indexed_onehot|Place_indexed_onehot|
+-------+-------+--------+-----------------+------------+-------------+-------------------+--------------------+
|      1|      0|   Bread|   Grandma's Pond|         0.0|          0.0|      (1,[0],[1.0])|       (1,[0],[1.0])|
|      2|      1|   Bread|   Grandma's Pond|         0.0|          0.0|      (1,[0],[1.0])|       (1,[0],[1.0])|
|      3|      1|   Bread|   Grandma's Pond|         0.0|          0.0|      (1,[0],[1.0])|       (1,[0],[1.0])|
|      4|      1|   Bread|   Grandma's Pond|         0.0|          0.0|      (1,[0],[1.0])|       (1,[0],[1.0])|
|      5|      1|   Bread|   Grandma's Pond|         0.0|          0.0|      (1,[0],[1.0])|       (1,[0],[1.0])|
|      6|      0|   Bread|   Grandma's Pond|         0.0|          0.0|      (1,[0],[1.0])|     

In [7]:
# Keep only the label and the one-hot encoded columns: the row counter, the raw
# strings, and the intermediate index columns are removed in a single call.
cols_to_drop = ['Attempt','Bait','Place','Bait_indexed','Place_indexed']
df = encoded_df.drop(*cols_to_drop)
df.show()

+-------+-------------------+--------------------+
|Success|Bait_indexed_onehot|Place_indexed_onehot|
+-------+-------------------+--------------------+
|      0|      (1,[0],[1.0])|       (1,[0],[1.0])|
|      1|      (1,[0],[1.0])|       (1,[0],[1.0])|
|      1|      (1,[0],[1.0])|       (1,[0],[1.0])|
|      1|      (1,[0],[1.0])|       (1,[0],[1.0])|
|      1|      (1,[0],[1.0])|       (1,[0],[1.0])|
|      0|      (1,[0],[1.0])|       (1,[0],[1.0])|
|      1|          (1,[],[])|       (1,[0],[1.0])|
|      0|          (1,[],[])|       (1,[0],[1.0])|
|      1|          (1,[],[])|       (1,[0],[1.0])|
|      1|          (1,[],[])|       (1,[0],[1.0])|
|      1|          (1,[],[])|       (1,[0],[1.0])|
|      1|          (1,[],[])|       (1,[0],[1.0])|
|      0|      (1,[0],[1.0])|           (1,[],[])|
|      0|      (1,[0],[1.0])|           (1,[],[])|
|      0|      (1,[0],[1.0])|           (1,[],[])|
|      0|      (1,[0],[1.0])|           (1,[],[])|
|      0|      (1,[0],[1.0])|  

In [8]:
# Concatenate the one-hot columns into the single "features" vector column that
# the classifier is configured to read.
# NOTE(review): `df` is reassigned in place, so re-running this cell feeds a
# frame that already has a "features" column back into the assembler --
# presumably an error; confirm before relying on re-runs.
feature_columns = ["Bait_indexed_onehot", "Place_indexed_onehot"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df = assembler.transform(df)

- Splitting the data into training and test sets

In [9]:
# 80% / 20% train/test split, seeded with 42.
split_weights = [0.8, 0.2]
train_data, test_data = df.randomSplit(split_weights, seed=42)

In [10]:
# Preview the first five rows of the training split.
train_data.show(5)

+-------+-------------------+--------------------+---------+
|Success|Bait_indexed_onehot|Place_indexed_onehot| features|
+-------+-------------------+--------------------+---------+
|      0|      (1,[0],[1.0])|       (1,[0],[1.0])|[1.0,1.0]|
|      1|      (1,[0],[1.0])|       (1,[0],[1.0])|[1.0,1.0]|
|      1|      (1,[0],[1.0])|       (1,[0],[1.0])|[1.0,1.0]|
|      0|      (1,[0],[1.0])|       (1,[0],[1.0])|[1.0,1.0]|
|      0|          (1,[],[])|       (1,[0],[1.0])|[0.0,1.0]|
+-------+-------------------+--------------------+---------+
only showing top 5 rows



In [13]:
# Fit a 10-tree random forest on the assembled "features" column, predicting
# the "Success" label. The seed is pinned (to the same value used for the
# train/test split) so that the forest's random sampling, and therefore the
# reported metrics, are reproducible across kernel restarts.
rf_cls = RandomForestClassifier(
    numTrees=10,
    featuresCol="features",
    labelCol="Success",
    seed=42,
)
model = rf_cls.fit(train_data)

In [15]:
# Apply the fitted model to the held-out split; the evaluation cell reads the
# "prediction" column from this frame.
predictions = model.transform(test_data)

In [18]:
# Accuracy plus the weighted precision and weighted recall of the predictions,
# all computed by one evaluator with the metric name overridden per call.
multi_evaluator = MulticlassClassificationEvaluator(labelCol="Success", predictionCol="prediction")
accuracy, precision, recall = (
    multi_evaluator.evaluate(predictions, {multi_evaluator.metricName: metric_name})
    for metric_name in ("accuracy", "weightedPrecision", "weightedRecall")
)


In [17]:

# Format each metric to four decimals and emit the report, one metric per line.
report_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
]
print("\n".join(report_lines))

Accuracy: 0.8333
Precision: 0.8667
Recall: 0.8333


- Gradient Boosted Trees build multiple decision trees sequentially, where each tree corrects the errors made by the previous trees.
- AdaBoost is another ensemble learning technique that combines multiple weak learners to create a strong learner. It assigns weights to the training instances and adjusts these weights at each iteration to focus on misclassified instances.

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
# NOTE: pyspark.ml.classification provides no AdaBoostClassifier; importing it
# raises ImportError and prevents the whole cell from running, so only the
# gradient-boosted trees model is trained here.
from pyspark.ml.classification import GBTClassifier

# Initialize Spark session (returns the already-running session if there is one)
spark = SparkSession.builder.appName("GradientBoostingAndAdaBoostExample").getOrCreate()

# Sample data: a binary label followed by three numeric features per row
data = [(0, 2.0, 1.0, 1.0), (1, 3.0, 2.0, 0.0), (0, 4.0, 3.0, 0.0), (1, 5.0, 4.0, 1.0)]
columns = ["label", "feature1", "feature2", "feature3"]
df = spark.createDataFrame(data, columns)

# Assemble features into a single vector column
assembler = VectorAssembler(inputCols=["feature1", "feature2", "feature3"], outputCol="features")
df = assembler.transform(df)

# Train Gradient Boosted Trees classifier
gbt = GBTClassifier(featuresCol='features', labelCol='label', maxIter=10)
gbt_model = gbt.fit(df)

# Make predictions with Gradient Boosted Trees (scored on the training rows,
# since this toy frame has no separate test split)
gbt_predictions = gbt_model.transform(df)
gbt_predictions.select("features", "label", "probability", "prediction").show()

# Stop Spark session
spark.stop()
