In [2]:
# Attach Google Drive to the Colab VM; the dataset is read from under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [3]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [59.5 kB]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,107 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:9 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,610 kB]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2,

In [13]:
# Import Required Libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, FloatType, StringType, IntegerType
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.functions import when

# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("SVM")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Restrict Spark's console logging to ERROR-level messages.
spark.sparkContext.setLogLevel("ERROR")

# Explicit schema for Iris.csv: five nullable float columns followed by the
# nullable string column holding the species name.
float_column_names = [
    "Id",
    "SepalLengthCm",
    "SepalWidthCm",
    "PetalLengthCm",
    "PetalWidthCm",
]
iris_schema = StructType(
    [StructField(column_name, FloatType(), True) for column_name in float_column_names]
    + [StructField("Species", StringType(), True)]
)

# Read the iris CSV from the mounted Google Drive with the explicit schema.
# The first line is treated as a header, and the parser runs in "permissive" mode.
IRIS_CSV_PATH = "/content/drive/MyDrive/ColabNotebooks/datasets/Iris.csv"

iris_reader = (
    spark.read.format("csv")
    .options(header="true", mode="permissive")
    .schema(iris_schema)
)
iris_df = iris_reader.load(IRIS_CSV_PATH)

In [14]:
# Show which species names occur in the data before encoding them.
distinct_species = iris_df.select("Species").distinct()
distinct_species.show()

# Encode each species name as an integer class id (0, 1, 2).
# Any name that is not listed below is mapped to null.
species_name = iris_df["Species"]
species_code = (
    when(species_name == "Iris-setosa", 0)
    .when(species_name == "Iris-versicolor", 1)
    .when(species_name == "Iris-virginica", 2)
    .otherwise(None)
)
df_with_numeric_species = iris_df.withColumn("Species", species_code)

+---------------+
|        Species|
+---------------+
| Iris-virginica|
|    Iris-setosa|
|Iris-versicolor|
+---------------+



In [17]:
# Measurement columns that feed the classifier. Defined once so the null filter
# and the VectorAssembler below always operate on the same set of columns.
feature_columns = ["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"]

# Drop every row the model cannot use:
#   - rows whose species was not recognised (encoded as null by the mapping cell), and
#   - rows with a missing measurement. The CSV is read in permissive mode, so an
#     unparseable value becomes null, and VectorAssembler raises on null inputs
#     under its default handleInvalid="error".
df_without_nulls = df_with_numeric_species.na.drop(subset=feature_columns + ["Species"])

# Make sure the class id column is an integer type.
df_with_numeric_species = df_without_nulls.withColumn(
    "Species", df_without_nulls["Species"].cast(IntegerType())
)

# Vectorize features: pack the four measurements into a single "features" vector column.
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features"
)
df_with_features = assembler.transform(df_with_numeric_species).select("features", "Species")

# Rename the class id column to "label", the column name used by the evaluator.
df_with_features = df_with_features.withColumnRenamed("Species", "label")

In [18]:
# Train the classifier.
# The base learner is a linear SVM with its hyperparameters collected in one place.
svm_params = {"maxIter": 10, "regParam": 0.01}
lsvc = LinearSVC(**svm_params)

# OneVsRest wraps the SVM so it can be used for multi-class classification.
ovr = OneVsRest(classifier=lsvc)

# Fit on the assembled feature/label DataFrame.
ovr_model = ovr.fit(df_with_features)

In [19]:
# Score the fitted model.
# NOTE(review): predictions are generated for df_with_features, the same DataFrame
# the model was fitted on, so this is a training-set score, not a held-out estimate.
predictions = ovr_model.transform(df_with_features)

# F1 over all classes, comparing the "label" column with the "prediction" column.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)
f1_score = evaluator.evaluate(predictions)

In [20]:
# Report the F1 score computed by the evaluator.
f1_report = f"F1 Score: {f1_score}"
print(f1_report)

F1 Score: 0.920045695045695


In [21]:
# Stop the Spark session now that the notebook is finished with it.
# Cells that use `spark` will not work after this until the session cell is run again.
spark.stop()