In [1]:
# Colab-only helper: attach the user's Google Drive to this runtime's filesystem
# so files under MyDrive can be read by later cells.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

[33m0% [Working][0m            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.83)] [[0m[33m0% [Waiting for headers] [Connecting to security.ubuntu.com (185.125.190.83)] [[0m                                                                               Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:8 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [59.5 kB]
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, FloatType, StringType, IntegerType
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

# Start (or reuse) a Spark session that runs in local mode
spark = (
    SparkSession.builder
    .master("local")
    .appName("ITESO-LogisticRegression")
    .getOrCreate()
)

# Keep a handle on the underlying context and only log messages at ERROR level
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Explicit schema for Iris.csv: an integer Id, four float measurements (cm)
# and the species name. Every column is declared nullable.
iris_schema = (
    StructType()
    .add("Id", IntegerType(), True)
    .add("SepalLengthCm", FloatType(), True)
    .add("SepalWidthCm", FloatType(), True)
    .add("PetalLengthCm", FloatType(), True)
    .add("PetalWidthCm", FloatType(), True)
    .add("Species", StringType(), True)
)



# Read the CSV from the mounted Drive using the explicit schema; the first row
# is treated as a header and the parser runs in permissive mode.
iris_df = (
    spark.read
    .format("csv")
    .options(header="true", mode="permissive")
    .schema(iris_schema)
    .load("/content/drive/MyDrive/Iris.csv")
)

# Print the column types, then preview the first 10 rows without truncating cell values
iris_df.printSchema()
iris_df.show(10, truncate=False)

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: float (nullable = true)
 |-- SepalWidthCm: float (nullable = true)
 |-- PetalLengthCm: float (nullable = true)
 |-- PetalWidthCm: float (nullable = true)
 |-- Species: string (nullable = true)

+---+-------------+------------+-------------+------------+-----------+
|Id |SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|Species    |
+---+-------------+------------+-------------+------------+-----------+
|1  |5.1          |3.5         |1.4          |0.2         |Iris-setosa|
|2  |4.9          |3.0         |1.4          |0.2         |Iris-setosa|
|3  |4.7          |3.2         |1.3          |0.2         |Iris-setosa|
|4  |4.6          |3.1         |1.5          |0.2         |Iris-setosa|
|5  |5.0          |3.6         |1.4          |0.2         |Iris-setosa|
|6  |5.4          |3.9         |1.7          |0.4         |Iris-setosa|
|7  |4.6          |3.4         |1.4          |0.3         |Iris-setosa|
|8  |5.0          |3.4    

In [10]:
from pyspark.ml.feature import StringIndexer

# Encode the string "Species" column as a numeric "label" column for the classifier.
label_indexer = StringIndexer(inputCol="Species", outputCol="label")

# This cell rebinds `iris_df`, so on a second run the frame already carries a
# "label" column and StringIndexer would refuse to overwrite it. Dropping it first
# (DataFrame.drop ignores columns that are absent) makes the cell safe to re-run.
iris_unlabeled_df = iris_df.drop("label")
iris_df = label_indexer.fit(iris_unlabeled_df).transform(iris_unlabeled_df)
iris_df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- SepalLengthCm: float (nullable = true)
 |-- SepalWidthCm: float (nullable = true)
 |-- PetalLengthCm: float (nullable = true)
 |-- PetalWidthCm: float (nullable = true)
 |-- Species: string (nullable = true)
 |-- label: double (nullable = false)



In [11]:
# LinearSVC and OneVsRest were used below without ever being imported, which
# raises NameError on a fresh kernel; import everything this cell needs here.
from pyspark.ml.classification import LinearSVC, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

# Transform features into a single vector column
assembler = VectorAssembler(
    inputCols=["SepalLengthCm", "SepalWidthCm", "PetalLengthCm", "PetalWidthCm"],
    outputCol="features",
)
df = assembler.transform(iris_df).select("label", "features")

# Split the data into training and testing sets (80% training, 20% testing);
# the fixed seed keeps the split reproducible between runs.
train_df, test_df = df.randomSplit([0.8, 0.2], seed=47)

# Initialize the LinearSVC classifier for binary classification
lsvc = LinearSVC(maxIter=10, regParam=0.01)

# LinearSVC is binary-only, so wrap it in OneVsRest to handle the multi-class labels
ovr = OneVsRest(classifier=lsvc)

# Train the model
ovr_model = ovr.fit(train_df)

# Make predictions on the test set
predictions = ovr_model.transform(test_df)

# Show predictions
predictions.show()

# Evaluate the model: compare the "prediction" column against "label" using F1
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1"
)
f1 = evaluator.evaluate(predictions)
print("Test F1 Score = ", f1)

# Stop the Spark session. Any cell that uses `spark` after this point needs the
# session to be created again first.
spark.stop()

+-----+--------------------+--------------------+----------+
|label|            features|       rawPrediction|prediction|
+-----+--------------------+--------------------+----------+
|  0.0|[4.40000009536743...|[1.89002714587431...|       0.0|
|  0.0|[4.59999990463256...|[1.86239557440433...|       0.0|
|  0.0|[4.59999990463256...|[2.22994692208856...|       0.0|
|  0.0|[4.59999990463256...|[2.81315060198625...|       0.0|
|  0.0|[4.80000019073486...|[1.79357465667108...|       0.0|
|  0.0|[4.80000019073486...|[1.56938489116959...|       0.0|
|  0.0|[4.90000009536743...|[1.85213253746149...|       0.0|
|  0.0|[5.0,3.5999999046...|[2.46839265258734...|       0.0|
|  0.0|[5.09999990463256...|[2.09270853422532...|       0.0|
|  0.0|[5.09999990463256...|[2.62629466628475...|       0.0|
|  0.0|[5.19999980926513...|[2.19666996026440...|       0.0|
|  0.0|[5.40000009536743...|[1.87954352821025...|       0.0|
|  0.0|[5.40000009536743...|[2.40459296296489...|       0.0|
|  1.0|[5.5,2.2999999523