<a href="https://colab.research.google.com/github/miomelliot/March-projects/blob/main/5_1_%D0%90%D0%BD%D0%B0%D0%BB%D0%B8%D1%82%D0%B8%D0%BA%D0%B0_%D0%B2_%D0%BC%D0%B5%D0%B4%D0%B8%D1%86%D0%B8%D0%BD%D0%B5_%D0%9C%D0%B5%D1%82%D1%80%D0%B8%D0%BA%D0%B8_%D0%B4%D0%B0%D0%BD%D0%BD%D1%8B%D1%85.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install pyspark seaborn

In [2]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("IrisLR")
    .getOrCreate()
)

# Last expression of the cell: display the session object.
spark

In [3]:
import os
import shutil
import urllib.request

iris_path = "/content/iris.csv"

# Download the dataset only when it is not already on disk.
if not os.path.exists(iris_path):
    url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv"
    # Stream into a temporary sibling file and move it into place only after the
    # copy has finished. Writing directly to iris_path would leave a truncated
    # file behind if the transfer failed, and the existence check above would
    # then treat that broken file as a valid cached copy on every later run.
    tmp_path = iris_path + ".part"
    try:
        with urllib.request.urlopen(url) as response, open(tmp_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
        os.replace(tmp_path, iris_path)
    finally:
        # After a successful os.replace the temp file no longer exists, so this
        # only removes a partial download left by a failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
print("Dataset ready at", iris_path)

Dataset ready at /content/iris.csv


In [4]:
# Load the CSV saved by the download cell. Reusing iris_path (instead of
# repeating the literal path) keeps the download and the load pointing at the
# same file. The first row is the header and column types are inferred.
iris_df = spark.read.csv(iris_path, header=True, inferSchema=True)

# Preview the first rows and the inferred schema.
iris_df.show(5)
iris_df.printSchema()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
+------------+-----------+------------+-----------+-------+
only showing top 5 rows

root
 |-- sepal_length: double (nullable = true)
 |-- sepal_width: double (nullable = true)
 |-- petal_length: double (nullable = true)
 |-- petal_width: double (nullable = true)
 |-- species: string (nullable = true)



In [5]:
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Stage 1: turn the string "species" column into a numeric "label" column.
label_indexer = StringIndexer(inputCol="species", outputCol="label")

# Stage 2: pack the four measurement columns into a single "features" vector.
feature_cols = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

# Stage 3: multinomial logistic regression with regularisation switched off
# (regParam=0.0) and at most 100 iterations.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=100,
    regParam=0.0,
    elasticNetParam=0.0,
    family="multinomial",
)

# Chain the stages so they are fitted and applied together, in this order.
stages = [label_indexer, assembler, lr]
pipeline = Pipeline(stages=stages)

In [6]:
# Split the rows roughly 80/20 with a fixed seed, and cache both splits: they are
# reused for the size report below and again when fitting and scoring, so
# caching avoids re-reading the CSV and re-running the split for each action.
train_df, test_df = (
    split.cache() for split in iris_df.randomSplit([0.8, 0.2], seed=42)
)

# Count each split once and reuse the numbers.
train_count, test_count = train_df.count(), test_df.count()

# Sanity check: every input row must land in exactly one of the two splits.
assert train_count + test_count == iris_df.count(), (
    "train/test split does not partition the dataset"
)
print(f"Training set size: {train_count}, Test set size: {test_count}")

Training set size: 126, Test set size: 24


In [7]:
# Fit the whole pipeline on the training split only; the test split is not
# passed to fit and is used later for evaluation.
model = pipeline.fit(train_df)

In [8]:
def evaluate(df, split_name, metric_name="accuracy", fitted_model=None):
    """Score a fitted pipeline on one data split and print the result.

    Args:
        df: DataFrame to score; it is passed to the model's ``transform``.
        split_name: Label for the split, used as the prefix of the printed line.
        metric_name: Value passed to ``MulticlassClassificationEvaluator`` as
            ``metricName``. Defaults to ``"accuracy"``.
        fitted_model: Fitted model to score with. When ``None`` the
            notebook-level ``model`` is used, so existing two-argument calls
            behave exactly as before.

    Returns:
        The metric value returned by the evaluator.
    """
    # Accept the model explicitly so the function does not depend on notebook
    # state, but fall back to the global for backward compatibility.
    scorer = model if fitted_model is None else fitted_model
    preds = scorer.transform(df)
    evaluator = MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric_name,
    )
    score = evaluator.evaluate(preds)
    # With the default metric this prints e.g. "Train accuracy: 0.9841".
    print(f"{split_name} {metric_name}: {score:.4f}")
    return score

# Score the fitted model on the split it was trained on and on the held-out
# test split; each call prints its accuracy and returns it.
train_acc = evaluate(train_df, "Train")
test_acc = evaluate(test_df, "Test")

Train accuracy: 0.9841
Test accuracy: 1.0000


In [9]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` cannot be re-run after this without recreating the session.
spark.stop()