# Machine Learning Quick Start

In [1]:
# Display the active SparkSession.
# NOTE(review): no earlier cell creates `spark`; it is presumably injected by
# the kernel (the app name reported further down is 'PySparkShell') — confirm
# before running this notebook on a plain Python kernel.
spark

In [2]:
import numpy as np
import pandas as pd
import pyspark
import sys

In [3]:
import pyspark.pandas as ps
import pyspark.sql.functions as fn

In [4]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StringIndexer

In [None]:
# Local mode
# SparkSession is not brought in by the import cells above, so import it here;
# without this the cell raises NameError.
from pyspark.sql import SparkSession

# getOrCreate() hands back the already-running session when one exists and
# only builds a new one (named "iris") otherwise.
spark = (
    SparkSession.builder
    .appName("iris")
    .getOrCreate()
)

In [None]:
# yarn mode
# SparkSession is not brought in by the import cells above, so import it here;
# without this the cell raises NameError.
from pyspark.sql import SparkSession

# NOTE(review): 99 executors x 4G is a very large reservation for the 150-row
# iris dataset used below — confirm these values suit your cluster.
# getOrCreate() hands back the already-running session when one exists and
# only builds a new one from these options otherwise.
spark = (
    SparkSession.builder
    .master("yarn")
    .config('spark.executor.instances', '99')
    .config('spark.executor.memory', '4G')
    .appName("iris")
    .getOrCreate()
)

In [5]:
# Check the application name of the active Spark session.
# NOTE(review): a value other than "iris" presumably means the session was not
# built by the "iris" builder cells above — confirm which session is in use.
spark.sparkContext.appName

'PySparkShell'

In [6]:
# Switch on the Arrow option for PySpark on this session.
# NOTE(review): per the Spark docs this setting speeds up Spark <-> pandas
# conversions — confirm it is still wanted for the steps below.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", True)

In [7]:
# Print runtime versions, starting with the
# Python interpreter version string
sys.version

'3.8.10 (default, Sep 28 2021, 16:10:42) \n[GCC 9.3.0]'

In [8]:
# Spark version reported by the active session
spark.version

'3.2.0'

### Exploring Data

In [9]:
# Load iris.csv through the pandas API on Spark (`ps`); the result is a
# pandas-on-Spark DataFrame, not a plain Spark DataFrame.
# The path is relative to the notebook's working directory.
# Plain-Spark alternative, kept for reference:
#df = spark.read.csv('file:///vagrant/data/iris.csv', header=True, inferSchema=True)
df = ps.read_csv('data/iris.csv')

                                                                                

In [10]:
# First 5 rows of the Iris dataset.
# NOTE(review): the WindowExec "No Partition Defined" warnings printed with
# this cell presumably come from the default index that pandas-on-Spark
# attaches — harmless at this size, but verify before using larger data.
df.head(5)

21/10/25 05:13:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
21/10/25 05:13:53 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [11]:
# Column dtypes as reported by the pandas-on-Spark DataFrame
df.dtypes

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) per measurement column
df.describe()

                                                                                

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.3,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [13]:
# Number of records for each species available in the dataset.
# pandas-style groupby on the pandas-on-Spark frame; the plain-Spark spelling
# would be:
#df.groupBy('species').count()
df.groupby('species').count()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-virginica,50,50,50,50
Iris-setosa,50,50,50,50
Iris-versicolor,50,50,50,50


### Feature Engineering

In [14]:
# Convert the pandas-on-Spark DataFrame into a plain Spark DataFrame for the
# pyspark.ml steps that follow.
# The isinstance guard keeps this cell re-runnable: once `df` has been
# converted, calling to_spark() on it again would raise.
if isinstance(df, ps.DataFrame):
    df = df.to_spark()

In [15]:
# Display all column names of the Spark DataFrame
df.columns

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

In [16]:
# Vectorize all numerical columns into a single 'features' column.
# Inputs are chosen by name rather than by position, so the 'species' label is
# excluded wherever it sits and an already-assembled 'features' column is never
# fed back in as an input.
feature_cols = [c for c in df.columns if c not in ('species', 'features')]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
# drop() ignores columns that are absent, so this is a no-op on the first run
# and makes a re-run safe (transform fails if the output column already exists).
df = assembler.transform(df.drop('features'))

In [17]:
# Turn the text labels in 'species' into numeric indices in a new 'label' column.
# The indexer is fitted on, and then applied to, the same two-column projection.
species_indexer = StringIndexer(inputCol='species', outputCol='label')
labelled_input = df.select('features', 'species')
label_indexer = species_indexer.fit(labelled_input)
data = label_indexer.transform(labelled_input)

In [18]:
# Keep just the two columns the model needs: the feature vector and the label index
data = data.select('features', 'label')

In [19]:
# Ready for machine learning: preview the first 10 rows of features and labels
data.show(10)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[5.1,3.5,1.4,0.2]|  0.0|
|[4.9,3.0,1.4,0.2]|  0.0|
|[4.7,3.2,1.3,0.2]|  0.0|
|[4.6,3.1,1.5,0.2]|  0.0|
|[5.0,3.6,1.4,0.2]|  0.0|
|[5.4,3.9,1.7,0.4]|  0.0|
|[4.6,3.4,1.4,0.3]|  0.0|
|[5.0,3.4,1.5,0.2]|  0.0|
|[4.4,2.9,1.4,0.2]|  0.0|
|[4.9,3.1,1.5,0.1]|  0.0|
+-----------------+-----+
only showing top 10 rows



In [20]:
# List the distinct values in the 'label' column
data.select(['label']).distinct().show()

+-----+
|label|
+-----+
|  0.0|
|  1.0|
|  2.0|
+-----+



### Split Data - Train & Test sets

In [21]:
# Split the data into a training set (70%) and a test set (30%);
# a fixed seed is passed so the split can be repeated.
train, test = data.randomSplit([0.70, 0.30], seed=42)

### Build Logistic Regression Model

In [None]:
# Change the regularization rate and you will likely get a different accuracy.
# To try it, uncomment the line below together with the `regParam=reg` variant
# in the next cell.
#reg = 0.01

In [22]:
# Fit a logistic regression model on the training set with default settings.
# The column names are spelled out; they match the estimator's defaults.
# Regularized variant (needs `reg` from the previous cell):
#lr = LogisticRegression(regParam=reg)
lr = LogisticRegression(featuresCol='features', labelCol='label')
model = lr.fit(train)

In [23]:
# Predict on the test set; the result carries the input columns plus
# rawPrediction, probability and prediction
prediction = model.transform(test)

In [24]:
# Preview the first 10 predictions next to their true labels
prediction.show(10)

+-----------------+-----+--------------------+--------------------+----------+
|         features|label|       rawPrediction|         probability|prediction|
+-----------------+-----+--------------------+--------------------+----------+
|[4.4,3.0,1.3,0.2]|  0.0|[57.9282836392713...|[1.0,5.0703690515...|       0.0|
|[4.6,3.2,1.4,0.2]|  0.0|[62.9606574586093...|[1.0,3.7754242894...|       0.0|
|[4.6,3.6,1.0,0.2]|  0.0|[82.0652768706601...|[1.0,2.5399240527...|       0.0|
|[4.7,3.2,1.3,0.2]|  0.0|[63.9088681812100...|[1.0,1.4396787676...|       0.0|
|[4.8,3.1,1.6,0.2]|  0.0|[56.2311096516647...|[1.0,3.2347529585...|       0.0|
|[4.8,3.4,1.6,0.2]|  0.0|[66.7097598115694...|[1.0,1.2822472278...|       0.0|
|[4.8,3.4,1.9,0.2]|  0.0|[62.8599454124360...|[1.0,1.2167775688...|       0.0|
|[4.9,3.1,1.5,0.1]|  0.0|[59.6681747054625...|[1.0,6.8182964663...|       0.0|
|[4.9,3.1,1.5,0.1]|  0.0|[59.6681747054625...|[1.0,6.8182964663...|       0.0|
|[5.0,2.3,3.3,1.0]|  1.0|[-14.108528506967...|[4.717

### Evaluate Model

In [25]:
# Build an accuracy evaluator for the test-set predictions.
# The column names are spelled out; they match the evaluator's defaults.
evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='accuracy',
)

In [26]:
# Score the test-set predictions with the accuracy evaluator
accuracy = evaluator.evaluate(prediction)

In [27]:
# Display the accuracy measured on the test set
accuracy

0.9782608695652174