In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.datasets import load_iris
import warnings
import tempfile
import shutil

In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession
# Silence every Python warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation notices; consider
# narrowing the filter to the specific warning category that was noisy.
warnings.simplefilter('ignore')

In [5]:
# Display the installed PySpark version so readers know which API the
# recorded outputs were produced with.
pyspark.__version__

'2.3.2'

In [6]:
# Scratch directory used to hand the data from pandas to Spark via a CSV file.
# NOTE(review): mkdtemp() leaves cleanup to the caller; confirm a later cell
# removes this directory (e.g. shutil.rmtree(tmp_path)).
tmp_path = tempfile.mkdtemp()

In [8]:
# Load the iris dataset as a (features, target) pair instead of a Bunch object.
x, y = load_iris(return_X_y = True)

In [9]:
# Dimensions of the feature matrix (rows, columns).
x.shape

(150, 4)

In [10]:
# Dimensions of the target array; it is 1-D at this point.
y.shape

(150,)

In [11]:
# Turn the 1-D target into a single-column 2-D array so it can be joined to
# the feature matrix column-wise. The name `y` is rebound, so the original
# 1-D array is no longer reachable after this cell.
y = y.reshape(-1, 1)
y.shape

(150, 1)

In [14]:
# Append the target column to the right of the feature columns.
data = np.hstack((x, y))

In [18]:
# Wrap the combined array in a pandas frame: four feature columns a-d plus the label.
df = pd.DataFrame(data=data, columns=list('abcd') + ['label'])
df.head()

Unnamed: 0,a,b,c,d,label
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [19]:
# Persist the frame as data.csv inside the scratch directory; index=False keeps
# the pandas row index out of the file so Spark only sees the five data columns.
df.to_csv(os.path.join(tmp_path, 'data.csv'), index=False)

In [20]:
# Sanity check: show the scratch directory and the files it now contains.
'{}, files:{}'.format(tmp_path, os.listdir(tmp_path))

"C:\\Users\\GUANGQ~1.LU\\AppData\\Local\\Temp\\tmpo8vtcztq, files:['data.csv']"

In [21]:
# Reuse the active SparkSession if one exists, otherwise start one with
# default settings (no master or app name is configured here).
spark = SparkSession.builder.getOrCreate()

In [22]:
# Read the CSV back as a Spark DataFrame: the first line supplies column names
# and column types are inferred from the data.
df_spark = spark.read.csv(
    os.path.join(tmp_path, 'data.csv'),
    header=True,
    inferSchema=True,
)

In [23]:
# Confirm the loader returned a Spark DataFrame rather than a pandas one.
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [24]:
# Print the leading rows as a text table (show() prints 20 rows by default).
df_spark.show()

+---+---+---+---+-----+
|  a|  b|  c|  d|label|
+---+---+---+---+-----+
|5.1|3.5|1.4|0.2|  0.0|
|4.9|3.0|1.4|0.2|  0.0|
|4.7|3.2|1.3|0.2|  0.0|
|4.6|3.1|1.5|0.2|  0.0|
|5.0|3.6|1.4|0.2|  0.0|
|5.4|3.9|1.7|0.4|  0.0|
|4.6|3.4|1.4|0.3|  0.0|
|5.0|3.4|1.5|0.2|  0.0|
|4.4|2.9|1.4|0.2|  0.0|
|4.9|3.1|1.5|0.1|  0.0|
|5.4|3.7|1.5|0.2|  0.0|
|4.8|3.4|1.6|0.2|  0.0|
|4.8|3.0|1.4|0.1|  0.0|
|4.3|3.0|1.1|0.1|  0.0|
|5.8|4.0|1.2|0.2|  0.0|
|5.7|4.4|1.5|0.4|  0.0|
|5.4|3.9|1.3|0.4|  0.0|
|5.1|3.5|1.4|0.3|  0.0|
|5.7|3.8|1.7|0.3|  0.0|
|5.1|3.8|1.5|0.3|  0.0|
+---+---+---+---+-----+
only showing top 20 rows



In [25]:
# Check the column types produced by schema inference.
df_spark.printSchema()

root
 |-- a: double (nullable = true)
 |-- b: double (nullable = true)
 |-- c: double (nullable = true)
 |-- d: double (nullable = true)
 |-- label: double (nullable = true)



In [27]:
from pyspark.ml.feature import VectorAssembler

In [28]:
# Pack the scalar column 'a' into a one-element vector column 'a_vector';
# the scaler used afterwards takes a vector column as input.
assembler = VectorAssembler(inputCols = ['a'], outputCol = 'a_vector')

In [29]:
# Apply the assembler; the result keeps every original column and appends
# 'a_vector'. The schema printout confirms the new vector-typed column.
df_spark_vector = assembler.transform(df_spark)
df_spark_vector.printSchema()

root
 |-- a: double (nullable = true)
 |-- b: double (nullable = true)
 |-- c: double (nullable = true)
 |-- d: double (nullable = true)
 |-- label: double (nullable = true)
 |-- a_vector: vector (nullable = true)



In [31]:
# Preview the first five rows including the new vector column.
df_spark_vector.show(5)

+---+---+---+---+-----+--------+
|  a|  b|  c|  d|label|a_vector|
+---+---+---+---+-----+--------+
|5.1|3.5|1.4|0.2|  0.0|   [5.1]|
|4.9|3.0|1.4|0.2|  0.0|   [4.9]|
|4.7|3.2|1.3|0.2|  0.0|   [4.7]|
|4.6|3.1|1.5|0.2|  0.0|   [4.6]|
|5.0|3.6|1.4|0.2|  0.0|   [5.0]|
+---+---+---+---+-----+--------+
only showing top 5 rows



In [32]:
from pyspark.ml.feature import StandardScaler

In [34]:
# Configure standardisation of 'a_vector' into 'a_standard':
# withMean=True centres the values, withStd=True scales to unit standard deviation.
scaler = StandardScaler(inputCol = 'a_vector', outputCol = 'a_standard', withMean = True, withStd= True )

In [35]:
# Learn the scaling statistics from the data.
# NOTE(review): this is fitted on the full dataset, before the train/test split
# performed further down, so held-out rows influence the scaling statistics.
scaler_model = scaler.fit(df_spark_vector)

In [36]:
# Apply the fitted scaler, appending the standardised 'a_standard' column.
df_scaler = scaler_model.transform(df_spark_vector)

In [37]:
# Preview five rows; truncate=False prints the full-precision scaled values.
df_scaler.show(5, truncate=False)

+---+---+---+---+-----+--------+---------------------+
|a  |b  |c  |d  |label|a_vector|a_standard           |
+---+---+---+---+-----+--------+---------------------+
|5.1|3.5|1.4|0.2|0.0  |[5.1]   |[-0.8976738791967643]|
|4.9|3.0|1.4|0.2|0.0  |[4.9]   |[-1.1392004834649512]|
|4.7|3.2|1.3|0.2|0.0  |[4.7]   |[-1.3807270877331392]|
|4.6|3.1|1.5|0.2|0.0  |[4.6]   |[-1.5014903898672336]|
|5.0|3.6|1.4|0.2|0.0  |[5.0]   |[-1.0184371813308577]|
+---+---+---+---+-----+--------+---------------------+
only showing top 5 rows



In [38]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

In [39]:
def convert_bin(data, threshold=1.5):
    """Binarise a numeric value against a threshold.

    Parameters
    ----------
    data : float or None
        The value to classify. ``None`` stands for a missing (null) cell.
    threshold : float, optional
        Cut-off used for the comparison. Defaults to 1.5, the value that was
        previously hard-coded, so single-argument callers behave as before.

    Returns
    -------
    float or None
        ``1.0`` when ``data >= threshold``, ``0.0`` otherwise, and ``None``
        when ``data`` is ``None``.
    """
    # Comparing None with a number raises TypeError on Python 3, which would
    # abort the whole job when this function runs as a UDF over a null cell.
    # Propagate the missing value instead.
    if data is None:
        return None
    return 1.0 if data >= threshold else 0.0

In [40]:
# Wrap convert_bin as a Spark UDF whose declared return type is double,
# matching the 1.0 / 0.0 floats the function returns.
convert_udf = udf(convert_bin, DoubleType())

In [41]:
# Derive 'c_bins' by applying the binarising UDF to column 'c'.
df_scaler_udf = df_scaler.withColumn('c_bins', convert_udf(df_scaler['c']))

In [47]:
# Preview five rows to check the new 'c_bins' column against 'c'.
df_scaler_udf.show(5, truncate=False)

+---+---+---+---+-----+--------+---------------------+------+
|a  |b  |c  |d  |label|a_vector|a_standard           |c_bins|
+---+---+---+---+-----+--------+---------------------+------+
|5.1|3.5|1.4|0.2|0.0  |[5.1]   |[-0.8976738791967643]|0.0   |
|4.9|3.0|1.4|0.2|0.0  |[4.9]   |[-1.1392004834649512]|0.0   |
|4.7|3.2|1.3|0.2|0.0  |[4.7]   |[-1.3807270877331392]|0.0   |
|4.6|3.1|1.5|0.2|0.0  |[4.6]   |[-1.5014903898672336]|1.0   |
|5.0|3.6|1.4|0.2|0.0  |[5.0]   |[-1.0184371813308577]|0.0   |
+---+---+---+---+-----+--------+---------------------+------+
only showing top 5 rows



In [49]:
# Build the model input: the four raw columns, the standardised 'a' (a vector
# column, flattened into the output) and the binary 'c_bins' flag.
# NOTE(review): 'a' and 'a_standard' carry the same information at different
# scales; confirm that including both is intentional.
assembler_ml = VectorAssembler(inputCols = ['a', 'b', 'c', 'd', 'a_standard', 'c_bins'], outputCol= 'features')

In [50]:
# Append the assembled 'features' vector column to the engineered frame.
df_train = assembler_ml.transform(df_scaler_udf)

In [51]:
# Preview five rows, untruncated so the whole feature vector is visible.
df_train.show(5, truncate= False)

+---+---+---+---+-----+--------+---------------------+------+-----------------------------------------+
|a  |b  |c  |d  |label|a_vector|a_standard           |c_bins|features                                 |
+---+---+---+---+-----+--------+---------------------+------+-----------------------------------------+
|5.1|3.5|1.4|0.2|0.0  |[5.1]   |[-0.8976738791967643]|0.0   |[5.1,3.5,1.4,0.2,-0.8976738791967643,0.0]|
|4.9|3.0|1.4|0.2|0.0  |[4.9]   |[-1.1392004834649512]|0.0   |[4.9,3.0,1.4,0.2,-1.1392004834649512,0.0]|
|4.7|3.2|1.3|0.2|0.0  |[4.7]   |[-1.3807270877331392]|0.0   |[4.7,3.2,1.3,0.2,-1.3807270877331392,0.0]|
|4.6|3.1|1.5|0.2|0.0  |[4.6]   |[-1.5014903898672336]|1.0   |[4.6,3.1,1.5,0.2,-1.5014903898672336,1.0]|
|5.0|3.6|1.4|0.2|0.0  |[5.0]   |[-1.0184371813308577]|0.0   |[5.0,3.6,1.4,0.2,-1.0184371813308577,0.0]|
+---+---+---+---+-----+--------+---------------------+------+-----------------------------------------+
only showing top 5 rows



In [52]:
# Keep only the two columns the estimator consumes.
df_train_selected = df_train.select('features', 'label')

In [54]:
# Hold out roughly 20% of the rows for evaluation; randomSplit treats the
# weights as sampling proportions, so the resulting sizes are approximate.
# A fixed seed makes the split, and every metric derived from it, repeatable
# across re-runs; without it each execution produces a different partition.
train_data, test_data = df_train_selected.randomSplit([0.8, 0.2], seed=42)

In [55]:
# Number of rows that landed in the training split.
train_data.count()

130

In [56]:
# Number of rows that landed in the held-out split.
test_data.count()

20

In [61]:
from pyspark.ml.classification import LogisticRegression
# Elastic-net regularised logistic regression: at most 50 iterations,
# regularisation strength 0.3, and elasticNetParam=0.8 mixing the L1 and L2
# penalties. No featuresCol/labelCol is passed, so the estimator's default
# column names are used.
lr = LogisticRegression(maxIter=50, regParam = 0.3, elasticNetParam=0.8)

In [62]:
# Fit the classifier on the training split only.
lr_model = lr.fit(train_data)

In [63]:
# Score the held-out rows; the result gains rawPrediction, probability and
# prediction columns alongside the original features and label.
pred = lr_model.transform(test_data)

In [65]:
# Inspect five scored rows without truncating the vector columns.
pred.show(5, truncate = False)

+-----------------------------------------+-----+----------------------------------------------------------------+-----------------------------------------------------------+----------+
|features                                 |label|rawPrediction                                                   |probability                                                |prediction|
+-----------------------------------------+-----+----------------------------------------------------------------+-----------------------------------------------------------+----------+
|[4.4,3.0,1.3,0.2,-1.7430169941354205,0.0]|0.0  |[0.553290699802792,-0.22434544395498854,-0.5918681350258309]    |[0.5625359150491913,0.2584801333091901,0.17898395164161865]|0.0       |
|[4.9,3.0,1.4,0.2,-1.1392004834649512,0.0]|0.0  |[0.5314930920943604,-0.22434544395498854,-0.5918681350258309]   |[0.557164648832679,0.2616538009437328,0.18118155022358817] |0.0       |
|[5.0,3.5,1.3,0.3,-1.0184371813308577,0.0]|0.0  |[0.5245005712700129,-