**Easy linear regression**

In [1]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every later cell (getOrCreate reuses an
# already-running session instead of starting a second one).
spark = (
    SparkSession.builder
    .appName("Regression")
    .getOrCreate()
)

# load csv
# read.csv is called without any options, so the columns are renamed
# explicitly and then cast to a numeric type below.
new_name = ["explaining", "label"]
reg_raw = spark.read.csv("../data/regression.csv")
reg_named = reg_raw.toDF(*new_name)

# Cast every column to float, keeping each column's name via alias().
float_columns = [reg_named[name].cast("float").alias(name) for name in reg_named.columns]
reg_data = reg_named.select(float_columns)
reg_data.show()

+----------+-----+
|explaining|label|
+----------+-----+
|       1.0|  5.0|
|       2.0| 10.0|
|       3.0| 15.0|
+----------+-----+



**1. Fit a linear regression model to `reg_data`, using the column 'explaining' as the explanatory variable and the column 'label' as the response (explained) variable.**

In [2]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Wrap the single explanatory column into a vector column named "features".
# No featuresCol/labelCol is passed to LinearRegression below, so it relies
# on the estimator's default column names matching "features" and "label".
assembler = VectorAssembler(inputCols=["explaining"], outputCol="features")
transformed_data = assembler.transform(reg_data)

# Hyper-parameters of the regularised linear regression, gathered in one place.
linreg_params = {
    "maxIter": 10,
    "regParam": 0.3,
    "elasticNetParam": 0.8,
}
lr = LinearRegression(**linreg_params)

# Fit the model
lrModel = lr.fit(transformed_data)

**2. Using the model you fitted above, make predictions on the data you used for training.**

In [3]:
# Score the same rows the model was trained on; transform() returns the input
# columns plus a "prediction" column.
train_predictions = lrModel.transform(transformed_data)
train_predictions.show()

+----------+-----+--------+------------------+
|explaining|label|features|        prediction|
+----------+-----+--------+------------------+
|       1.0|  5.0|   [1.0]| 5.362101675379325|
|       2.0| 10.0|   [2.0]|              10.0|
|       3.0| 15.0|   [3.0]|14.637898324620675|
+----------+-----+--------+------------------+



**Logistic regression with the iris dataset**

**3. Build a logistic regression model for the iris data with standardization and proper pre-processing, without using Pipeline. The iris data must be split into train and test sets. Make predictions on the test data. As a first trial, it is okay to split the data into train and test sets just before model training.**

In [4]:
# load iris
new_name = ["sepal_length", "sepal_width", "petal_length", "petal_width", "species"]
iris_named = spark.read.csv("../data/iris.csv").toDF(*new_name)

# Every column except the last (species) is cast to float; species is passed
# through unchanged as the final column.
measurement_columns = [
    iris_named[name].cast("float").alias(name)
    for name in iris_named.columns[:-1]
]
iris_data = iris_named.select(measurement_columns + [iris_named["species"]])
iris_data.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.1|        3.5|         1.4|        0.2| setosa|
|         4.9|        3.0|         1.4|        0.2| setosa|
|         4.7|        3.2|         1.3|        0.2| setosa|
|         4.6|        3.1|         1.5|        0.2| setosa|
|         5.0|        3.6|         1.4|        0.2| setosa|
|         5.4|        3.9|         1.7|        0.4| setosa|
|         4.6|        3.4|         1.4|        0.3| setosa|
|         5.0|        3.4|         1.5|        0.2| setosa|
|         4.4|        2.9|         1.4|        0.2| setosa|
|         4.9|        3.1|         1.5|        0.1| setosa|
|         5.4|        3.7|         1.5|        0.2| setosa|
|         4.8|        3.4|         1.6|        0.2| setosa|
|         4.8|        3.0|         1.4|        0.1| setosa|
|         4.3|        3.0|         1.1| 

In [5]:
# Combine the four measurement columns into one vector column. It is called
# "raw_features" because the scaled version produced later takes the name
# "features".
iris_feature_columns = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
assembler_lr = VectorAssembler(
    inputCols=iris_feature_columns,
    outputCol="raw_features",
)

assembled_iris = assembler_lr.transform(iris_data)

In [6]:
from pyspark.ml.feature import StringIndexer, StandardScaler

# Standardise the assembled vector (centre and scale) into "features".
# NOTE: the scaler is fitted on ALL rows, i.e. before the train/test split
# performed in the next cell -- the shortcut the exercise text allows for a
# first trial.
scaler = StandardScaler(
    inputCol="raw_features",
    outputCol="features",
    withStd=True,
    withMean=True,
)
scaler_model = scaler.fit(assembled_iris)
scaled_iris = scaler_model.transform(assembled_iris)

# Encode the species string as a numeric "label" column for the classifier.
string_indexer = StringIndexer(inputCol="species", outputCol="label")
indexer_model = string_indexer.fit(scaled_iris)
string_species_iris = indexer_model.transform(scaled_iris)

In [7]:
# train test split
# Weights 0.8 / 0.2 for train / test; the fixed seed is passed so the split
# does not change between runs.
split_weights = [0.8, 0.2]
train, test = string_species_iris.randomSplit(split_weights, seed=1234)

train.show()

+------------+-----------+------------+-----------+----------+--------------------+--------------------+-----+
|sepal_length|sepal_width|petal_length|petal_width|   species|        raw_features|            features|label|
+------------+-----------+------------+-----------+----------+--------------------+--------------------+-----+
|         4.4|        3.2|         1.3|        0.2|    setosa|[4.40000009536743...|[-1.7430169031234...|  2.0|
|         4.5|        2.3|         1.3|        0.3|    setosa|[4.5,2.2999999523...|[-1.6222537139525...|  2.0|
|         4.6|        3.1|         1.5|        0.2|    setosa|[4.59999990463256...|[-1.5014905247816...|  2.0|
|         4.6|        3.2|         1.4|        0.2|    setosa|[4.59999990463256...|[-1.5014905247816...|  2.0|
|         4.6|        3.4|         1.4|        0.3|    setosa|[4.59999990463256...|[-1.5014905247816...|  2.0|
|         4.6|        3.6|         1.0|        0.2|    setosa|[4.59999990463256...|[-1.5014905247816...|  2.0|
|

In [8]:
from pyspark.ml.classification import LogisticRegression

# NOTE: this rebinds `lr`, which previously held the LinearRegression
# estimator; from here on `lr` is the logistic regression estimator.
logreg_params = {
    "maxIter": 10,
    "regParam": 0.3,
    "elasticNetParam": 0.8,
}
lr = LogisticRegression(**logreg_params)

# Fit on the training split, score the held-out split, and show predicted
# versus true label side by side.
logreg_model = lr.fit(train)
test_predictions = logreg_model.transform(test)
test_predictions.select(["prediction", "label"]).show()

+----------+-----+
|prediction|label|
+----------+-----+
|       2.0|  2.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       1.0|  0.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  0.0|
|       1.0|  0.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
+----------+-----+
only showing top 20 rows



**4. Wrap the logistic regression workflow above in a Pipeline.**

In [9]:
from pyspark.ml import Pipeline
# Split the raw iris rows this time: the pre-processing stages are part of the
# pipeline, so they are fitted on the training split only.
# NOTE: this rebinds `train` and `test` from the earlier manual split.
train, test = iris_data.randomSplit([0.8, 0.2], seed=1234)

# Same steps as the manual flow, in the same order: assemble -> scale ->
# index labels -> classify. `lr` is whatever estimator was last bound to that
# name (the LogisticRegression from In [8] when the cells are run in order).
pipeline_stages = [assembler_lr, scaler, string_indexer, lr]
pipeline = Pipeline(stages=pipeline_stages)

model_lr = pipeline.fit(train)
pipeline_predictions = model_lr.transform(test)
pipeline_predictions.select(["prediction", "label"]).show()

+----------+-----+
|prediction|label|
+----------+-----+
|       2.0|  2.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       1.0|  0.0|
|       2.0|  2.0|
|       2.0|  2.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  0.0|
|       1.0|  0.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
+----------+-----+
only showing top 20 rows

