In [25]:
from pyspark.sql import SparkSession 


In [75]:
spark = SparkSession.builder.master("local").appName("pyspark-mllib-lr").getOrCreate()
spark.sparkContext

In [76]:
from pyspark.ml.classification import LogisticRegression

# binary classification in pySpark
* 共有九個變數 (八個自變數，一個應變數)
* Pregnancies: 懷孕次數
* Glucose: 葡萄糖
* BloodPressure:血壓
* SkinThickness: 皮膚厚度 
* Insulin: 胰島素
* BMI: 來衡量肥胖程度，其計算公式是以體重（公斤）除以身高（公尺）的平方
* DiabetesPedigreeFunction: 糖尿病家族函數
* Age: 年紀 (歲)
* Outcome: 類別結果 (0或1)

In [77]:
# Load raw data
raw_data = spark.read.format("csv").option("header","true").option("inferSchema", "true").load(r"/home/jovyan/dataset/diabetes.csv")
raw_data.columns


['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [78]:
raw_data.describe().select("Summary","Pregnancies","Glucose","BloodPressure").show()

+-------+------------------+-----------------+------------------+
|Summary|       Pregnancies|          Glucose|     BloodPressure|
+-------+------------------+-----------------+------------------+
|  count|               768|              768|               768|
|   mean|3.8450520833333335|     120.89453125|       69.10546875|
| stddev|  3.36957806269887|31.97261819513622|19.355807170644777|
|    min|                 0|                0|                 0|
|    max|                17|              199|               122|
+-------+------------------+-----------------+------------------+



In [79]:
raw_data.describe().select("Summary","SkinThickness","Insulin").show()

+-------+------------------+------------------+
|Summary|     SkinThickness|           Insulin|
+-------+------------------+------------------+
|  count|               768|               768|
|   mean|20.536458333333332| 79.79947916666667|
| stddev|15.952217567727642|115.24400235133803|
|    min|                 0|                 0|
|    max|                99|               846|
+-------+------------------+------------------+



In [80]:
raw_data.describe().select("Summary","BMI","DiabetesPedigreeFunction","Age").show()

+-------+------------------+------------------------+------------------+
|Summary|               BMI|DiabetesPedigreeFunction|               Age|
+-------+------------------+------------------------+------------------+
|  count|               768|                     768|               768|
|   mean|31.992578124999977|      0.4718763020833327|33.240885416666664|
| stddev| 7.884160320375441|       0.331328595012775|11.760231540678689|
|    min|               0.0|                   0.078|                21|
|    max|              67.1|                    2.42|                81|
+-------+------------------+------------------------+------------------+



# 發現有一些欄位的最小值為 0 
## 除了 Pregnancies 之外，其他的不切實際
* Pregnancies
* Glucose
* BloodPressure
* SkinThickness
* Insulin
* BMI

In [81]:
#將 min為0的值，改為 Nan
import numpy as np
from pyspark.sql.functions import when

In [82]:
raw_data=raw_data.withColumn("Glucose",when(raw_data.Glucose==0, np.nan).otherwise(raw_data.Glucose))

In [83]:
raw_data=raw_data.withColumn("BloodPressure",when(raw_data.BloodPressure==0, np.nan).otherwise(raw_data.BloodPressure))

In [84]:
raw_data=raw_data.withColumn("SkinThickness",when(raw_data.SkinThickness==0, np.nan).otherwise(raw_data.SkinThickness))

In [85]:
raw_data=raw_data.withColumn("Insulin",when(raw_data.Insulin==0, np.nan).otherwise(raw_data.Insulin))

In [86]:
raw_data=raw_data.withColumn("BMI",when(raw_data.BMI==0, np.nan).otherwise(raw_data.BMI))

In [87]:
raw_data.describe().select("Summary","Glucose","BloodPressure","SkinThickness","BMI","Insulin").show()

+-------+-------+-------------+-------------+----+-------+
|Summary|Glucose|BloodPressure|SkinThickness| BMI|Insulin|
+-------+-------+-------------+-------------+----+-------+
|  count|    768|          768|          768| 768|    768|
|   mean|    NaN|          NaN|          NaN| NaN|    NaN|
| stddev|    NaN|          NaN|          NaN| NaN|    NaN|
|    min|   44.0|         24.0|          7.0|18.2|   14.0|
|    max|    NaN|          NaN|          NaN| NaN|    NaN|
+-------+-------+-------------+-------------+----+-------+



In [88]:
from pyspark.ml.feature import Imputer

In [89]:
#預設有三種方式填充遺缺值 (平均值(mean)、中位數(median))
imputer=Imputer(inputCols=["Glucose","BloodPressure","SkinThickness","BMI","Insulin"],
                outputCols=["Glucose","BloodPressure","SkinThickness","BMI","Insulin"]).setStrategy("median")

In [90]:
model=imputer.fit(raw_data)
raw_data=model.transform(raw_data)
raw_data.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|  148.0|         72.0|         35.0|  125.0|33.6|                   0.627| 50|      1|
|          1|   85.0|         66.0|         29.0|  125.0|26.6|                   0.351| 31|      0|
|          8|  183.0|         64.0|         29.0|  125.0|23.3|                   0.672| 32|      1|
|          1|   89.0|         66.0|         23.0|   94.0|28.1|                   0.167| 21|      0|
|          0|  137.0|         40.0|         35.0|  168.0|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [91]:
cols=raw_data.columns
cols.remove("Outcome")

In [94]:
#將資料轉成一維向量
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=cols, outputCol="features")

In [93]:
raw_data=assembler.transform(raw_data)
raw_data.select("features").show(truncate=False)

+--------------------------------------------+
|features                                    |
+--------------------------------------------+
|[6.0,148.0,72.0,35.0,125.0,33.6,0.627,50.0] |
|[1.0,85.0,66.0,29.0,125.0,26.6,0.351,31.0]  |
|[8.0,183.0,64.0,29.0,125.0,23.3,0.672,32.0] |
|[1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0]   |
|[0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0] |
|[5.0,116.0,74.0,29.0,125.0,25.6,0.201,30.0] |
|[3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0]   |
|[10.0,115.0,72.0,29.0,125.0,35.3,0.134,29.0]|
|[2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0] |
|[8.0,125.0,96.0,29.0,125.0,32.3,0.232,54.0] |
|[4.0,110.0,92.0,29.0,125.0,37.6,0.191,30.0] |
|[10.0,168.0,74.0,29.0,125.0,38.0,0.537,34.0]|
|[10.0,139.0,80.0,29.0,125.0,27.1,1.441,57.0]|
|[1.0,189.0,60.0,23.0,846.0,30.1,0.398,59.0] |
|[5.0,166.0,72.0,19.0,175.0,25.8,0.587,51.0] |
|[7.0,100.0,72.0,29.0,125.0,30.0,0.484,32.0] |
|[0.0,118.0,84.0,47.0,230.0,45.8,0.551,31.0] |
|[7.0,107.0,74.0,29.0,125.0,29.6,0.254,31.0] |
|[1.0,103.0,3

# Standard scalarizer

In [95]:
from pyspark.ml.feature import StandardScaler

In [98]:
standardscaler=StandardScaler().setInputCol("features").setOutputCol("Scaled_features")

In [99]:
raw_data=standardscaler.fit(raw_data).transform(raw_data)

In [102]:
# 建立一個新欄位，將既有的 features 欄位 Standard scalarizer 後，產生一個新的欄位 Scaled_features
raw_data.select("features","Scaled_features").show(5)

+--------------------+--------------------+
|            features|     Scaled_features|
+--------------------+--------------------+
|[6.0,148.0,72.0,3...|[1.78063837321943...|
|[1.0,85.0,66.0,29...|[0.29677306220323...|
|[8.0,183.0,64.0,2...|[2.37418449762590...|
|[1.0,89.0,66.0,23...|[0.29677306220323...|
|[0.0,137.0,40.0,3...|[0.0,4.5009104914...|
+--------------------+--------------------+
only showing top 5 rows



# 建立訓練集(80%)、測試集(20%)

In [103]:
train, test = raw_data.randomSplit([0.8,0.2], seed=123456)

# 檢查是否有imbalance的問題

In [111]:
#檢查是否有imbalance的問題
dataset_size=float(train.select("Outcome").count())
numPositives=train.select("Outcome").where("Outcome==1").count()
per_ones=(float(numPositives)/float(dataset_size))*100
numNegatives=float(dataset_size-numPositives)
print('為1: {}'.format(numPositives))
print('為0: {}'.format(numNegatives))
print('為1的百分比: {}'.format(per_ones))

為1: 221
為0: 402.0
為1的百分比: 35.47351524879615


In [113]:
# 的確有 imbalance的問題，為1的比例為35.47%，因此我們可以將為1的資料給予一個較大的權重
# BalancingRatio=numNegatives/dataset_size

BalancingRation=numNegatives/dataset_size
print('BalancingRatio={}'.format(BalancingRation))

BalancingRatio=0.6452648475120385


In [115]:
#為1: 我們放入classWeights，內容為BalancingRatio
#為0: 我們放入classWeights，內容為1-BalancingRatio
train=train.withColumn("classWeights",when(train.Outcome==1, BalancingRation).otherwise(1-BalancingRation))
train.select("classWeights").show(5)

+------------------+
|      classWeights|
+------------------+
|0.3547351524879615|
|0.3547351524879615|
|0.3547351524879615|
|0.3547351524879615|
|0.3547351524879615|
+------------------+
only showing top 5 rows



# Feature selection (採用 ChiSqSelector)

In [132]:
#Spark MLlib 具有三種方法找特徵值 (VectorSlicer/RFormula/ChiSqSelector:(Chi-Squared feature selection))
#Chi-Squared feature selection: 適用於目標值為類別型變數

from pyspark.ml.feature import ChiSqSelector

In [119]:
#fpr chooses all features whose p-values are below a threshold, thus controlling the false positive rate of selection.
css=ChiSqSelector(featuresCol='Scaled_features',outputCol='Aspect',labelCol='Outcome',fpr=0.05)

In [121]:
train=css.fit(train).transform(train)

In [122]:
test=css.fit(test).transform(test)

In [123]:
test.select("Aspect").show(5,truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------+
|Aspect                                                                                                                                 |
+---------------------------------------------------------------------------------------------------------------------------------------+
|[0.0,2.2011752038500245,6.282735462564321,3.2987454102033076,1.4470429786115504,6.588921448662343,0.5855214518762558,3.911487613223074]|
|[0.0,2.7596823451254036,5.290724600054165,2.502496518085268,0.7640386927068986,5.207138804903132,1.6448927385183476,1.785679127775751] |
|[0.0,2.989655873885854,5.62139488755755,3.639994935396753,2.431032204067405,5.803487103788687,1.1499158410559458,2.125808485447323]    |
|[0.0,3.1210693188918257,6.613405750067706,5.118742877901684,1.065023632258101,5.3089543681274955,0.995990098552394,2.210840824865216]  |
|[0.0,3.252482763897797,5.95206517

# 建立羅吉斯回歸分類演算法

In [124]:
from pyspark.ml.classification import LogisticRegression

In [128]:
lr = LogisticRegression(labelCol="Outcome", featuresCol="Aspect", weightCol="classWeights", maxIter=10)

In [129]:
model = lr.fit(train)

In [130]:
predict_train=model.transform(train)
predict_test=model.transform(test)

In [131]:
predict_test.select("Outcome","prediction").show()

+-------+----------+
|Outcome|prediction|
+-------+----------+
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       0.0|
|      0|       1.0|
|      1|       0.0|
|      1|       0.0|
|      0|       0.0|
|      1|       1.0|
|      1|       1.0|
|      0|       0.0|
|      0|       1.0|
|      1|       1.0|
|      0|       1.0|
+-------+----------+
only showing top 20 rows



# 模型評估
*預設用ROC評估

In [137]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [141]:
evaluator=BinaryClassificationEvaluator(rawPredictionCol="rawPrediction",labelCol="Outcome")

In [143]:
predict_test.select("Outcome","rawPrediction","prediction","probability").show(5)

+-------+--------------------+----------+--------------------+
|Outcome|       rawPrediction|prediction|         probability|
+-------+--------------------+----------+--------------------+
|      0|[1.93251831726415...|       0.0|[0.87352789759305...|
|      0|[1.58456515697427...|       0.0|[0.82985008420462...|
|      0|[1.49187237243758...|       0.0|[0.81635913912982...|
|      0|[1.88417237871225...|       0.0|[0.86808963982654...|
|      0|[2.87706309674848...|       0.0|[0.94670086661773...|
+-------+--------------------+----------+--------------------+
only showing top 5 rows



In [146]:
print("The area under ROC for training set is {}".format(evaluator.evaluate(predict_train)))

The area under ROC for training set is 0.8351005155219385


In [147]:
print("The area under ROC for testing set is {}".format(evaluator.evaluate(predict_test)))

The area under ROC for testing set is 0.8712548849326966


# 超參數
* 模型優化(model tuning)

* aggregationDepth: 調整層次深度，該值越大，表示能更快處理資料(>=2的整數，預設為2)
* elasticNetParam: 正規化調整計算參數(0~1之間，預設為：0)
* family: 採用分類方式 (二元分類)
* featuresCol: 輸入訓練集、測試集之變數向量(預設為features)
* fitIntercept: 是否使用帶截距的回歸。(預設為true)
* labelCol: 目標欄位(預設為label)
* maxIter: 迭代算法最大迭代次數，搭配tol值一起設定的結束條件(預設為100)
* tol: 收斂容忍係數(convergence tolerance)，算法每次迭代後的比較阈值以確定算法是否結束，值越小，执行的迭代次數就越多(預設值1.0E-6)
* predictionCol: 輸出結果資料中最終判別類別名稱(預設值為prediction)
* probabilityCol: 輸出結果資料之Softmax函數機率值的名稱(預設為probability)
* rawPredictionCol: 輸出結果資料之回歸應變數名稱(預設為rawPrediction)
* regParam: 正規化懲罰程度參數(預設為0) 
* standardization: 是否在回歸前對自變數進行標準化處理(預設為True)
* threshold: 二元分類演算法之中根據該值將計算得到之邏輯函數機率值對應於分類類別中(預設為5)
* thresholds: 多元分類演算法中根據該組數值將計算後得到的Softmax函數機率對應於多元分類中(無預設值)
* weightCol: 輸入訓練集權重的名稱，如果輸入的訓練集包含權重，則採用帶權重的回歸模型，如果不包含，相當於全重是1.0 (無預設值)

In [148]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [152]:
#設計一個超參數grid
paramGrid=ParamGridBuilder() \
.addGrid(lr.aggregationDepth,[2,5,10]) \
.addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0]) \
.addGrid(lr.fitIntercept,[False, True]) \
.addGrid(lr.maxIter,[10,100,1000]) \
.addGrid(lr.regParam,[0.01, 0.5, 2.0]) \
.build()

# K-fold cross validation

In [155]:
#建立一個 5-fold CrossValidator
cv=CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

In [None]:
cvModel=cv.fit(train)

Exception ignored in: <function JavaWrapper.__del__ at 0x7f3d85cc11e0>
Traceback (most recent call last):
  File "/usr/local/spark/python/pyspark/ml/wrapper.py", line 40, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'LogisticRegression' object has no attribute '_java_obj'
Exception ignored in: <function JavaWrapper.__del__ at 0x7f3d85cc11e0>
Traceback (most recent call last):
  File "/usr/local/spark/python/pyspark/ml/wrapper.py", line 40, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
AttributeError: 'LogisticRegression' object has no attribute '_java_obj'


In [None]:
predict_train=cvModel.transform(train)
predict_test=cvModel.transform(test)
print("訓練資料集之ROC，經過CV後:{}".format(evaluate(predict_train)))
print("測試資料集之ROC，經過CV後:{}".format(evaluate(predict_test)))