In [1]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession


## Linear Regression

In [2]:
## starting spark session

In [3]:
spark = SparkSession.builder.appName('Logistic').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/11/18 21:23:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# import linear regression library

from pyspark.ml.regression import LinearRegression


In [6]:
data = spark.read.csv('diabetes.csv', header=True, inferSchema= True)
data.columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome',
 '_c9',
 '_c10']

In [7]:
data = data.select(['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome'])
data.show()

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          5|    116|           74|            0|      0|25.6|                   0.201| 30|      0|
|          3|     78|           50|           32|     88|31.0|                   0.248| 26|      1|


In [10]:
### assembling data for features

from pyspark.ml.feature import VectorAssembler


In [16]:
assembler = VectorAssembler(inputCols=['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome'], outputCol= 'features')

In [21]:
assembler_output = assembler.transform(data)

In [22]:
## splitting data for training and testing:

train, test = assembler_output.randomSplit([0.7,0.3])

In [42]:
### creating instance of linear

lr = LinearRegression(featuresCol='features', predictionCol='prediction',labelCol='Outcome')

In [43]:
training_model = lr.fit(train)

22/11/18 21:43:56 WARN Instrumentation: [e7a6b8ab] regParam is zero, which might cause numerical instability and overfitting.


In [44]:
test_results = training_model.evaluate(test)

In [45]:
predictions = training_model.transform(test)

In [46]:
predictions.columns

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome',
 'features',
 'prediction']

In [47]:
predictions.select(['Outcome','features','prediction']).show()

+-------+--------------------+--------------------+
|Outcome|            features|          prediction|
+-------+--------------------+--------------------+
|      0|[0.0,67.0,76.0,0....|7.947283997090007...|
|      0|[0.0,84.0,82.0,31...|-1.38858315980016...|
|      0|[0.0,86.0,68.0,32...|-1.80962276301640...|
|      0|[0.0,91.0,80.0,0....|-8.86767662138793...|
|      0|[0.0,100.0,70.0,2...|-1.92183125683275...|
|      0|[0.0,101.0,62.0,0...|-3.76441768744969...|
|      0|[0.0,102.0,86.0,1...|-8.83401309370951...|
|      0|[0.0,104.0,64.0,2...|-2.92615378224861...|
|      0|[0.0,105.0,64.0,4...|-8.50634289437921...|
|      0|[0.0,106.0,70.0,3...|-3.56740960232138...|
|      0|[0.0,107.0,60.0,2...|-3.10167934181041...|
|      0|[0.0,111.0,65.0,0...|-1.72642097326599...|
|      0|[0.0,117.0,80.0,3...|1.875528421591950...|
|      1|[0.0,119.0,0.0,0....|  0.9999999999999994|
|      0|[0.0,119.0,64.0,1...|-3.30474115731117...|
|      0|[0.0,123.0,88.0,3...|1.519736037248092...|
|      0|[0.

In [52]:
### evaluation of model



AttributeError: 'LinearRegressionModel' object has no attribute '_jdf'

In [57]:
training_model.coefficients

DenseVector([-0.0, 0.0, 0.0, -0.0, -0.0, 0.0, 0.0, 0.0, 1.0])

In [58]:
training_model.intercept

-1.918697960944907e-15

In [60]:
summary = training_model.summary

In [61]:
summary.residuals.show()

+--------------------+
|           residuals|
+--------------------+
|2.963395234916412...|
|9.01246453478582E-16|
|5.207224198493939...|
|1.139026961420308...|
|2.359879408427425...|
|1.550523457377119...|
|1.916169025316746...|
|3.472656164123637...|
|-3.12027025696706...|
|1.186992989604800...|
|4.833123429733153...|
|4.350091029779366...|
|7.382785709523493...|
|1.110223024625156...|
|1.264387957597854...|
|2.821813749146051...|
|7.14592800272112E-16|
|-2.92365494189167...|
|4.209245405609506...|
|3.436589185695711...|
+--------------------+
only showing top 20 rows

