In [1]:
# spark.conf.set("spark.sql.repl.eagerEval.enabled", True) #--> not working

from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

# TASK 2: Clone & Explore dataset

In [2]:
#clone the diabetes dataset from the github repository

# ! git clone https://github.com/education454/diabetes_dataset

In [5]:
#check if the dataset exists
# ! ls diabetes_dataset/

In [4]:
#create spark dataframe
df = spark.read.csv(f"gs://pyspark-cluster-bucket1/notebooks/jupyter/diabetes.csv", header=True, inferSchema=True)

                                                                                

In [6]:
#display the dataframe

df.show(5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          2|    138|           62|           35|      0|33.6|                   0.127| 47|      1|
|          0|     84|           82|           31|    125|38.2|                   0.233| 23|      0|
|          0|    145|            0|            0|      0|44.2|                    0.63| 31|      1|
|          0|    135|           68|           42|    250|42.3|                   0.365| 24|      1|
|          1|    139|           62|           41|    480|40.7|                   0.536| 21|      0|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



                                                                                

In [7]:
#print the schema
df.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [8]:
#count the total no. of diabetic and non-diabetic class
df.groupBy('outcome').count().show()

[Stage 3:>                                                          (0 + 1) / 1]

+-------+-----+
|outcome|count|
+-------+-----+
|      1|  684|
|      0| 1316|
+-------+-----+



                                                                                

In [9]:
#get the summary statistics

df.describe().show()

                                                                                

+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+
|summary|      Pregnancies|           Glucose|     BloodPressure|    SkinThickness|          Insulin|               BMI|DiabetesPedigreeFunction|               Age|           Outcome|
+-------+-----------------+------------------+------------------+-----------------+-----------------+------------------+------------------------+------------------+------------------+
|  count|             2000|              2000|              2000|             2000|             2000|              2000|                    2000|              2000|              2000|
|   mean|           3.7035|          121.1825|           69.1455|           20.935|           80.254|32.192999999999984|     0.47092999999999974|           33.0905|             0.342|
| stddev|3.306063032730656|32.068635649902916|19.188314815604098|16.103242909926

# TASK 3: Data Cleaning & Preparation

In [10]:
[x for x in df.columns]

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

In [11]:
#check for null values

for col in df.columns:
    print(col, " : ", df[df[col].isNull()].count() )

Pregnancies  :  0
Glucose  :  0
BloodPressure  :  0
SkinThickness  :  0
Insulin  :  0
BMI  :  0
DiabetesPedigreeFunction  :  0
Age  :  0
Outcome  :  0


In [12]:
#look for the unnecessary values present
columns_list = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

def count_zeros():
    for i in columns_list:
        print(i, " : ", df[df[i]==0].count())


In [13]:
count_zeros()

Glucose  :  13
BloodPressure  :  90
SkinThickness  :  573
Insulin  :  956
BMI  :  28


In [14]:
df.agg({'glucose':'mean'}).first()[0]   #head()[0]   #collect()[0][0]

121.1825

In [15]:
#calculate and replace the unnecessary values by the mean value

from pyspark.sql.functions import *

for i in df.columns[1:6]:
    data = df.agg({i:'mean'}).first()[0]
    print("mean value for {0} is {1}".format(i, data))
    
    df = df.withColumn(i, when(df[i]==0, int(data)).otherwise(df[i]))

mean value for Glucose is 121.1825
mean value for BloodPressure is 69.1455
mean value for SkinThickness is 20.935
mean value for Insulin is 80.254
mean value for BMI is 32.192999999999984


In [16]:
#display the dataframe 
df.show(10,0)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI |DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|2          |138    |62           |35           |80     |33.6|0.127                   |47 |1      |
|0          |84     |82           |31           |125    |38.2|0.233                   |23 |0      |
|0          |145    |69           |20           |80     |44.2|0.63                    |31 |1      |
|0          |135    |68           |42           |250    |42.3|0.365                   |24 |1      |
|1          |139    |62           |41           |480    |40.7|0.536                   |21 |0      |
|0          |173    |78           |32           |265    |46.5|1.159                   |58 |0      |
|4          |99     |72           |17           |80     |25.6|0.294                   |28 |0      |


# TASK 4: Correlation Analysis & Feature Selection

In [17]:
#find the correlation among the set of input & output variables

for i in df.columns:
    print("corelation outcome for {0} is {1}".format(i, df.stat.corr('Outcome', i)))

corelation outcome for Pregnancies is 0.22443699263363961
corelation outcome for Glucose is 0.48796646527321064
corelation outcome for BloodPressure is 0.17171333286446713
corelation outcome for SkinThickness is 0.1659010662889893
corelation outcome for Insulin is 0.1711763270226193
corelation outcome for BMI is 0.2827927569760082
corelation outcome for DiabetesPedigreeFunction is 0.1554590791569403
corelation outcome for Age is 0.23650924717620253
corelation outcome for Outcome is 1.0


In [18]:
#feature selection

from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age'], outputCol='features')

output_data = assembler.transform(df)

In [19]:
#print the schema
output_data.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)
 |-- features: vector (nullable = true)



In [20]:
#display dataframe

output_data.show(5,0)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-------------------------------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI |DiabetesPedigreeFunction|Age|Outcome|features                                   |
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+-------------------------------------------+
|2          |138    |62           |35           |80     |33.6|0.127                   |47 |1      |[2.0,138.0,62.0,35.0,80.0,33.6,0.127,47.0] |
|0          |84     |82           |31           |125    |38.2|0.233                   |23 |0      |[0.0,84.0,82.0,31.0,125.0,38.2,0.233,23.0] |
|0          |145    |69           |20           |80     |44.2|0.63                    |31 |1      |[0.0,145.0,69.0,20.0,80.0,44.2,0.63,31.0]  |
|0          |135    |68           |42           |250    |42.3|0.365                   |24 |1      |[0.0,135.0,68.0,42.0,250.0,42.3,0.365

# TASK 5: Split Dataset & Build the Model

In [21]:
#create final data

from pyspark.ml.classification import LogisticRegression

final_data = output_data.select('features', 'Outcome')

In [22]:
#print schema of final data

final_data.printSchema()

root
 |-- features: vector (nullable = true)
 |-- Outcome: integer (nullable = true)



In [23]:
#split the dataset ; build the model

train, test = final_data.randomSplit([0.7, 0.3])

models = LogisticRegression(labelCol='Outcome')

model = models.fit(train)

21/12/21 14:14:46 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
21/12/21 14:14:46 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [24]:
#summary of the model

summary = model.summary

summary.predictions.describe().show()

[Stage 126:>                                                        (0 + 1) / 1]

+-------+-------------------+------------------+
|summary|            Outcome|        prediction|
+-------+-------------------+------------------+
|  count|               1368|              1368|
|   mean|0.35014619883040937|0.2777777777777778|
| stddev|0.47718999692752506| 0.448067005358286|
|    min|                0.0|               0.0|
|    max|                1.0|               1.0|
+-------+-------------------+------------------+



                                                                                

# TASK 6: Evaluate and Save the Model

In [25]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

predictions = model.evaluate(test)

In [26]:
predictions.predictions.show(20,0)

+-------------------------------------------+-------+----------------------------------------+-----------------------------------------+----------+
|features                                   |Outcome|rawPrediction                           |probability                              |prediction|
+-------------------------------------------+-------+----------------------------------------+-----------------------------------------+----------+
|[0.0,57.0,60.0,20.0,80.0,21.7,0.735,67.0]  |0      |[4.279820630294866,-4.279820630294866]  |[0.9863439250776349,0.013656074922365091]|0.0       |
|[0.0,73.0,69.0,20.0,80.0,21.1,0.342,25.0]  |0      |[4.266051205536611,-4.266051205536611]  |[0.9861572091935069,0.013842790806493088]|0.0       |
|[0.0,74.0,52.0,10.0,36.0,27.8,0.269,22.0]  |0      |[3.6747923767759634,-3.6747923767759634]|[0.9752722936207449,0.024727706379255143]|0.0       |
|[0.0,78.0,88.0,29.0,40.0,36.9,0.434,21.0]  |0      |[2.7171857297811863,-2.7171857297811863]|[0.938033150283825

In [29]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='Outcome')

evaluator.evaluate(model.transform(test))

0.8349460215913632

In [45]:
# save model

# model.save(f"gs://pyspark-cluster-bucket1/notebooks/jupyter/model")

# model.save('model')

In [31]:
# load saved model back to the environment

from pyspark.ml.classification import LogisticRegressionModel

model = LogisticRegressionModel.load(f"gs://pyspark-cluster-bucket1/notebooks/jupyter/model")

                                                                                

In [32]:
model

LogisticRegressionModel: uid=LogisticRegression_0e35eeae21c9, numClasses=2, numFeatures=8

# TASK 7: Prediction on New Data with the saved model

In [35]:
#create a new spark dataframe

df_new = spark.read.csv('gs://pyspark-cluster-bucket1/notebooks/jupyter/new_test.csv', header=True, inferSchema=True)

In [36]:
#print the schema
df_new.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)



In [38]:
#create an additional feature merged column 

test_data = assembler.transform(df_new)

In [39]:
#print the schema

test_data.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- features: vector (nullable = true)



In [41]:
test_data.show(5, 0)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------------------------------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI |DiabetesPedigreeFunction|Age|features                                   |
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------------------------------------------+
|1          |190    |78           |38           |150    |45.1|0.153                   |48 |[1.0,190.0,78.0,38.0,150.0,45.1,0.153,48.0]|
|0          |80     |84           |36           |120    |50.2|0.211                   |26 |[0.0,80.0,84.0,36.0,120.0,50.2,0.211,26.0] |
|2          |138    |82           |46           |255    |52.3|0.315                   |30 |[2.0,138.0,82.0,46.0,255.0,52.3,0.315,30.0]|
|1          |110    |63           |44           |480    |62.7|0.616                   |32 |[1.0,110.0,63.0,44.0,480.0,62.7,0.616,32.0]|
+-----------+-------+-------------+-------------

In [42]:
#use model to make predictions

results = model.transform(test_data)

In [43]:
#display the predictions

results.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [44]:
results.show(5,0)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------------------------------------------+----------------------------------------+----------------------------------------+----------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI |DiabetesPedigreeFunction|Age|features                                   |rawPrediction                           |probability                             |prediction|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------------------------------------------+----------------------------------------+----------------------------------------+----------+
|1          |190    |78           |38           |150    |45.1|0.153                   |48 |[1.0,190.0,78.0,38.0,150.0,45.1,0.153,48.0]|[-1.9471814109223917,1.9471814109223917]|[0.12486102200603927,0.8751389779939607]|1.0       |
|0          |80     |84           |36           |120    |50.2|0.211                 