<a href="https://colab.research.google.com/github/nickname8888/pyspark-prac/blob/main/pima_diabetes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.functions import col

In [2]:
# Start (or reuse) the Spark session for this notebook, registered under the
# application name "diabetes-eda".
spark = (
    SparkSession.builder
    .appName("diabetes-eda")
    .getOrCreate()
)
spark  # bare expression: displays the session summary

In [3]:
# Load the CSV, taking column names from the first line and letting Spark
# infer each column's type from the data.
df_spark = spark.read.csv(
    path="diabetes.csv",
    header=True,
    inferSchema=True,
)
df_spark  # bare expression: displays the column names and inferred types

DataFrame[Pregnancies: int, Glucose: int, BloodPressure: int, SkinThickness: int, Insulin: int, BMI: double, DiabetesPedigreeFunction: double, Age: int, Outcome: int]

In [4]:
# Preview the first 5 rows of the loaded data.
df_spark.show(n=5)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
only showing top 5 rows



In [5]:
# Print the DataFrame's schema as a tree: one line per column with its type and nullability.
df_spark.printSchema()

root
 |-- Pregnancies: integer (nullable = true)
 |-- Glucose: integer (nullable = true)
 |-- BloodPressure: integer (nullable = true)
 |-- SkinThickness: integer (nullable = true)
 |-- Insulin: integer (nullable = true)
 |-- BMI: double (nullable = true)
 |-- DiabetesPedigreeFunction: double (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Outcome: integer (nullable = true)



In [8]:
# Preview the first 10 rows whose Glucose reading is above 120.
# `col` is imported once in the notebook's top import cell rather than here, so
# every dependency is visible up front and a fresh Restart & Run All does not
# depend on a mid-notebook import.
df_spark.filter(col("Glucose") > 120).show(10)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|
|          2|    197|           70|           45|    543|30.5|                   0.158| 53|      1|
|          8|    125|           96|            0|      0| 0.0|                   0.232| 54|      1|
|         10|    168|           74|            0|      0|38.0|                   0.537| 34|      1|
|         10|    139|           80|            0|      0|27.1|                   1.441| 57|      0|


In [18]:
# List the distinct values taken by the Outcome label column.
outcome_values = df_spark.select("Outcome").distinct()
outcome_values.show()

+-------+
|Outcome|
+-------+
|      1|
|      0|
+-------+



In [19]:
# Mean Glucose across all rows; Spark labels the resulting column avg(Glucose).
# The `f` alias for pyspark.sql.functions is imported in the notebook's top
# import cell rather than here, keeping all imports in one place.
df_spark.select(f.mean("Glucose")).show()

+------------+
|avg(Glucose)|
+------------+
|120.89453125|
+------------+



In [20]:
# Look for rows whose Glucose value is null; an empty table means there are none.
# NOTE(review): isNull() only catches true nulls. The row previews in this
# notebook show 0 in Insulin/SkinThickness and 0.0 in BMI — presumably
# placeholders for missing measurements; confirm before treating the data as complete.
rows_missing_glucose = df_spark.filter(col("Glucose").isNull())
rows_missing_glucose.show()

+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin|BMI|DiabetesPedigreeFunction|Age|Outcome|
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+
+-----------+-------+-------------+-------------+-------+---+------------------------+---+-------+



In [21]:
# Add a BMI_Adjusted column holding BMI scaled up by 10% (factor 1.1).
# The original BMI column is kept unchanged alongside it.
bmi_plus_ten_percent = col("BMI") * 1.1
df_spark = df_spark.withColumn("BMI_Adjusted", bmi_plus_ten_percent)

In [22]:
# Display the DataFrame, now including BMI_Adjusted, with show()'s default
# limits spelled out: up to 20 rows, long cell values truncated.
df_spark.show(n=20, truncate=True)

+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+------------------+
|Pregnancies|Glucose|BloodPressure|SkinThickness|Insulin| BMI|DiabetesPedigreeFunction|Age|Outcome|      BMI_Adjusted|
+-----------+-------+-------------+-------------+-------+----+------------------------+---+-------+------------------+
|          6|    148|           72|           35|      0|33.6|                   0.627| 50|      1| 36.96000000000001|
|          1|     85|           66|           29|      0|26.6|                   0.351| 31|      0|29.260000000000005|
|          8|    183|           64|            0|      0|23.3|                   0.672| 32|      1|25.630000000000003|
|          1|     89|           66|           23|     94|28.1|                   0.167| 21|      0|30.910000000000004|
|          0|    137|           40|           35|    168|43.1|                   2.288| 33|      1|47.410000000000004|
|          5|    116|           74|            0

In [23]:
# Overall diabetes rate: the percentage of all rows with Outcome == 1.
# This is the baseline that the 50-and-over group's rate is compared against
# in the following cells.

total_count = df_spark.count()
has_diabetes = col("Outcome") == 1
diabetes_count = df_spark.filter(has_diabetes).count()

# Divide first, then scale to a percentage.
diabetes_fraction = diabetes_count / total_count
rate_of_diabetes = diabetes_fraction * 100
print("Average rate of diabetes is ", rate_of_diabetes)

Average rate of diabetes is  34.89583333333333


In [25]:
# Diabetes rate, as a percentage, among rows whose Age is 50 or more.
# NOTE(review): the filter is inclusive (Age >= 50) although the printed label
# says "over 50" — confirm which boundary is intended.
is_fifty_plus = col("Age") >= 50
is_diabetic = col("Outcome") == 1

over_fifty_count = df_spark.filter(is_fifty_plus).count()
over_fifty_diabetes_count = df_spark.filter(is_fifty_plus & is_diabetic).count()

# Divide first, then scale to a percentage.
over_fifty_fraction = over_fifty_diabetes_count / over_fifty_count
over_fifty_rate_of_diabetes = over_fifty_fraction * 100
print("Average rate of diabetes for people over 50 is ", over_fifty_rate_of_diabetes)

Average rate of diabetes for people over 50 is  48.31460674157304


In [28]:
# Gap, in percentage points, between the 50-and-over group's diabetes rate and
# the overall rate (positive when the older group's rate is higher).
diff = over_fifty_rate_of_diabetes - rate_of_diabetes
print("The difference between the two rates is", diff, sep=" ")

The difference between the two rates is 13.418773408239709
