In [17]:
!pip install pyspark



In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, avg, when

# Start (or reuse) the Spark session that the rest of the notebook relies on.
spark = (
    SparkSession.builder
    .appName("PatientHealthAnalysis")
    .getOrCreate()
)

# Load the patient records: the first CSV row supplies the column names and
# Spark infers each column's type from the data.
df = spark.read.csv(
    "patients_data.csv",
    header=True,
    inferSchema=True,
)

# Preview the first five rows as a quick sanity check on the load.
df.show(5)

+--------------------+-------------------+---+------+---------+---------+----+-----------+------------+-----------+-----------+----------+
|          Patient_ID|               Name|Age|Gender|Weight_kg|Height_cm| BMI|BP_Systolic|BP_Diastolic|Sugar_Level|Cholesterol|Hemoglobin|
+--------------------+-------------------+---+------+---------+---------+----+-----------+------------+-----------+-----------+----------+
|bdd640fb-0667-4ad...|       Daniel Doyle| 21|  Male|    101.9|    162.2|38.7|        107|         107|       88.4|      248.1|      14.4|
|8b9d2434-e465-415...|     Javier Johnson| 22|Female|     52.1|    160.9|20.1|        154|          98|       74.8|      139.8|      15.2|
|9a1de644-815e-46d...|    Cristian Santos| 46|Female|     81.4|    163.9|30.3|         90|         108|      215.0|      239.6|      12.7|
|72ff5d2a-386e-4be...|       Rhonda Smith| 45|  Male|    117.0|    166.8|42.1|        101|          84|       87.4|      269.5|      14.8|
|6c307511-b2b9-437...|Chris

In [19]:
# Print every column with its inferred type and nullability to verify that
# the CSV was parsed as expected.
df.printSchema()

root
 |-- Patient_ID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Weight_kg: double (nullable = true)
 |-- Height_cm: double (nullable = true)
 |-- BMI: double (nullable = true)
 |-- BP_Systolic: integer (nullable = true)
 |-- BP_Diastolic: integer (nullable = true)
 |-- Sugar_Level: double (nullable = true)
 |-- Cholesterol: double (nullable = true)
 |-- Hemoglobin: double (nullable = true)



# Basic Statistics

In [25]:
# Summary statistics (count, mean and the other summary rows) for every
# column. The string columns (Patient_ID, Name, Gender) have no mean, which
# is why they show NULL in that row.
df.describe().show()

+-------+--------------------+----------------+-----------------+------+------------------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+
|summary|          Patient_ID|            Name|              Age|Gender|         Weight_kg|         Height_cm|               BMI|      BP_Systolic|     BP_Diastolic|       Sugar_Level|      Cholesterol|        Hemoglobin|
+-------+--------------------+----------------+-----------------+------+------------------+------------------+------------------+-----------------+-----------------+------------------+-----------------+------------------+
|  count|               10000|           10000|            10000| 10000|             10000|             10000|             10000|            10000|            10000|             10000|            10000|             10000|
|   mean|                NULL|            NULL|          53.7921|  NULL| 85.36331999999997| 174.8448299999999|28

In [20]:
# Headline figures for the whole cohort: the row count followed by the mean
# of age and of each measurement listed below, computed in one aggregation.
mean_columns = [
    ("Age", "Avg Age"),
    ("BMI", "Avg BMI"),
    ("BP_Systolic", "Avg Systolic BP"),
    ("BP_Diastolic", "Avg Diastolic BP"),
    ("Sugar_Level", "Avg Sugar Level"),
    ("Cholesterol", "Avg Cholesterol"),
    ("Hemoglobin", "Avg Hemoglobin"),
]

# The total comes first so the output column order matches the table above.
summary_exprs = [count("*").alias("Total Patients")]
summary_exprs += [avg(source).alias(label) for source, label in mean_columns]

df.select(*summary_exprs).show()

+--------------+-------+------------------+---------------+----------------+------------------+-----------------+------------------+
|Total Patients|Avg Age|           Avg BMI|Avg Systolic BP|Avg Diastolic BP|   Avg Sugar Level|  Avg Cholesterol|    Avg Hemoglobin|
+--------------+-------+------------------+---------------+----------------+------------------+-----------------+------------------+
|         10000|53.7921|28.498029999999964|       135.3349|         89.9937|159.66107000000008|200.0592500000007|13.978190000000001|
+--------------+-------+------------------+---------------+----------------+------------------+-----------------+------------------+



In [21]:
# Count patients in each BMI band. The bands are half-open so that every
# non-null BMI falls into exactly one of them: the normal band ends at
# `< 25` (not `<= 24.9`), so a value such as 24.95 is not silently dropped
# between "Normal Weight" and "Overweight".
bmi = col("BMI")
df.select(
    count(when(bmi < 18.5, True)).alias("Underweight"),
    count(when((bmi >= 18.5) & (bmi < 25), True)).alias("Normal Weight"),
    count(when(bmi >= 25, True)).alias("Overweight"),
).show()

# Bucket patients by systolic pressure alone (diastolic is not consulted).
# NOTE(review): readings from 121 through 140 match none of the three
# conditions below, so these counts will not add up to the patient total.
systolic = col("BP_Systolic")
is_high = systolic > 140
is_low = systolic < 90
is_normal = systolic.between(90, 120)  # inclusive on both ends

df.select(
    count(when(is_high, True)).alias("High BP (Hypertension)"),
    count(when(is_low, True)).alias("Low BP"),
    count(when(is_normal, True)).alias("Normal BP"),
).show()

# Split patients by sugar level: above 140 is reported as diabetes risk,
# below 70 as low, and 70 through 140 inclusive as normal.
sugar = col("Sugar_Level")
sugar_counts = df.select(
    count(when(sugar > 140, True)).alias("Diabetes Risk"),
    count(when(sugar < 70, True)).alias("Low Sugar"),
    count(when(sugar.between(70, 140), True)).alias("Normal Sugar Level"),
)
sugar_counts.show()

# Tally patients per cholesterol band: above 200 is high, below 100 is low,
# and 100 through 200 inclusive is normal. Each entry pairs the output
# column label with the condition that selects its rows.
cholesterol = col("Cholesterol")
cholesterol_bands = [
    ("High Cholesterol", cholesterol > 200),
    ("Low Cholesterol", cholesterol < 100),
    ("Normal Cholesterol", cholesterol.between(100, 200)),
]

df.select(
    *[
        count(when(condition, True)).alias(label)
        for label, condition in cholesterol_bands
    ]
).show()

+-----------+-------------+----------+
|Underweight|Normal Weight|Overweight|
+-----------+-------------+----------+
|       1142|         2635|      6223|
+-----------+-------------+----------+

+----------------------+------+---------+
|High BP (Hypertension)|Low BP|Normal BP|
+----------------------+------+---------+
|                  4494|     0|     3326|
+----------------------+------+---------+

+-------------+---------+------------------+
|Diabetes Risk|Low Sugar|Normal Sugar Level|
+-------------+---------+------------------+
|         6097|        0|              3903|
+-------------+---------+------------------+

+----------------+---------------+------------------+
|High Cholesterol|Low Cholesterol|Normal Cholesterol|
+----------------+---------------+------------------+
|            5005|              0|              4995|
+----------------+---------------+------------------+

