In [1]:
import random
import pandas as pd

# Function to generate random patient profiles
def generate_patient_profiles(num_patients=10000, seed=None):
    """Generate synthetic patient records with uniformly random vitals.

    Args:
        num_patients: Number of records to create. Zero or a negative
            value yields an empty list.
        seed: Optional seed for reproducible output. When given, values are
            drawn from a private ``random.Random(seed)`` so the module-level
            generator's state is left untouched. When ``None`` (default),
            the module-level ``random`` generator is used, so callers that
            already call ``random.seed(...)`` keep getting the same data.

    Returns:
        A list of dicts, one per patient, with the keys ``'Name'``,
        ``'Age'``, ``'Gender'``, ``'BP'``, ``'Sugar Level'``,
        ``'Cholesterol'`` and ``'Haemoglobin'``. ``'BP'`` is a 2-tuple of
        floats (presumably systolic, diastolic -- confirm with the author).
    """
    # The `random` module and a `random.Random` instance expose the same
    # randint/choice/uniform API, so either can serve as the source.
    rng = random if seed is None else random.Random(seed)

    profiles = []
    for i in range(num_patients):
        # Draw order is kept fixed so a given seed always maps to the same
        # records.
        name = f"Patient_{i+1}"
        age = rng.randint(18, 80)  # inclusive on both ends
        gender = rng.choice(['Male', 'Female'])
        bp = rng.uniform(80, 120), rng.uniform(50, 80)
        sugar_level = rng.uniform(70, 140)
        cholesterol = rng.uniform(125, 200)
        haemoglobin = rng.uniform(12, 18)
        profiles.append({
            'Name': name,
            'Age': age,
            'Gender': gender,
            'BP': bp,
            'Sugar Level': sugar_level,
            'Cholesterol': cholesterol,
            'Haemoglobin': haemoglobin
        })
    return profiles

# Seed the module-level RNG so that Restart & Run All regenerates the same
# synthetic cohort, and therefore the same CSV and downstream statistics.
SEED = 42
random.seed(SEED)

# Generate profiles
profiles = generate_patient_profiles()

# Convert to DataFrame
df = pd.DataFrame(profiles)

# Save to CSV (no index column). Note: the 'BP' column holds 2-tuples, which
# are written to the file as their string representation.
df.to_csv('patient_profiles.csv', index=False)

In [2]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session used by this notebook
spark = (
    SparkSession.builder
    .appName("HealthMonitoring")
    .getOrCreate()
)

# Read the patient CSV: first row is the header, column types are inferred
df = spark.read.csv('patient_profiles.csv', header=True, inferSchema=True)

# Preview the loaded rows
df.show()

# Register the DataFrame as the "patients" view so it can be queried with SQL
df.createOrReplaceTempView("patients")

# Mean sugar level (backticks quote the column name, which contains a space)
sugar_query = "SELECT AVG(`Sugar Level`) AS avg_sugar_level FROM patients"
average_sugar_level = spark.sql(sugar_query)
average_sugar_level.show()

# Mean cholesterol level
cholesterol_query = "SELECT AVG(Cholesterol) AS avg_cholesterol_level FROM patients"
average_cholesterol_level = spark.sql(cholesterol_query)
average_cholesterol_level.show()

# Mean haemoglobin level
haemoglobin_query = "SELECT AVG(Haemoglobin) AS avg_haemoglobin_level FROM patients"
average_haemoglobin_level = spark.sql(haemoglobin_query)
average_haemoglobin_level.show()

+----------+---+------+--------------------+------------------+------------------+------------------+
|      Name|Age|Gender|                  BP|       Sugar Level|       Cholesterol|       Haemoglobin|
+----------+---+------+--------------------+------------------+------------------+------------------+
| Patient_1| 36|Female|(113.529091650890...| 78.37021696676622|141.98031029155828| 17.58658736642242|
| Patient_2| 36|  Male|(99.4209607637263...| 79.45262325539666|161.48250288775625|13.176176586634828|
| Patient_3| 59|Female|(89.5116431492371...|115.30995213321907|184.16471411922575|14.180600245125646|
| Patient_4| 19|  Male|(84.2954191430270...| 70.93777296616305|154.59721595426066|15.576670987239979|
| Patient_5| 59|  Male|(82.7866797505711...|104.93765144269032|181.26268727386815|13.372250618630973|
| Patient_6| 39|Female|(94.2417651383286...|109.28247014994292|184.71583359322665|12.191969712275403|
| Patient_7| 73|  Male|(109.482744992358...| 74.26197953968475|197.10993191044415|

In [3]:
# Number of patients per gender
patients_by_gender = df.groupBy("Gender")
gender_count = patients_by_gender.count()
gender_count.show()

# Mean patient age: the aggregation yields a single row whose value is
# stored under the 'avg(Age)' key
age_rows = df.agg({'Age': 'avg'}).collect()
average_age = age_rows[0]
print(f"Average Age: {average_age['avg(Age)']}")

+------+-----+
|Gender|count|
+------+-----+
|Female| 5085|
|  Male| 4915|
+------+-----+

Average Age: 48.9392


In [4]:
# Pull the single result row out of each Spark aggregate computed earlier
sugar_row = average_sugar_level.collect()[0]
cholesterol_row = average_cholesterol_level.collect()[0]
haemoglobin_row = average_haemoglobin_level.collect()[0]

# Label -> value mapping that feeds the dashboard
statistics = {
    'Average Sugar Level': sugar_row['avg_sugar_level'],
    'Average Cholesterol Level': cholesterol_row['avg_cholesterol_level'],
    'Average Haemoglobin Level': haemoglobin_row['avg_haemoglobin_level'],
    'Average Age': average_age['avg(Age)'],
}

# One row per statistic, written to CSV without the index column
stats_df = pd.DataFrame(
    [(label, value) for label, value in statistics.items()],
    columns=['Statistic', 'Value'],
)
stats_df.to_csv('statistics.csv', index=False)

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem at this path
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)