In [0]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Initialize SparkSession (optional if already running in Databricks)
spark = SparkSession.builder.appName("DataAnalysis").getOrCreate()

# Query the Delta table from the catalog
df = spark.sql("SELECT * FROM final_project.default.global_health_statistics")

# Display the first few rows
df.show()




+------------+----+-------------------+----------------+-------------------+------------------+------------------+---------+------+-------------------+---------------------+----------------+----------------------+--------------+----------------------------+----------------------------------+-----------------+-----+--------------------------+-----------------------+---------------+---------------------+
|     Country|Year|       Disease Name|Disease Category|Prevalence Rate (%)|Incidence Rate (%)|Mortality Rate (%)|Age Group|Gender|Population Affected|Healthcare Access (%)|Doctors per 1000|Hospital Beds per 1000|Treatment Type|Average Treatment Cost (USD)|Availability of Vaccines/Treatment|Recovery Rate (%)|DALYs|Improvement in 5 Years (%)|Per Capita Income (USD)|Education Index|Urbanization Rate (%)|
+------------+----+-------------------+----------------+-------------------+------------------+------------------+---------+------+-------------------+---------------------+---------------

Top Diseases by Impact

In [0]:
# Top diseases by prevalence
df.groupBy("Disease Name").agg(mean("Prevalence Rate (%)").alias("Avg Prevalence Rate")).orderBy("Avg Prevalence Rate", ascending=False).show(10)

# Top diseases by mortality
df.groupBy("Disease Name").agg(mean("Mortality Rate (%)").alias("Avg Mortality Rate")).orderBy("Avg Mortality Rate", ascending=False).show(10)


+-------------------+-------------------+
|       Disease Name|Avg Prevalence Rate|
+-------------------+-------------------+
|             Rabies|  10.09890725362682|
|            Cholera| 10.088265438118167|
|           HIV/AIDS| 10.078832000636504|
|Parkinson's Disease| 10.069055282851867|
|Alzheimer's Disease|  10.06439455673082|
|             Dengue| 10.061142794646965|
|             Cancer| 10.058251566073372|
|              Ebola|  10.05788839249778|
|            Measles| 10.057735242078166|
|              Polio| 10.048862398911027|
+-------------------+-------------------+
only showing top 10 rows

+-------------------+------------------+
|       Disease Name|Avg Mortality Rate|
+-------------------+------------------+
|Parkinson's Disease|  5.06933773235697|
|             Cancer|5.0689332803022795|
|             Rabies| 5.063906353176595|
|          Influenza| 5.057569462529309|
|               Zika| 5.056438097509588|
|          Hepatitis| 5.054697618571135|
|Alzheimer's Dise

Effectiveness of Vaccines and Treatments

In [0]:
# Recovery rates by vaccine/treatment availability
df.groupBy("Availability of Vaccines/Treatment").agg(mean("Recovery Rate (%)").alias("Avg Recovery Rate")).show()


+----------------------------------+-----------------+
|Availability of Vaccines/Treatment|Avg Recovery Rate|
+----------------------------------+-----------------+
|                                No|74.48564974001535|
|                               Yes|74.50820187307393|
+----------------------------------+-----------------+



 Regional Analysis of a Disease

In [0]:
# Regional analysis for a specific disease
disease = "Tuberculosis"  # Replace with your disease of interest
df.filter(df["Disease Name"] == disease).groupBy("Country").agg(mean("Prevalence Rate (%)").alias("Avg Prevalence")).orderBy("Avg Prevalence", ascending=False).show()


+------------+------------------+
|     Country|    Avg Prevalence|
+------------+------------------+
|       Italy|10.226711327649205|
|       India|10.223795761078993|
|       Japan|10.140816246056778|
|      Canada|10.121070853462157|
|      Russia|10.112215312864361|
|      Brazil| 10.10954102355808|
|      Mexico|10.094845779220782|
|   Indonesia|10.043700950020657|
| South Korea| 10.02653784860558|
|   Australia|10.011761942051685|
|      Turkey| 9.988912956277575|
|South Africa|  9.97534524776604|
|     Nigeria|  9.97390996248437|
|          UK| 9.963371496249506|
|       China| 9.918346938775514|
|Saudi Arabia| 9.918204704911265|
|         USA| 9.876732793522269|
|   Argentina| 9.876645694062127|
|      France| 9.874431909750196|
|     Germany| 9.859170923379173|
+------------+------------------+



Disease Trends

In [0]:
# Yearly trends in prevalence rate
df.groupBy("Year").agg(mean("Prevalence Rate (%)").alias("Avg Prevalence Rate")).orderBy("Year").show()


+----+-------------------+
|Year|Avg Prevalence Rate|
+----+-------------------+
|2000| 10.038363216449785|
|2001|  10.11728769801484|
|2002|  10.05502159099418|
|2003| 10.046103485214621|
|2004|  10.05270140641002|
|2005| 10.032667649042178|
|2006| 10.059378858699734|
|2007| 10.037947492260052|
|2008| 10.025843921509674|
|2009|  10.06484168766684|
|2010| 10.039032379329369|
|2011|   9.99733091239217|
|2012| 10.092937313582485|
|2013| 10.055976388083037|
|2014| 10.031369555908208|
|2015|  10.03066418171812|
|2016|  9.991070199071995|
|2017| 10.105064765538712|
|2018| 10.053803321069923|
|2019| 10.064259347255218|
+----+-------------------+
only showing top 20 rows



Gender Disparities

In [0]:
# Disease metrics by gender
df.groupBy("Gender").agg(
    mean("Prevalence Rate (%)").alias("Avg Prevalence Rate"),
    mean("Mortality Rate (%)").alias("Avg Mortality Rate"),
    mean("Recovery Rate (%)").alias("Avg Recovery Rate")
).show()


+------+-------------------+------------------+-----------------+
|Gender|Avg Prevalence Rate|Avg Mortality Rate|Avg Recovery Rate|
+------+-------------------+------------------+-----------------+
|Female| 10.051657448615535| 5.043412069394948|74.44995816615345|
| Other|  10.03810471898917| 5.053477203610858|74.50819661303923|
|  Male| 10.054201351011145| 5.052864605185814|74.53260222491379|
+------+-------------------+------------------+-----------------+



Top Performing Countries

In [0]:
# Rank countries by health outcomes
df.groupBy("Country").agg(
    mean("Prevalence Rate (%)").alias("Avg Prevalence"),
    mean("Mortality Rate (%)").alias("Avg Mortality"),
    mean("Recovery Rate (%)").alias("Avg Recovery")
).orderBy("Avg Recovery", ascending=False).show(10)


+------------+------------------+------------------+-----------------+
|     Country|    Avg Prevalence|     Avg Mortality|     Avg Recovery|
+------------+------------------+------------------+-----------------+
|   Indonesia| 10.03921396414502| 5.050056877562511|74.60450076372683|
|Saudi Arabia|10.015586492653831|5.0338063973738105|74.60434905320463|
|       Italy|10.057821384859242| 5.033974397560154|74.56533899155275|
|     Germany|10.063281648596908|5.0623638791454075|74.55059889190053|
|       China|10.004088603043984| 5.052352894179684|74.53306934846007|
|   Argentina|10.047871400457877| 5.043000120486759|74.52912084822685|
|      Canada|10.043381091112238| 5.039311170531193|74.51206050205518|
|      Mexico| 10.13214416932906| 5.060097843450479|74.50984824281149|
|         USA| 10.03176406948092|5.0353713461422895|74.50653957886749|
|   Australia|  9.99765179268513| 5.053099713730918|74.49263687866572|
+------------+------------------+------------------+-----------------+
only s

In [0]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# Prepare data for clustering
assembler = VectorAssembler(inputCols=["Prevalence Rate (%)", "Mortality Rate (%)", "Recovery Rate (%)"], outputCol="features")
clustering_data = assembler.transform(df)

# Apply KMeans
kmeans = KMeans().setK(3).setSeed(1)
model = kmeans.fit(clustering_data)
clusters = model.transform(clustering_data)

# Show clustering results
clusters.select("Country", "features", "prediction").show()


+------------+------------------+----------+
|     Country|          features|prediction|
+------------+------------------+----------+
|       Italy| [0.95,8.42,91.82]|         0|
|      France|[12.46,8.75,76.65]|         2|
|      Turkey| [0.91,6.22,98.55]|         0|
|   Indonesia| [4.68,3.99,67.35]|         2|
|       Italy| [0.83,7.01,50.06]|         1|
|Saudi Arabia|[10.99,4.64,93.17]|         0|
|         USA| [18.42,9.33,92.8]|         0|
|     Nigeria| [3.48,1.21,65.45]|         1|
|       Italy|[15.59,6.38,59.23]|         1|
|   Australia| [10.12,6.0,93.21]|         0|
|      Canada|[11.15,2.97,76.16]|         2|
|      Mexico| [3.88,1.85,63.54]|         1|
|     Nigeria| [8.92,5.52,68.02]|         2|
|      France|  [11.4,7.77,52.4]|         1|
|   Australia|[17.27,7.08,58.19]|         1|
|         USA| [1.82,7.01,87.75]|         0|
|     Nigeria| [15.18,8.21,50.8]|         1|
|   Australia|[13.39,4.04,58.67]|         1|
|      France| [14.44,2.21,54.8]|         1|
|   Indone