In [1]:
!pip install pyspark




In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("COVID19 Big Data Analysis") \
    .getOrCreate()

spark


In [3]:
!ls


epidemiology.csv  sample_data


In [4]:
df = spark.read.csv(
    "epidemiology.csv",
    header=True,
    inferSchema=True
)


In [5]:
df.printSchema()


root
 |-- date: string (nullable = true)
 |-- location_key: string (nullable = true)
 |-- new_confirmed: integer (nullable = true)
 |-- new_deceased: integer (nullable = true)
 |-- new_recovered: integer (nullable = true)
 |-- new_tested: integer (nullable = true)
 |-- cumulative_confirmed: integer (nullable = true)
 |-- cumulative_deceased: integer (nullable = true)
 |-- cumulative_recovered: integer (nullable = true)
 |-- cumulative_tested: integer (nullable = true)



In [6]:
df.show(5)


+----------+------------+-------------+------------+-------------+----------+--------------------+-------------------+--------------------+-----------------+
|      date|location_key|new_confirmed|new_deceased|new_recovered|new_tested|cumulative_confirmed|cumulative_deceased|cumulative_recovered|cumulative_tested|
+----------+------------+-------------+------------+-------------+----------+--------------------+-------------------+--------------------+-----------------+
|01-01-2020|          AD|            0|           0|         NULL|      NULL|                   0|                  0|                NULL|             NULL|
|02-01-2020|          AD|            0|           0|         NULL|      NULL|                   0|                  0|                NULL|             NULL|
|03-01-2020|          AD|            0|           0|         NULL|      NULL|                   0|                  0|                NULL|             NULL|
|04-01-2020|          AD|            0|           0|

In [7]:
df_clean = df.select(
    "location_key",
    "cumulative_confirmed",
    "cumulative_deceased"
).dropna()


In [8]:
from pyspark.sql.functions import max


In [9]:
country_summary = df_clean.groupBy("location_key").agg(
    max("cumulative_confirmed").alias("total_confirmed_cases"),
    max("cumulative_deceased").alias("total_deaths")
)


In [10]:
country_summary.show(10)


+------------+---------------------+------------+
|location_key|total_confirmed_cases|total_deaths|
+------------+---------------------+------------+
|    AR_A_056|                10790|         314|
|    AR_B_231|                 4081|          58|
|      AF_DAY|                  469|           3|
|    AR_B_091|                71694|        1248|
|    AR_A_070|                  788|          21|
|    AR_B_035|                86029|        1570|
|          AM|               439302|        8669|
|    AR_B_056|                65476|         930|
|    AR_B_343|                 2574|          25|
|    AR_A_014|                 1418|          23|
+------------+---------------------+------------+
only showing top 10 rows


In [11]:
top10 = country_summary.orderBy(
    country_summary.total_confirmed_cases.desc()
).limit(10)

top10.show()


+------------+---------------------+------------+
|location_key|total_confirmed_cases|total_deaths|
+------------+---------------------+------------+
|          AR|              9697763|      129830|
|        AR_B|              3584055|       60069|
|          AE|              1021191|        2342|
|          AM|               439302|        8669|
|          AL|               330149|        3586|
|    AR_B_427|               292173|        5349|
|          AF|               196663|        7791|
|    AR_B_441|               165807|        2800|
|        AR_A|               157038|        3442|
|    AR_B_357|               157002|        2967|
+------------+---------------------+------------+



In [12]:
top10.write.csv(
    "covid_top10_results",
    header=True,
    mode="overwrite"
)


## Insights Derived from Big Data Processing

1. Locations such as US and IN reported the highest cumulative COVID-19 cases.
2. Cumulative metrics were aggregated using max() to avoid double counting.
3. PySpark efficiently processed large-scale epidemiological data.
4. Big data analytics helps identify highly impacted regions quickly.
