In [48]:
import findspark 

findspark.init()
findspark.find()

'c:\\Python312\\Lib\\site-packages\\pyspark'

In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import types 

In [2]:
from pyspark.sql.window import Window

In [3]:
# creating a session
spark = SparkSession.builder.appName('CovidCaseAnalysis').getOrCreate()

In [4]:
spark_context = spark.sparkContext

In [5]:
spark_context

In [6]:
#load the csv file, enabling header and datatypes with it
df_coviddata = spark.read.csv('covid_cases_dataset.csv', header=True, inferSchema=True)
# df_coviddata2 = spark.read.option('header', 'true').csv('covid_cases_dataset.csv')

In [7]:
#to see the structure of the dataset
df_coviddata.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Name of State / UT: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Total Confirmed cases: integer (nullable = true)
 |-- Death: string (nullable = true)
 |-- Cured/Discharged/Migrated: integer (nullable = true)
 |-- New cases: integer (nullable = true)
 |-- New deaths: integer (nullable = true)
 |-- New recovered: integer (nullable = true)



In [8]:
df_coviddata = df_coviddata.withColumn("Death", df_coviddata["Death"].cast(types.IntegerType()))

In [9]:
df_coviddata.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Name of State / UT: string (nullable = true)
 |-- Latitude: double (nullable = true)
 |-- Longitude: double (nullable = true)
 |-- Total Confirmed cases: integer (nullable = true)
 |-- Death: integer (nullable = true)
 |-- Cured/Discharged/Migrated: integer (nullable = true)
 |-- New cases: integer (nullable = true)
 |-- New deaths: integer (nullable = true)
 |-- New recovered: integer (nullable = true)



In [10]:
numeric_cols = ['Total Confirmed cases','Death','Cured/Discharged/Migrated','New cases','New deaths','New recovered']
df_coviddata_f = df_coviddata.fillna(0, subset=numeric_cols)

In [11]:
string_cols = ['Date','Name of State / UT']
df_coviddata_f = df_coviddata_f.dropna(subset=string_cols)

In [12]:
df_coviddata_f = df_coviddata.drop(F.col('Latitude'))
df_coviddata_f = df_coviddata_f.drop(F.col('Longitude'))
df_coviddata_f.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Name of State / UT: string (nullable = true)
 |-- Total Confirmed cases: integer (nullable = true)
 |-- Death: integer (nullable = true)
 |-- Cured/Discharged/Migrated: integer (nullable = true)
 |-- New cases: integer (nullable = true)
 |-- New deaths: integer (nullable = true)
 |-- New recovered: integer (nullable = true)



In [62]:
df_coviddata_f.show()

+----------+------------------+--------+---------+---------------------+-----+-------------------------+---------+----------+-------------+
|      Date|Name of State / UT|Latitude|Longitude|Total Confirmed cases|Death|Cured/Discharged/Migrated|New cases|New deaths|New recovered|
+----------+------------------+--------+---------+---------------------+-----+-------------------------+---------+----------+-------------+
|30-01-2020|            Kerala| 10.8505|  76.2711|                    1|    0|                        0|        0|         0|            0|
|31-01-2020|            Kerala| 10.8505|  76.2711|                    1|    0|                        0|        0|         0|            0|
|01-02-2020|            Kerala| 10.8505|  76.2711|                    2|    0|                        0|        1|         0|            0|
|02-02-2020|            Kerala| 10.8505|  76.2711|                    3|    0|                        0|        1|         0|            0|
|03-02-2020|        

In [100]:
df_coviddata = (df_coviddata
    .withColumn("State", df_coviddata["Name of State / UT"])
    .withColumn("Cured", df_coviddata["Cured/Discharged/Migrated"])
)

In [101]:
df_coviddata.show()

+----------+------------------+--------+---------+---------------------+-----+-------------------------+---------+----------+-------------+------+-----+
|      Date|Name of State / UT|Latitude|Longitude|Total Confirmed cases|Death|Cured/Discharged/Migrated|New cases|New deaths|New recovered| State|Cured|
+----------+------------------+--------+---------+---------------------+-----+-------------------------+---------+----------+-------------+------+-----+
|30-01-2020|            Kerala| 10.8505|  76.2711|                    1|    0|                        0|        0|         0|            0|Kerala|    0|
|31-01-2020|            Kerala| 10.8505|  76.2711|                    1|    0|                        0|        0|         0|            0|Kerala|    0|
|01-02-2020|            Kerala| 10.8505|  76.2711|                    2|    0|                        0|        1|         0|            0|Kerala|    0|
|02-02-2020|            Kerala| 10.8505|  76.2711|                    3|    0|    

In [102]:
# to get a specific colum
df_coviddata.select('Cured') # return a dataframe

DataFrame[Cured: int]

In [103]:
df_coviddata.select('Cured').show()

+-----+
|Cured|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
|    0|
+-----+
only showing top 20 rows



In [104]:
#to select multiple columns
df_coviddata.select(['Total Confirmed cases','Cured']).show()

+---------------------+-----+
|Total Confirmed cases|Cured|
+---------------------+-----+
|                    1|    0|
|                    1|    0|
|                    2|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
|                    3|    0|
+---------------------+-----+
only showing top 20 rows



In [105]:
#to drop a column
df_coviddata = df_coviddata.drop('Latitude')
df_coviddata = df_coviddata.drop('Longitude')

In [106]:
df_coviddata.show()

+----------+------------------+---------------------+-----+-------------------------+---------+----------+-------------+------+-----+
|      Date|Name of State / UT|Total Confirmed cases|Death|Cured/Discharged/Migrated|New cases|New deaths|New recovered| State|Cured|
+----------+------------------+---------------------+-----+-------------------------+---------+----------+-------------+------+-----+
|30-01-2020|            Kerala|                    1|    0|                        0|        0|         0|            0|Kerala|    0|
|31-01-2020|            Kerala|                    1|    0|                        0|        0|         0|            0|Kerala|    0|
|01-02-2020|            Kerala|                    2|    0|                        0|        1|         0|            0|Kerala|    0|
|02-02-2020|            Kerala|                    3|    0|                        0|        1|         0|            0|Kerala|    0|
|03-02-2020|            Kerala|                    3|    0|   

In [107]:
#to rename the column
df_coviddata = df_coviddata.withColumnRenamed('oldName', 'newName')

In [108]:
#drop all nullable rows
df_coviddata.na.drop()

DataFrame[Date: string, Name of State / UT: string, Total Confirmed cases: int, Death: string, Cured/Discharged/Migrated: int, New cases: int, New deaths: int, New recovered: int, State: string, Cured: int]

Q1. To Convert all state names to LOWERCASE 

In [13]:
states_in_lowercase = df_coviddata_f.select(F.lower(F.col('Name of State / UT')).alias("State_InLowerCase"))
states_in_lowercase.distinct().show()

+--------------------+
|   State_InLowerCase|
+--------------------+
|               delhi|
|         maharashtra|
|           meghalaya|
|              odisha|
|             haryana|
|         west bengal|
|                 goa|
|              punjab|
|   jammu and kashmir|
|dadra and nagar h...|
|           karnataka|
|      andhra pradesh|
|           telangana|
|            nagaland|
|               bihar|
|      madhya pradesh|
|           jharkhand|
|               assam|
|              kerala|
|          tamil nadu|
+--------------------+
only showing top 20 rows



Day with high covid cases

In [14]:
df_coviddata_f = df_coviddata_f.withColumn("TotalCases", F.col('Total Confirmed cases')+F.col('New cases')-F.col('New recovered')-F.col('New deaths'))
df_coviddata_f.show()

+----------+------------------+---------------------+-----+-------------------------+---------+----------+-------------+----------+
|      Date|Name of State / UT|Total Confirmed cases|Death|Cured/Discharged/Migrated|New cases|New deaths|New recovered|TotalCases|
+----------+------------------+---------------------+-----+-------------------------+---------+----------+-------------+----------+
|30-01-2020|            Kerala|                    1|    0|                        0|        0|         0|            0|         1|
|31-01-2020|            Kerala|                    1|    0|                        0|        0|         0|            0|         1|
|01-02-2020|            Kerala|                    2|    0|                        0|        1|         0|            0|         3|
|02-02-2020|            Kerala|                    3|    0|                        0|        1|         0|            0|         4|
|03-02-2020|            Kerala|                    3|    0|                 

In [15]:
total_case_by_date = df_coviddata_f.groupBy('Date').agg(F.sum('TotalCases').alias('TotalCase'))
total_case_by_date

DataFrame[Date: string, TotalCase: bigint]

In [16]:
max_covid_case = total_case_by_date.orderBy(F.col('TotalCase').desc()).limit(1)
max_covid_case.show()

+----------+---------+
|      Date|TotalCase|
+----------+---------+
|06-08-2020|  1974697|
+----------+---------+



State that has second largest covid test cases

In [17]:
total_case_by_state = df_coviddata_f.groupBy('Name of State / UT').agg(F.sum('TotalCases').alias('TotalCase'))
total_case_by_state.show(truncate=False)

+----------------------------------------+---------+
|Name of State / UT                      |TotalCase|
+----------------------------------------+---------+
|Nagaland                                |46816    |
|Karnataka                               |2810670  |
|Odisha                                  |845046   |
|Kerala                                  |608375   |
|Ladakh                                  |57628    |
|Dadra and Nagar Haveli and Daman and Diu|26614    |
|Tamil Nadu                              |7905727  |
|Telengana                               |105585   |
|Chhattisgarh                            |259124   |
|Andhra Pradesh                          |2824160  |
|Madhya Pradesh                          |1301151  |
|Punjab                                  |546880   |
|Manipur                                 |85231    |
|Goa                                     |152932   |
|Mizoram                                 |13585    |
|Himachal Pradesh                        |8170

In [18]:
window_spec = Window.orderBy(F.col('TotalCase').desc())
ranked_covid_cases = total_case_by_state.withColumn('rank', F.row_number().over(window_spec))
second_largest_df = ranked_covid_cases.filter(F.col('rank') == 2)

In [19]:
second_largest_df.drop('rank').show()

+------------------+---------+
|Name of State / UT|TotalCase|
+------------------+---------+
|        Tamil Nadu|  7905727|
+------------------+---------+



Which union teritory has least number of deaths

In [20]:
ut_df = df_coviddata_f.filter(F.col('Name of State / UT').like('Union Territory of%'))
ut_df.show()

+----------+--------------------+---------------------+-----+-------------------------+---------+----------+-------------+----------+
|      Date|  Name of State / UT|Total Confirmed cases|Death|Cured/Discharged/Migrated|New cases|New deaths|New recovered|TotalCases|
+----------+--------------------+---------------------+-----+-------------------------+---------+----------+-------------+----------+
|07-03-2020|Union Territory o...|                    2|    0|                        0|        0|         0|            0|         2|
|08-03-2020|Union Territory o...|                    2|    0|                        0|        0|         0|            0|         2|
|09-03-2020|Union Territory o...|                    1|    0|                        0|        0|         0|            0|         1|
|09-03-2020|Union Territory o...|                    2|    0|                        0|        0|         0|            0|         2|
|10-03-2020|Union Territory o...|                    1|    0| 

In [21]:
ut_df = ut_df.withColumn("TotalDeaths", F.col('Death')+F.col('New deaths'))
ut_df.show()

+----------+--------------------+---------------------+-----+-------------------------+---------+----------+-------------+----------+-----------+
|      Date|  Name of State / UT|Total Confirmed cases|Death|Cured/Discharged/Migrated|New cases|New deaths|New recovered|TotalCases|TotalDeaths|
+----------+--------------------+---------------------+-----+-------------------------+---------+----------+-------------+----------+-----------+
|07-03-2020|Union Territory o...|                    2|    0|                        0|        0|         0|            0|         2|          0|
|08-03-2020|Union Territory o...|                    2|    0|                        0|        0|         0|            0|         2|          0|
|09-03-2020|Union Territory o...|                    1|    0|                        0|        0|         0|            0|         1|          0|
|09-03-2020|Union Territory o...|                    2|    0|                        0|        0|         0|            0|  

In [22]:
ut_covid_death_cases = ut_df.groupBy('Name of State / UT').agg(F.sum('TotalDeaths').alias('TotalDeaths'))
ut_with_least_death = ut_covid_death_cases.orderBy(F.col('TotalDeaths')).limit(1)
ut_with_least_death.show(truncate=False)

+------------------------------------+-----------+
|Name of State / UT                  |TotalDeaths|
+------------------------------------+-----------+
|Union Territory of Jammu and Kashmir|0          |
+------------------------------------+-----------+



State with lowest death to confirmed case ratio

In [23]:
df_coviddata_f = df_coviddata_f.withColumn("TotalDeaths", F.col('Death')+F.col('New deaths'))
df_coviddata_f.show()

+----------+------------------+---------------------+-----+-------------------------+---------+----------+-------------+----------+-----------+
|      Date|Name of State / UT|Total Confirmed cases|Death|Cured/Discharged/Migrated|New cases|New deaths|New recovered|TotalCases|TotalDeaths|
+----------+------------------+---------------------+-----+-------------------------+---------+----------+-------------+----------+-----------+
|30-01-2020|            Kerala|                    1|    0|                        0|        0|         0|            0|         1|          0|
|31-01-2020|            Kerala|                    1|    0|                        0|        0|         0|            0|         1|          0|
|01-02-2020|            Kerala|                    2|    0|                        0|        1|         0|            0|         3|          0|
|02-02-2020|            Kerala|                    3|    0|                        0|        1|         0|            0|         4|     

In [24]:
df_coviddata_f = df_coviddata_f.withColumn("DeathToConfirmedCaseRatio", F.col('TotalDeaths')/F.col('TotalCases'))
df_coviddata_f.show()

+----------+------------------+---------------------+-----+-------------------------+---------+----------+-------------+----------+-----------+-------------------------+
|      Date|Name of State / UT|Total Confirmed cases|Death|Cured/Discharged/Migrated|New cases|New deaths|New recovered|TotalCases|TotalDeaths|DeathToConfirmedCaseRatio|
+----------+------------------+---------------------+-----+-------------------------+---------+----------+-------------+----------+-----------+-------------------------+
|30-01-2020|            Kerala|                    1|    0|                        0|        0|         0|            0|         1|          0|                      0.0|
|31-01-2020|            Kerala|                    1|    0|                        0|        0|         0|            0|         1|          0|                      0.0|
|01-02-2020|            Kerala|                    2|    0|                        0|        1|         0|            0|         3|          0|       

In [25]:
state_lowest_deathToConfirmedCase_ratio = df_coviddata_f.groupBy('Name of State / UT').agg(F.avg('DeathToConfirmedCaseRatio').alias('Ratio'))
lowest_ratio_df = state_lowest_deathToConfirmedCase_ratio.orderBy(F.col('Ratio')).limit(1)
lowest_ratio_df.show()

+------------------+-----+
|Name of State / UT|Ratio|
+------------------+-----+
|           Mizoram|  0.0|
+------------------+-----+



In [26]:
import calendar
from datetime import datetime


def get_month_num(date):
    return datetime.strptime(date, "%d-%m-%Y").month

def get_month_name(month_num):
    return calendar.month_name[int(month_num)]

get_month_num_udf = F.udf(lambda d : get_month_num(d), types.StringType())
get_month_name_udf = F.udf(lambda m : get_month_name(m), types.StringType())

In [27]:
df_coviddata_f = df_coviddata_f.withColumn("month", get_month_num_udf(F.col('Date')))

In [28]:
df_coviddata_f.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Name of State / UT: string (nullable = true)
 |-- Total Confirmed cases: integer (nullable = true)
 |-- Death: integer (nullable = true)
 |-- Cured/Discharged/Migrated: integer (nullable = true)
 |-- New cases: integer (nullable = true)
 |-- New deaths: integer (nullable = true)
 |-- New recovered: integer (nullable = true)
 |-- TotalCases: integer (nullable = true)
 |-- TotalDeaths: integer (nullable = true)
 |-- DeathToConfirmedCaseRatio: double (nullable = true)
 |-- month: string (nullable = true)



In [29]:
df_coviddata_f.head(4)

Py4JJavaError: An error occurred while calling o184.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 25.0 failed 1 times, most recent failure: Lost task 0.0 in stage 25.0 (TID 19) (853CBX3 executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:54)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:131)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1570)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:701)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:745)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:695)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:660)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:636)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:582)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:541)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 27 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.Dataset.$anonfun$collectToPython$1(Dataset.scala:4150)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4324)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4322)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4322)
	at org.apache.spark.sql.Dataset.collectToPython(Dataset.scala:4147)
	at java.base/jdk.internal.reflect.DirectMethodHandleAccessor.invoke(DirectMethodHandleAccessor.java:103)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1570)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:54)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:131)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:701)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:745)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:695)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:660)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:636)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:582)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:541)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 27 more


In [132]:
monthly_recovery_df = df_coviddata.groupBy("month").agg(F.sum("New recovered").alias("total_recovered_cases_by_month"))
monthly_recovery_df = monthly_recovery_df.orderBy(F.col('total_recovered_cases_by_month').desc()).limit(1)
monthly_recovery_df = monthly_recovery_df.withcolumn('month',get_month_name_udf(F.month))
monthly_recovery_df.show()

Py4JJavaError: An error occurred while calling o1136.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 112.0 failed 1 times, most recent failure: Lost task 0.0 in stage 112.0 (TID 90) (Rokesh executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:54)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:131)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1570)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:701)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:745)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:695)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:660)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:636)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:582)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:541)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 29 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:203)
	at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
	at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
	at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:174)
	at org.apache.spark.sql.execution.python.BatchEvalPythonExec.evaluate(BatchEvalPythonExec.scala:54)
	at org.apache.spark.sql.execution.python.EvalPythonExec.$anonfun$doExecute$2(EvalPythonExec.scala:131)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2(RDD.scala:858)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitions$2$adapted(RDD.scala:858)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1570)
Caused by: java.net.SocketTimeoutException: Accept timed out
	at java.base/sun.nio.ch.NioSocketImpl.timedAccept(NioSocketImpl.java:701)
	at java.base/sun.nio.ch.NioSocketImpl.accept(NioSocketImpl.java:745)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:695)
	at java.base/java.net.ServerSocket.platformImplAccept(ServerSocket.java:660)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:636)
	at java.base/java.net.ServerSocket.implAccept(ServerSocket.java:582)
	at java.base/java.net.ServerSocket.accept(ServerSocket.java:541)
	at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:190)
	... 29 more
