In [2]:
# Extract the dataset CSV from the archive.
# -o overwrites existing files without asking, so re-running this cell
# (e.g. Restart & Run All) does not stop at unzip's "replace?" prompt.
!unzip -o archive.zip

Archive:  archive.zip
  inflating: Latest Covid-19 India Status.csv  


In [None]:
import pandas as pd

In [None]:
# Load the extracted CSV into a pandas DataFrame.
# Note: the name `dataset` is reassigned further down from the Spark
# DataFrame (renamed_df.toPandas()).
dataset = pd.read_csv('Latest Covid-19 India Status.csv')

In [3]:
# Install what Spark needs: a headless Java 8 JDK, the Spark 3.1.1
# (Hadoop 3.2 build) distribution, and the findspark package.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download over HTTPS; -nc skips the download when the tarball is already
# present, so a re-run does not fetch it again as "...tgz.1".
!wget -q -nc https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q findspark

In [4]:
# Export JAVA_HOME and SPARK_HOME for the current process.
# The paths assume a Colab-style layout (JDK under /usr/lib/jvm, Spark
# unpacked in /content) -- adjust when running elsewhere.
import os

os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.1.1-bin-hadoop3.2",
})

In [5]:
# Start (or reuse) a local Spark session.
import findspark
# NOTE(review): findspark.init() has to run before the pyspark import below;
# presumably it locates Spark through the SPARK_HOME set earlier -- confirm.
findspark.init()
from pyspark.sql import SparkSession
# "local[*]" runs Spark on this machine instead of connecting to a cluster.
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Eager evaluation: a DataFrame at the end of a cell is rendered as a table
# Display the session object as the cell output.
spark

In [6]:
# Read the CSV into a Spark DataFrame: the first line is the header and
# column types are inferred from the data.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('Latest Covid-19 India Status.csv')
)

In [7]:
# Preview the freshly loaded Spark DataFrame (rendered as a table because
# eager evaluation was enabled on the session).
df

State/UTs,Total Cases,Active,Discharged,Deaths,Active Ratio,Discharge Ratio,Death Ratio,Population
Andaman and Nicobar,10034,0,9905,129,0.0,98.71,1.29,100896618
Andhra Pradesh,2319645,43,2304872,14730,0.0,99.36,0.64,128500364
Arunachal Pradesh,64495,8,64191,296,0.01,99.53,0.46,658019
Assam,724200,1349,716212,6639,0.19,98.9,0.92,290492
Bihar,830506,12,818238,12256,0.0,98.52,1.48,40100376
Chandigarh,91963,22,90776,1165,0.02,98.71,1.27,79502477
Chhattisgarh,1152217,12,1138171,14034,0.0,98.78,1.22,28900667
Dadra and Nagar H...,11441,0,11437,4,0.0,99.97,0.03,231502578
Delhi,1868550,1518,1840872,26160,0.08,98.52,1.4,773997
Goa,245359,17,241510,3832,0.01,98.43,1.56,3772103


In [8]:
from pyspark.sql import functions as F

# Alias every column to its own name with spaces replaced by underscores
# (e.g. "Total Cases" -> "Total_Cases"); column order is unchanged.
renamed_df = df.select(
    *(F.col(name).alias(name.replace(' ', '_')) for name in df.columns)
)

In [9]:
# Check the result of the rename: spaces in the headers are now underscores.
renamed_df

State/UTs,Total_Cases,Active,Discharged,Deaths,Active_Ratio,Discharge_Ratio,Death_Ratio,Population
Andaman and Nicobar,10034,0,9905,129,0.0,98.71,1.29,100896618
Andhra Pradesh,2319645,43,2304872,14730,0.0,99.36,0.64,128500364
Arunachal Pradesh,64495,8,64191,296,0.01,99.53,0.46,658019
Assam,724200,1349,716212,6639,0.19,98.9,0.92,290492
Bihar,830506,12,818238,12256,0.0,98.52,1.48,40100376
Chandigarh,91963,22,90776,1165,0.02,98.71,1.27,79502477
Chhattisgarh,1152217,12,1138171,14034,0.0,98.78,1.22,28900667
Dadra and Nagar H...,11441,0,11437,4,0.0,99.97,0.03,231502578
Delhi,1868550,1518,1840872,26160,0.08,98.52,1.4,773997
Goa,245359,17,241510,3832,0.01,98.43,1.56,3772103


In [10]:
# The space-to-underscore pass leaves 'State/UTs' untouched; rename it too so
# the SQL cells can refer to it as the plain identifier State_UTs.
renamed_df = renamed_df.withColumnRenamed('State/UTs','State_UTs')


In [11]:
# Register the DataFrame as the temporary view "Data" so it can be queried
# with spark.sql(...); an existing view of that name is replaced.
renamed_df.createOrReplaceTempView('Data')

In [12]:
# Sanity check: the view is queryable and exposes the renamed columns.
spark.sql('SELECT * FROM Data')

State_UTs,Total_Cases,Active,Discharged,Deaths,Active_Ratio,Discharge_Ratio,Death_Ratio,Population
Andaman and Nicobar,10034,0,9905,129,0.0,98.71,1.29,100896618
Andhra Pradesh,2319645,43,2304872,14730,0.0,99.36,0.64,128500364
Arunachal Pradesh,64495,8,64191,296,0.01,99.53,0.46,658019
Assam,724200,1349,716212,6639,0.19,98.9,0.92,290492
Bihar,830506,12,818238,12256,0.0,98.52,1.48,40100376
Chandigarh,91963,22,90776,1165,0.02,98.71,1.27,79502477
Chhattisgarh,1152217,12,1138171,14034,0.0,98.78,1.22,28900667
Dadra and Nagar H...,11441,0,11437,4,0.0,99.97,0.03,231502578
Delhi,1868550,1518,1840872,26160,0.08,98.52,1.4,773997
Goa,245359,17,241510,3832,0.01,98.43,1.56,3772103


In [13]:
# Number of rows with a non-null State_UTs value (i.e. states/UTs covered).
spark.sql('SELECT COUNT(State_UTs) FROM Data')

count(State_UTs)
36


In [14]:
# The five states/UTs with the highest number of active cases.
spark.sql('SELECT * FROM Data order by Active desc limit 5')

State_UTs,Total_Cases,Active,Discharged,Deaths,Active_Ratio,Discharge_Ratio,Death_Ratio,Population
Kerala,6537361,2466,6466280,68615,0.04,98.91,1.05,91702478
Delhi,1868550,1518,1840872,26160,0.08,98.52,1.4,773997
Karnataka,3946369,1506,3904806,40057,0.04,98.95,1.02,1711947
Assam,724200,1349,716212,6639,0.19,98.9,0.92,290492
Haryana,986996,911,975467,10618,0.09,98.83,1.08,14999397


In [15]:
# The five states/UTs with the highest death ratio.
spark.sql('SELECT * FROM Data ORDER BY Death_Ratio DESC LIMIT 5')

State_UTs,Total_Cases,Active,Discharged,Deaths,Active_Ratio,Discharge_Ratio,Death_Ratio,Population
Punjab,759255,57,741455,17743,0.01,97.66,2.34,34698876
Nagaland,35487,8,34720,759,0.02,97.84,2.14,38157311
Maharashtra,7875845,646,7727372,147827,0.01,98.11,1.88,399001
Uttarakhand,437322,384,429246,7692,0.09,98.15,1.76,85002417
Meghalaya,93787,4,92190,1593,0.0,98.3,1.7,30501026


In [16]:
# Totals over all rows: cumulative cases and cumulative deaths.
spark.sql('SELECT SUM(Total_Cases),SUM(Deaths) FROM Data')

sum(Total_Cases),sum(Deaths)
43044280,521965


In [18]:
# Collect the Spark DataFrame into a pandas DataFrame for the correlation and
# plotly cells below. This rebinds `dataset`, replacing the frame read with
# pd.read_csv earlier, so the columns now carry the underscore names.
dataset = renamed_df.toPandas()

In [19]:
# Pairwise correlations between the numeric columns.
# The text column (State_UTs) is excluded explicitly: pandas >= 2.0 no longer
# skips non-numeric columns in DataFrame.corr() by default and raises on them.
# select_dtypes works on older pandas as well, with the same result there.
dataset.select_dtypes(include='number').corr()

Unnamed: 0,Total_Cases,Active,Discharged,Deaths,Active_Ratio,Discharge_Ratio,Death_Ratio,Population
Total_Cases,1.0,0.629028,0.999987,0.942794,-0.036282,-0.122253,0.125231,-0.065787
Active,0.629028,1.0,0.630518,0.483329,0.468054,-0.031773,-0.017958,-0.120763
Discharged,0.999987,0.630518,1.0,0.941101,-0.036173,-0.120042,0.123041,-0.064887
Deaths,0.942794,0.483329,0.941101,1.0,-0.051543,-0.259307,0.261865,-0.118593
Active_Ratio,-0.036282,0.468054,-0.036173,-0.051543,1.0,0.063912,-0.170969,-0.221517
Discharge_Ratio,-0.122253,-0.031773,-0.120042,-0.259307,0.063912,1.0,-0.994163,0.222523
Death_Ratio,0.125231,-0.017958,0.123041,0.261865,-0.170969,-0.994163,1.0,-0.196086
Population,-0.065787,-0.120763,-0.064887,-0.118593,-0.221517,0.222523,-0.196086,1.0


In [20]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

In [23]:
# 2x2 grid of histograms (nbinsx=20), one panel per count column.
panel_columns = ['Total_Cases', 'Deaths', 'Active', 'Discharged']
panel_cells = [(1, 1), (1, 2), (2, 1), (2, 2)]

fig = make_subplots(rows=2, cols=2, subplot_titles=panel_columns)
total_cases, deaths, active, discharged = [
    go.Histogram(x=dataset[column], nbinsx=20, name=column)
    for column in panel_columns
]
# Panels are filled left-to-right, top-to-bottom, in the same order as the titles.
for trace, (row, col) in zip([total_cases, deaths, active, discharged], panel_cells):
    fig.add_trace(trace, row, col)
fig.update_layout(showlegend=False)
fig.show()

In [27]:
# 2x2 grid of horizontal box plots, one panel per count column; every point
# carries its state/UT name as hover text.
#
# Trace names and grid positions are derived from the same column list as the
# subplot titles, so each panel shows the series its title announces. The
# previous version added the Active trace under the "Deaths" title, never
# plotted Deaths, and mislabelled two traces.
panel_columns = ['Total_Cases', 'Deaths', 'Active', 'Discharged']
panel_cells = [(1, 1), (1, 2), (2, 1), (2, 2)]

fig = make_subplots(rows=2, cols=2, subplot_titles=panel_columns)
total_cases, deaths, active, discharged = [
    go.Box(x=dataset[column], name=column, text=dataset['State_UTs'])
    for column in panel_columns
]
# Panels are filled left-to-right, top-to-bottom, in the same order as the titles.
for trace, (row, col) in zip([total_cases, deaths, active, discharged], panel_cells):
    fig.add_trace(trace, row, col)
fig.update_layout(showlegend=False)
fig.show()

In [28]:
# 2x2 grid of bar charts, one panel per count column; every bar carries its
# state/UT name as hover text.
#
# Trace names and grid positions are derived from the same column list as the
# subplot titles, so each panel shows the series its title announces. The
# previous version added the Active trace under the "Deaths" title, never
# plotted Deaths, and mislabelled two traces.
panel_columns = ['Total_Cases', 'Deaths', 'Active', 'Discharged']
panel_cells = [(1, 1), (1, 2), (2, 1), (2, 2)]

fig = make_subplots(rows=2, cols=2, subplot_titles=panel_columns)
total_cases, deaths, active, discharged = [
    go.Bar(y=dataset[column], name=column, hovertext=dataset['State_UTs'])
    for column in panel_columns
]
# Panels are filled left-to-right, top-to-bottom, in the same order as the titles.
for trace, (row, col) in zip([total_cases, deaths, active, discharged], panel_cells):
    fig.add_trace(trace, row, col)
fig.update_layout(showlegend=False)
fig.show()

In [29]:
# Grouped bars comparing total cases with deaths for every state/UT.
# Both traces are named so the legend identifies the series instead of
# showing plotly's default "trace 0" / "trace 1".
fig = go.Figure([
    go.Bar(y=dataset['Total_Cases'], name='Total Cases', hovertext=dataset['State_UTs']),
    go.Bar(y=dataset['Deaths'], name='Deaths', hovertext=dataset['State_UTs']),
])
fig.update_layout(barmode='group', title='Total Cases VS Deaths')
fig.show()

In [30]:
# Grouped bars comparing total cases with discharged patients for every
# state/UT. The title now names the series actually plotted (the second trace
# is Discharged, not Deaths), and both traces are named so the legend is
# meaningful.
fig = go.Figure([
    go.Bar(y=dataset['Total_Cases'], name='Total Cases', hovertext=dataset['State_UTs']),
    go.Bar(y=dataset['Discharged'], name='Discharged', hovertext=dataset['State_UTs']),
])
fig.update_layout(barmode='group', title='Total Cases VS Discharged')
fig.show()

In [34]:
# 2x2 grid of pie charts: each state/UT's share of total cases, deaths,
# active cases and discharged patients. The subplot specs use type 'domain'
# for all four cells, which hold the Pie traces.
fig = make_subplots(rows=2, cols=2,
                    subplot_titles=['Total Cases', 'Deaths', 'Active', 'Discharged'],
                    specs=[[{'type':'domain'}, {'type':'domain'}],
                           [{'type':'domain'}, {'type':'domain'}]])

total_cases = go.Pie(values=dataset['Total_Cases'], name='Total Cases', labels=dataset['State_UTs'])
deaths = go.Pie(values=dataset['Deaths'], name='Deaths', labels=dataset['State_UTs'])
active = go.Pie(values=dataset['Active'], name='Active', labels=dataset['State_UTs'])
discharged = go.Pie(values=dataset['Discharged'], name='Discharged', labels=dataset['State_UTs'])

fig.add_trace(total_cases, 1, 1)
fig.add_trace(deaths, 1, 2)
fig.add_trace(active, 2, 1)
fig.add_trace(discharged, 2, 2)

# Hover shows label + percentage; slice text is placed inside the slice.
# Both settings are applied in a single pass over the traces.
fig.update_traces(hoverinfo='percent+label', textposition='inside')
fig.update_layout(showlegend=False)

# make_subplots already returns a Figure, so it is shown directly rather
# than being re-wrapped in go.Figure(...) first.
fig.show()

In [35]:
# Scatter of deaths against total cases, one marker per state/UT; the
# state/UT name is attached to each marker as its text.
deaths_vs_cases = go.Scatter(
    x=dataset['Total_Cases'],
    y=dataset['Deaths'],
    mode='markers',
    text=dataset['State_UTs'],
)
fig = go.Figure([deaths_vs_cases])
fig.update_layout(
    title='Deaths vs Total Cases',
    xaxis_title='Total Cases',
    yaxis_title='Deaths',
)
fig.show()