In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
from datetime import datetime

In [5]:
## Reading the data in
df = pd.read_csv("covid_19_india.csv")

# take a look at the dataset
df.head()

Unnamed: 0,Sno,Date,Time,State/UnionTerritory,ConfirmedIndianNational,ConfirmedForeignNational,Cured,Deaths,Confirmed
0,1,2020-01-30,6:00 PM,Kerala,1,0,0,0,1
1,2,2020-01-31,6:00 PM,Kerala,1,0,0,0,1
2,3,2020-02-01,6:00 PM,Kerala,2,0,0,0,2
3,4,2020-02-02,6:00 PM,Kerala,3,0,0,0,3
4,5,2020-02-03,6:00 PM,Kerala,3,0,0,0,3


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16850 entries, 0 to 16849
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Sno                       16850 non-null  int64 
 1   Date                      16850 non-null  object
 2   Time                      16850 non-null  object
 3   State/UnionTerritory      16850 non-null  object
 4   ConfirmedIndianNational   16850 non-null  object
 5   ConfirmedForeignNational  16850 non-null  object
 6   Cured                     16850 non-null  int64 
 7   Deaths                    16850 non-null  int64 
 8   Confirmed                 16850 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 1.2+ MB


In [7]:
df.describe()

Unnamed: 0,Sno,Cured,Deaths,Confirmed
count,16850.0,16850.0,16850.0,16850.0
mean,8425.5,236035.3,3485.222552,258366.7
std,4864.320353,522543.8,9330.541749,567280.8
min,1.0,0.0,0.0,0.0
25%,4213.25,2658.5,22.0,3644.75
50%,8425.5,28895.0,453.0,33361.5
75%,12637.75,253751.0,3071.25,266653.0
max,16850.0,5872268.0,123531.0,6113335.0


In [8]:
state_wise = df.groupby('State/UnionTerritory')['Confirmed','Cured','Deaths'].sum().reset_index()
state_wise["Death_percentage"] = ((state_wise["Deaths"] / state_wise["Confirmed"]) * 100)
state_wise.style.background_gradient(cmap='magma')


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,State/UnionTerritory,Confirmed,Cured,Deaths,Death_percentage
0,Andaman and Nicobar Islands,1675248,1589935,22624,1.350487
1,Andhra Pradesh,324146783,303427899,2475816,0.763795
2,Arunachal Pradesh,5598324,5150519,19303,0.3448
3,Assam,80418492,74011348,459575,0.571479
4,Bihar,106881540,100131380,756282,0.707589
5,Bihar****,1430909,1402468,18881,1.319511
6,Cases being reassigned to states,345565,0,0,0.0
7,Chandigarh,8691806,7980284,119356,1.373201
8,Chhattisgarh,128751782,117163544,1591126,1.235809
9,Dadra and Nagar Haveli,20722,20352,8,0.038606


In [9]:
px.bar(x=state_wise.nlargest(10,"Confirmed")["State/UnionTerritory"],
       y = state_wise.nlargest(10,"Confirmed")["Confirmed"],
       color_discrete_sequence=px.colors.diverging.Picnic,
       title="Top 10 states with highest number of Confirmed cases")

In [10]:
px.bar(x=state_wise.nlargest(10,"Cured")["State/UnionTerritory"],
       y = state_wise.nlargest(10,"Cured")["Cured"],
       color_discrete_sequence=px.colors.sequential.Sunset,
       title="Top 10 states with highest number of Cured cases")

In [11]:
px.bar(x=state_wise.nlargest(10,"Deaths")["State/UnionTerritory"],
       y = state_wise.nlargest(10,"Deaths")["Deaths"],
       color_discrete_sequence=px.colors.diverging.curl,
       title="Top 10 states with highest number of Deaths")

In [12]:
px.bar(x=state_wise.nlargest(10,"Death_percentage")["State/UnionTerritory"],
       y = state_wise.nlargest(10,"Death_percentage")["Death_percentage"],
       color_discrete_sequence=px.colors.diverging.Portland,
       title="Top 10 states with highest of Death percentage")

### Exploratory Data Analysis of StatewiseTestingDetails Dataset

In [14]:
covid_testing = pd.read_csv("StatewiseTestingDetails.csv")
covid_testing['Date'] = covid_testing['Date'].astype('datetime64[ns]')
covid_testing.head()

Unnamed: 0,Date,State,TotalSamples,Negative,Positive
0,2020-04-17,Andaman and Nicobar Islands,1403.0,1210.0,12.0
1,2020-04-24,Andaman and Nicobar Islands,2679.0,,27.0
2,2020-04-27,Andaman and Nicobar Islands,2848.0,,33.0
3,2020-05-01,Andaman and Nicobar Islands,3754.0,,33.0
4,2020-05-16,Andaman and Nicobar Islands,6677.0,,33.0


In [15]:
covid_testing.shape

(15113, 5)

In [16]:
covid_testing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15113 entries, 0 to 15112
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          15113 non-null  datetime64[ns]
 1   State         15113 non-null  object        
 2   TotalSamples  15113 non-null  float64       
 3   Negative      6657 non-null   object        
 4   Positive      5589 non-null   float64       
dtypes: datetime64[ns](1), float64(2), object(2)
memory usage: 590.5+ KB


In [17]:
covid_testing['Negative'] = covid_testing['TotalSamples'] - covid_testing['Positive']
covid_testing = covid_testing.dropna()
covid_testing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5589 entries, 0 to 14794
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Date          5589 non-null   datetime64[ns]
 1   State         5589 non-null   object        
 2   TotalSamples  5589 non-null   float64       
 3   Negative      5589 non-null   float64       
 4   Positive      5589 non-null   float64       
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 262.0+ KB


In [18]:
covid_testing_state = covid_testing.groupby('State')['TotalSamples','Negative','Positive'].max().reset_index()
covid_testing_state["Positive_percentage"] = ((covid_testing["Positive"] / covid_testing["TotalSamples"]) * 100)
covid_testing_state.style.background_gradient(cmap='gist_earth_r')


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,State,TotalSamples,Negative,Positive,Positive_percentage
0,Andaman and Nicobar Islands,414166.0,406684.0,7482.0,0.85531
1,Andhra Pradesh,2534304.0,2298779.0,235525.0,1.007839
2,Arunachal Pradesh,122854.0,120196.0,2658.0,1.158708
3,Assam,1930764.0,1842856.0,87908.0,0.879062
4,Bihar,1272980.0,1182427.0,90553.0,0.494234
5,Chandigarh,22198.0,19893.0,2305.0,0.473798
6,Chhattisgarh,468099.0,448640.0,19459.0,0.46597
7,Dadra and Nagar Haveli and Daman and Diu,72410.0,69216.0,3194.0,0.460444
8,Delhi,1291411.0,1139483.0,151928.0,0.454358
9,Goa,178175.0,176827.0,12333.0,0.450389


In [19]:
px.bar(x=covid_testing_state.nlargest(10,"TotalSamples")["State"],
       y = covid_testing_state.nlargest(10,"TotalSamples")["TotalSamples"],
       labels={'y':'Total Samples','x':'State'},
       color_discrete_sequence=px.colors.sequential.haline,
       title="Top 10 states with highest number of Total Samples")

In [20]:
px.bar(x=covid_testing_state.nlargest(10,"Negative")["State"],
       y = covid_testing_state.nlargest(10,"Negative")["Negative"],
       labels={'y':'Total Negative cases','x':'State'},
       color_discrete_sequence=px.colors.sequential.turbid,
       title="Top 10 states with highest number of Negative cases")

In [21]:
px.bar(x=covid_testing_state.nlargest(10,"Positive")["State"],
       y = covid_testing_state.nlargest(10,"Positive")["Positive"],
       labels={'y':'Total Positive Cases','x':'State'},
       color_discrete_sequence=px.colors.sequential.solar,
       title="Top 10 states with highest number of Positive cases")

In [22]:
px.bar(x=covid_testing_state.nlargest(10,"Positive_percentage")["State"],
       y = covid_testing_state.nlargest(10,"Positive_percentage")["Positive_percentage"],
       labels={'y':'Positive Percentage','x':'State'},
       color_discrete_sequence=px.colors.sequential.Aggrnyl,
       title="Top 10 states with highest Positive percentage",
       height = 420)

## Vaccination Analysis

In [24]:
vaccination = pd.read_csv(r"covid_vaccine_statewise.csv")
vaccination.head()

Unnamed: 0,Updated On,State,Total Doses Administered,Total Sessions Conducted,Total Sites,First Dose Administered,Second Dose Administered,Male(Individuals Vaccinated),Female(Individuals Vaccinated),Transgender(Individuals Vaccinated),Total Covaxin Administered,Total CoviShield Administered,Total Sputnik V Administered,AEFI,18-45 years (Age),45-60 years (Age),60+ years (Age),Total Individuals Vaccinated
0,16/01/2021,India,48276.0,3455.0,2957.0,48276.0,0.0,23757.0,24517.0,2.0,579.0,47697.0,,,,,,48276.0
1,17/01/2021,India,58604.0,8532.0,4954.0,58604.0,0.0,27348.0,31252.0,4.0,635.0,57969.0,,,,,,58604.0
2,18/01/2021,India,99449.0,13611.0,6583.0,99449.0,0.0,41361.0,58083.0,5.0,1299.0,98150.0,,,,,,99449.0
3,19/01/2021,India,195525.0,17855.0,7951.0,195525.0,0.0,81901.0,113613.0,11.0,3017.0,192508.0,,,,,,195525.0
4,20/01/2021,India,251280.0,25472.0,10504.0,251280.0,0.0,98111.0,153145.0,24.0,3946.0,247334.0,,,,,,251280.0


In [26]:
vaccination.shape

(6699, 18)

In [27]:
vaccination.isnull().sum()

Updated On                                0
State                                     0
Total Doses Administered                335
Total Sessions Conducted                336
Total Sites                             336
First Dose Administered                 336
Second Dose Administered                336
Male(Individuals Vaccinated)            336
Female(Individuals Vaccinated)          336
Transgender(Individuals Vaccinated)     336
Total Covaxin Administered              336
Total CoviShield Administered           336
Total Sputnik V Administered           4962
AEFI                                   2519
18-45 years (Age)                      2522
45-60 years (Age)                      2521
60+ years (Age)                        2521
Total Individuals Vaccinated            336
dtype: int64

In [28]:
vaccination = vaccination.drop(columns=['Total Sputnik V Administered','AEFI', '18-45 years (Age)','45-60 years (Age)','60+ years (Age)'], axis=1)

In [29]:
vaccination.head()

Unnamed: 0,Updated On,State,Total Doses Administered,Total Sessions Conducted,Total Sites,First Dose Administered,Second Dose Administered,Male(Individuals Vaccinated),Female(Individuals Vaccinated),Transgender(Individuals Vaccinated),Total Covaxin Administered,Total CoviShield Administered,Total Individuals Vaccinated
0,16/01/2021,India,48276.0,3455.0,2957.0,48276.0,0.0,23757.0,24517.0,2.0,579.0,47697.0,48276.0
1,17/01/2021,India,58604.0,8532.0,4954.0,58604.0,0.0,27348.0,31252.0,4.0,635.0,57969.0,58604.0
2,18/01/2021,India,99449.0,13611.0,6583.0,99449.0,0.0,41361.0,58083.0,5.0,1299.0,98150.0,99449.0
3,19/01/2021,India,195525.0,17855.0,7951.0,195525.0,0.0,81901.0,113613.0,11.0,3017.0,192508.0,195525.0
4,20/01/2021,India,251280.0,25472.0,10504.0,251280.0,0.0,98111.0,153145.0,24.0,3946.0,247334.0,251280.0


In [30]:
male = vaccination["Male(Individuals Vaccinated)"].sum() 
female = vaccination["Female(Individuals Vaccinated)"].sum()  
trans = vaccination["Transgender(Individuals Vaccinated)"].sum()
px.pie(names=["Male Vaccinated","Female Vaccinated","Trans Gender"],values=[male,female,trans],title="Male and Female Vaccinated ratio for Covid19")

In [31]:
Covaxin = vaccination["Total Covaxin Administered"].sum() 
Covishield = vaccination["Total CoviShield Administered"].sum()  
px.pie(names=["Covaxin Vaccinated","Covishield Vaccinated"],values=[Covaxin,Covishield],title="Covaxin and Covishield Vaccination")

In [32]:
Doses = vaccination["Total Doses Administered"].sum() 
Vaccinated = vaccination["Total Individuals Vaccinated"].sum()  
px.pie(names=["Doses Administered","People Vaccinated"],values=[Doses,Vaccinated],title="Doses administered vs People Vaccinated")