# Covid Data

In [1]:
# Importing modules

# Keeping only required Columns
# cols = [df1.columns.sort_values(), df2.columns.sort_values(), 
#         df3.columns.sort_values(), df4.columns.sort_values(), df5.columns.sort_values(), df6.columns.sort_values()]
# cols = list(set(cols[0]).intersection(*cols))

import pandas as pd
import numpy as np
import json
from datetime import datetime
import plotly.graph_objects as go
import pickle

In [2]:
# Arranging old formatted data : df1, df2
# dr = pd.json_normalize(pd.read_json('https://api.covid19india.org/deaths_recoveries.json')['deaths_recoveries'])
# dr = dr[['date', 'gender', 'statecode', 'state', 'district', 'city', 'nationality', 'patientstatus']].rename(
#         columns={'date':'dateannounced', 'state':'detectedstate', 'district':'detecteddistrict', 
#                  'city':'detectedcity', 'patientstatus':'currentstatus'}
#     )
# dr['numcases'] = 1
# dr = dr[cols]

# prev_data = pd.concat([df1[cols], df2[cols], dr[cols]])
# prev_data['numcases'] = 1
# prev_data.shape

In [20]:
# Functions
def url(i):
    return "https://api.covid19india.org/raw_data"+str(i)+".json"

def get_df(i):
    return pd.json_normalize(pd.read_json(url(i))['raw_data'])

def make_int(x):
    try:
        return int(x)
    except:
        return 0

def save_data(data):
    with open('data/data.pkl', 'wb') as f:
        pickle.dump(data, f)
    
# Loading already saved data
with open('data/data.pkl', 'rb') as f:
    data = pickle.load(f)
    
cols = ['dateannounced', 'numcases', 'statecode', 'detectedstate', 'detecteddistrict', 'detectedcity', 'nationality', 'currentstatus']

In [21]:
# Get latest, data has empty rows at the end, Removing them
live_df = get_df(7)

live_df.replace("", float("NaN"), inplace=True)
live_df = live_df[cols].dropna(subset=['dateannounced', 'numcases'])
live_df['dateannounced'] = live_df['dateannounced'].apply(lambda x : datetime.strptime(x, "%d/%m/%Y"))

# Merge Live data with old data and arrange
data = pd.concat([data, live_df[cols]])

data.index = range(data.shape[0])
data = data.sort_values(by='dateannounced')
data = data[data['currentstatus'].isin(['Hospitalized', 'Recovered', 'Deceased'])]
data['numcases'] = data['numcases'].apply(lambda x:make_int(x))
data.shape

(118032, 8)

In [22]:
# Separating active, recovered, deceased cases
h_data = data[data['currentstatus']=='Hospitalized']
r_data = data[data['currentstatus']=='Recovered']
d_data = data[data['currentstatus']=='Deceased']

## Daily cases Total

In [149]:
# Datewise number and cumsum
def date_wise(df, group_by_col='dateannounced', num_case='numcases'):
    h_num = df.groupby(by=group_by_col)[[num_case]].sum()
    h_num['total_cases'] = h_num[num_case].cumsum()
    return h_num

h_num = date_wise(h_data)
r_num = date_wise(r_data)
d_num = date_wise(d_data)
h_num.shape, r_num.shape, d_num.shape

((112, 2), (117, 2), (100, 2))

In [266]:
# Plot daily case

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=h_num.index,
        y=h_num.numcases,
        name='Active Cases'
    ))
fig.add_trace(
    go.Scatter(
        x=r_num.index,
        y=r_num.numcases,
        name='Recovered Cases'
    ))
fig.add_trace(
    go.Scatter(
        x=d_num.index,
        y=d_num.numcases,
        name='Deceased Cases'
    ))
fig.update_layout(title="Daily Cases of COVID")
fig.show()
fig.write_html("plots/daily_cases_total.html")

In [28]:
# Plot Cumulative case

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=h_num.index,
        y=h_num.total_cases,
        name='Active Cases'
    ))
fig.add_trace(
    go.Scatter(
        x=r_num.index,
        y=r_num.total_cases,
        name='Recovered Cases'
    ))
fig.add_trace(
    go.Scatter(
        x=d_num.index,
        y=d_num.total_cases,
        name='Deceased Cases'
    ))
fig.update_layout(title="Cumulative Cases of COVID")
fig.show()
fig.write_html("plots/cumulative_cases_total.html")

## Statewise

In [52]:
state_date_data = h_data.groupby(by=['statecode', 'dateannounced'])[['numcases']].sum()
state_time = state_date_data.loc['OR']
state_time['total_cases'] = state_time['numcases'].cumsum()
state_time

Unnamed: 0_level_0,numcases,total_cases
dateannounced,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-16,1,1
2020-03-19,1,2
2020-03-26,1,3
2020-03-31,1,4
2020-04-01,1,5
...,...,...
2020-06-21,304,5159
2020-06-22,143,5302
2020-06-23,167,5469
2020-06-24,282,5751


In [30]:
def add_active(full_data,  num_case='numcases'):
    full_data = full_data.rename(columns={num_case:'Confirmed'})
    full_data.replace(float("NaN"), 0, inplace=True)

    full_data['Recovered'] = full_data['Recovered'].apply(int)
    full_data['Deceased'] = full_data['Deceased'].apply(int)

    full_data['Active'] = full_data['Confirmed'] - full_data['Recovered'] - full_data['Deceased']
    return full_data

def combine_data(group_by, num_case='numcases'):
    full_data = h_data.groupby(by=group_by)[[num_case]].sum()
    full_data['Recovered'] = r_data.groupby(by=group_by)[[num_case]].sum()
    full_data['Deceased'] = d_data.groupby(by=group_by)[[num_case]].sum()

    return add_active(full_data, num_case)

state_data = combine_data('detectedstate')
state_data[1:]

Unnamed: 0_level_0,Confirmed,Recovered,Deceased,Active
detectedstate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Andaman and Nicobar Islands,47,52,0,-5
Andhra Pradesh,10888,4990,136,5762
Arunachal Pradesh,172,42,1,129
Assam,6647,4034,9,2604
Bihar,8487,6480,57,1950
Chandigarh,423,329,6,88
Chhattisgarh,2448,1737,12,699
Dadra and Nagar Haveli and Daman and Diu,145,32,0,113
Delhi,73776,44767,2430,26579
Goa,995,335,2,658


### Statistics based on population

## District Wise

In [31]:
h_dist = h_data.groupby(by=['statecode', 'detecteddistrict'])[['numcases']].sum()
r_dist = r_data.groupby(by=['statecode', 'detecteddistrict'])[['numcases']].sum()
d_dist = d_data.groupby(by=['statecode', 'detecteddistrict'])[['numcases']].sum()

In [33]:
def dist_data(h_dist, r_dist, d_dist, statecode, num_case='numcases'):
    dist_data = h_dist.loc[statecode]
    dist_data['Recovered'] = r_dist.loc[statecode]
    dist_data['Deceased'] = d_dist.loc[statecode]
    
    dist_data = add_active(dist_data, num_case)
    return dist_data

district_data = dist_data(h_dist, r_dist, d_dist, 'OR', 'numcases')
district_data

Unnamed: 0_level_0,Confirmed,Recovered,Deceased,Active
detecteddistrict,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Angul,58,47,0,11
Balangir,165,143,0,22
Balasore,325,222,1,102
Bargarh,68,47,1,20
Bhadrak,200,182,0,18
Boudh,40,38,0,2
Cuttack,496,404,4,88
Deogarh,42,40,0,2
Dhenkanal,73,53,0,20
Gajapati,384,121,0,263


# Testing

## India Testing

In [143]:
list(zip(icmr_test['Total Samples Tested'].tolist(), icmr_test.daily_test.tolist()))

[(6500.0, 6500.0),
 (13125.0, 6625.0),
 (13316.0, 191.0),
 (14175.0, 859.0),
 (14376.0, 201.0),
 (15404.0, 1028.0),
 (15701.0, 297.0),
 (16911.0, 1210.0),
 (16999.0, 88.0),
 (18127.0, 1128.0),
 (18383.0, 256.0),
 (20707.0, 2324.0),
 (22694.0, 1987.0),
 (20864.0, -1830.0),
 (22928.0, 2064.0),
 (25144.0, 2216.0),
 (27688.0, 2544.0),
 (38442.0, 10754.0),
 (42788.0, 4346.0),
 (47951.0, 5163.0),
 (55851.0, 7900.0),
 (56680.0, 829.0),
 (69245.0, 12565.0),
 (79950.0, 10705.0),
 (89534.0, 9584.0),
 (101068.0, 11534.0),
 (114015.0, 12947.0),
 (127919.0, 13904.0),
 (144910.0, 16991.0),
 (161330.0, 16420.0),
 (179374.0, 18044.0),
 (195748.0, 16374.0),
 (217554.0, 21806.0),
 (244893.0, 27339.0),
 (274599.0, 29706.0),
 (302956.0, 28357.0),
 (335123.0, 32167.0),
 (372123.0, 37000.0),
 (401586.0, 29463.0),
 (462621.0, 61035.0),
 (500542.0, 37921.0),
 (541789.0, 41247.0),
 (579957.0, 38168.0),
 (625309.0, 45352.0),
 (665819.0, 40510.0),
 (716733.0, 50914.0),
 (770764.0, 54031.0),
 (830201.0, 59437.0),

In [142]:
icmr_test =pd.read_csv('https://api.covid19india.org/csv/latest/tested_numbers_icmr_data.csv')[['Tested As Of',
                                                                                'Total Samples Tested']]
icmr_test['Tested As Of'] = icmr_test['Tested As Of'].apply(str)
icmr_test.replace("nan", float("NaN"), inplace=True)
icmr_test = icmr_test.dropna(subset=['Tested As Of', 'Total Samples Tested'])
icmr_test['Tested As Of'] = icmr_test['Tested As Of'].apply(lambda x : datetime.strptime(x, "%d/%m/%Y"))
icmr_test = icmr_test.sort_values(by='Tested As Of')[['Tested As Of', 'Total Samples Tested']]
icmr_test.index = icmr_test['Tested As Of']
icmr_test.index.name = 'dateannounced'
del icmr_test['Tested As Of']
icmr_test['daily_test'] = icmr_test['Total Samples Tested'].diff().fillna(
                                            icmr_test['Total Samples Tested'])
icmr_test.head()

Unnamed: 0_level_0,Total Samples Tested,daily_test
dateannounced,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-03-13,6500.0,6500.0
2020-03-18,13125.0,6625.0
2020-03-19,13316.0,191.0
2020-03-19,14175.0,859.0
2020-03-20,14376.0,201.0


In [38]:
# Plot daily case

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=icmr_test['Tested As Of'],
        y=icmr_test['Total Samples Tested'],
        name='Testing per day'
    ))

fig.update_layout(title="Daily Testing of COVID Cases in India")
fig.show()
fig.write_html("plots/daily_testing_total.html")

## State Wise Testing

In [39]:
state_test = pd.read_csv('https://api.covid19india.org/csv/latest/statewise_tested_numbers_data.csv')
state_test['Updated On'] = state_test['Updated On'].apply(lambda x : datetime.strptime(x, "%d/%m/%Y"))
state_test.head()

Unnamed: 0,Updated On,State,Total Tested,Tag (Total Tested),Positive,Negative,Unconfirmed,Cumulative People In Quarantine,Total People Currently in Quarantine,Tag (People in Quarantine),...,Num Calls State Helpline,Source1,Unnamed: 21,Source2,Unnamed: 23,Test positivity rate,Tests per thousand,Tests per million,Tests per positive case,Population NCP 2019 Projection
0,2020-04-17,Andaman and Nicobar Islands,1403.0,Samples Sent,12.0,1210.0,181.0,,,,...,,https://t.me/indiacovid/2550,,,,0.86%,3.53,3534.0,117.0,397000.0
1,2020-04-24,Andaman and Nicobar Islands,2679.0,Samples Sent,27.0,,246.0,,614.0,Institutional,...,280.0,https://t.me/indiacovid/3147?single,,,,1.01%,6.75,6748.0,99.0,397000.0
2,2020-04-27,Andaman and Nicobar Islands,2848.0,Samples Sent,33.0,,106.0,,724.0,Institutional,...,298.0,https://t.me/indiacovid/3365?single,,,,1.16%,7.17,7174.0,86.0,397000.0
3,2020-05-01,Andaman and Nicobar Islands,3754.0,Samples Sent,33.0,,199.0,,643.0,Institutional,...,340.0,https://t.me/indiacovid/3781,,,,0.88%,9.46,9456.0,114.0,397000.0
4,2020-05-16,Andaman and Nicobar Islands,6677.0,Samples Sent,33.0,,136.0,,16.0,Institutional,...,471.0,https://t.me/indiacovid/4925,,,,0.49%,16.82,16819.0,202.0,397000.0


In [41]:
st_test = state_test.groupby(by=['State', 'Updated On']).sum()

In [42]:
st_test.loc['Odisha'].sort_values(by='Updated On')[['Total Tested', 'Population NCP 2019 Projection']]

Unnamed: 0_level_0,Total Tested,Population NCP 2019 Projection
Updated On,Unnamed: 1_level_1,Unnamed: 2_level_1
2020-04-03,1395.0,43671000.0
2020-04-07,2441.0,43671000.0
2020-04-08,2441.0,43671000.0
2020-04-09,3249.0,43671000.0
2020-04-10,3547.0,43671000.0
...,...,...
2020-06-21,224402.0,43671000.0
2020-06-22,227860.0,43671000.0
2020-06-23,231356.0,43671000.0
2020-06-24,235627.0,0.0


In [43]:
# Plot daily case

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=st_test.loc['Odisha'].sort_values(by='Updated On')[['Total Tested', 'Population NCP 2019 Projection']].index,
        y=st_test.loc['Odisha'].sort_values(by='Updated On')[['Total Tested', 'Population NCP 2019 Projection']]['Total Tested'],
        name='Testing per day'
    ))

fig.update_layout(title="Daily Testing of COVID Cases in India")
fig.show()

# Timeline India with testing

In [150]:
h_num = date_wise(h_data)
r_num = date_wise(r_data)
d_num = date_wise(d_data)

india_df = pd.merge(h_num, r_num, on='dateannounced', how='outer', 
         suffixes=('_h', '')).sort_values(by='dateannounced')
india_df = pd.merge(india_df, d_num, on='dateannounced', how='outer', 
         suffixes=('_r', '_d')).sort_values(by='dateannounced')
india_df = pd.merge(india_df, icmr_test, on='dateannounced', how='outer').sort_values(by='dateannounced')
india_df

Unnamed: 0_level_0,numcases_h,total_cases_h,numcases_r,total_cases_r,numcases_d,total_cases_d,Total Samples Tested,daily_test
dateannounced,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-30,,,1.0,1.0,,,,
2020-02-02,,,1.0,2.0,,,,
2020-02-03,,,1.0,3.0,,,,
2020-03-02,,,2.0,5.0,,,,
2020-03-03,,,1.0,6.0,,,,
...,...,...,...,...,...,...,...,...
2020-06-21,15151.0,426965.0,9075.0,237476.0,426.0,13749.0,6950493.0,143267.0
2020-06-22,13560.0,440525.0,10879.0,248355.0,312.0,14061.0,7137716.0,187223.0
2020-06-23,15656.0,456181.0,10462.0,258817.0,468.0,14529.0,7352911.0,215195.0
2020-06-24,16868.0,473049.0,13089.0,271906.0,424.0,14953.0,7560782.0,207871.0


In [269]:
# Plot daily case
fig = go.Figure()

fig.add_trace(go.Scatter( x=india_df.index, y=india_df.numcases_h, name='Active Cases'))
fig.add_trace(go.Scatter( x=india_df.index, y=india_df.numcases_r, name='Recovered Cases'))
fig.add_trace(go.Scatter( x=india_df.index, y=india_df.numcases_d, name='Deceased Cases'))
fig.add_trace(go.Scatter( x=india_df.index, y=india_df.daily_test, name='Test Daily'))
# fig.add_trace(go.Scatter( x=india_df.index, y=india_df['Total Samples Tested'], name='Testing Numbers'))
fig.update_layout(title="Daily Cases of COVID")
fig.show()

In [151]:
# Plot cumulative case
fig = go.Figure()

fig.add_trace(go.Scatter( x=india_df.index, y=india_df.total_cases_h, name='Active Cases'))
fig.add_trace(go.Scatter( x=india_df.index, y=india_df.total_cases_r, name='Recovered Cases'))
fig.add_trace(go.Scatter( x=india_df.index, y=india_df.total_cases_d, name='Deceased Cases'))
fig.add_trace(go.Scatter( x=india_df.index, y=india_df['Total Samples Tested'], name='Testing Numbers'))
fig.update_layout(title="Daily Cases of COVID")
fig.show()

# Till date each state with testing

In [219]:
test = []
# Add test data for each state
for i in state_data.index:
    try:
        j = -1
        k = 0
        while k==0:
            k = st_test.loc[i]['Total Tested'].tail(1-j).values[j]
            j -= 1
        test.append(int(k))
    except:
        test.append(0)
state_data['total_test'] = test
state_data

Unnamed: 0_level_0,Confirmed,Recovered,Deceased,Active,total_test
detectedstate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,1,36,0,-35,0
Andaman and Nicobar Islands,47,52,0,-5,13994
Andhra Pradesh,10888,4990,136,5762,769319
Arunachal Pradesh,172,42,1,129,21274
Assam,6647,4034,9,2604,336091
Bihar,8487,6480,57,1950,181737
Chandigarh,423,329,6,88,7072
Chhattisgarh,2448,1737,12,699,142090
Dadra and Nagar Haveli and Daman and Diu,145,32,0,113,28726
Delhi,73776,44767,2430,26579,420707


# Timeline for each state with testing

In [220]:
state = 'Odisha'

In [239]:
def state_wise(data, state):
    state_tl = data.groupby(by=['detectedstate', 'dateannounced'])[['numcases']].sum().loc[state]
    state_tl['total_cases'] = state_tl['numcases'].cumsum()
    return state_tl
st_h = state_wise(h_data, state)
st_r = state_wise(r_data, state)
st_d = state_wise(d_data, state)

state_test = st_test.loc[state][['Total Tested']]
state_test['daily_test'] = state_test['Total Tested'].diff().fillna(
                                        state_test['Total Tested']).apply(int)
state_test.index.name = 'dateannounced'

In [240]:
state_df = pd.merge(st_h, st_r, on='dateannounced', how='outer', 
         suffixes=('_h', '')).sort_values(by='dateannounced')
state_df = pd.merge(state_df, st_d, on='dateannounced', how='outer', 
         suffixes=('_r', '_d')).sort_values(by='dateannounced')
state_df = pd.merge(state_df, state_test, on='dateannounced', how='outer').sort_values(by='dateannounced')
state_df

Unnamed: 0_level_0,numcases_h,total_cases_h,numcases_r,total_cases_r,numcases_d,total_cases_d,Total Tested,daily_test
dateannounced,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-16,1.0,1.0,,,,,,
2020-03-19,1.0,2.0,,,,,,
2020-03-26,1.0,3.0,,,,,,
2020-03-31,1.0,4.0,,,,,,
2020-04-01,1.0,5.0,1.0,1.0,,,,
...,...,...,...,...,...,...,...,...
2020-06-21,304.0,5159.0,186.0,3720.0,4.0,20.0,224402.0,4628.0
2020-06-22,143.0,5302.0,143.0,3863.0,2.0,22.0,227860.0,3458.0
2020-06-23,167.0,5469.0,125.0,3988.0,3.0,25.0,231356.0,3496.0
2020-06-24,282.0,5751.0,135.0,4123.0,,,235627.0,4271.0


In [241]:
# Plot daily case
fig = go.Figure()

fig.add_trace(go.Scatter( x=state_df.index, y=state_df.numcases_h, name='Active Cases'))
fig.add_trace(go.Scatter( x=state_df.index, y=state_df.numcases_r, name='Recovered Cases'))
fig.add_trace(go.Scatter( x=state_df.index, y=state_df.numcases_d, name='Deceased Cases'))
fig.add_trace(go.Scatter( x=state_df.index, y=state_df.daily_test, name='Test Daily'))
# fig.add_trace(go.Scatter( x=state_df.index, y=state_df['Total Samples Tested'], name='Testing Numbers'))
fig.update_layout(title="Daily Cases of COVID")
fig.show()

In [243]:
# Plot cumulative case
fig = go.Figure()

fig.add_trace(go.Scatter( x=state_df.index, y=state_df.total_cases_h, name='Active Cases'))
fig.add_trace(go.Scatter( x=state_df.index, y=state_df.total_cases_r, name='Recovered Cases'))
fig.add_trace(go.Scatter( x=state_df.index, y=state_df.total_cases_d, name='Deceased Cases'))
fig.add_trace(go.Scatter( x=state_df.index, y=state_df['Total Tested'], name='Testing Numbers'))
fig.update_layout(title="Daily Cases of COVID")
fig.show()


# Timeline for each district

In [251]:
h_dist = h_data.groupby(by=['detectedstate', 'detecteddistrict', 'dateannounced'])[['numcases']].sum()
r_dist = r_data.groupby(by=['detectedstate', 'detecteddistrict', 'dateannounced'])[['numcases']].sum()
d_dist = d_data.groupby(by=['detectedstate', 'detecteddistrict', 'dateannounced'])[['numcases']].sum()

In [255]:
district = 'Bhadrak'
d_dist.loc[state]

Unnamed: 0_level_0,Unnamed: 1_level_0,numcases
detecteddistrict,dateannounced,Unnamed: 2_level_1
Balasore,2020-05-29,1
Bargarh,2020-06-21,1
Cuttack,2020-05-19,1
Cuttack,2020-06-23,3
Ganjam,2020-05-09,1
Ganjam,2020-05-17,1
Ganjam,2020-05-21,1
Ganjam,2020-06-07,1
Ganjam,2020-06-12,1
Ganjam,2020-06-14,1


In [265]:
def dist_wise(data, state, district):
    try:
        p = data.loc[state, district]
        dist_tl = p[['numcases']]
        dist_tl['total_cases'] = dist_tl['numcases'].cumsum()
        return dist_tl
    except:
        dist_tl = pd.DataFrame(columns=['numcases'])
        dist_tl['numcases'] = [0] * dist_df.shape[0]
        dist_tl['total_cases'] = dist_tl['numcases'].cumsum()
        return dist_tl
        

dist_df = pd.merge(h_dist.loc[state, district], r_dist.loc[state, district], on='dateannounced', how='outer', 
         suffixes=('_h', '')).sort_values(by='dateannounced')
# dist_df = pd.merge(dist_df, d_dist.loc[state, district], on='dateannounced', how='outer', 
#          suffixes=('_r', '_d')).sort_values(by='dateannounced')
# dist_df = pd.merge(dist_df, state_test, on='dateannounced', how='outer').sort_values(by='dateannounced')
# dist_df
dist_wise(d_dist, state, district)

Unnamed: 0,numcases,total_cases
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0
