<a href="https://colab.research.google.com/github/nitin-khandagale/blogposts/blob/master/covid_global_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import calendar
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
import seaborn as sns
from plotly.offline import iplot
from sklearn.model_selection import train_test_split

In [0]:
# Load the global COVID-19 records from a CSV located in the working directory.
df = pd.read_csv('covid_global.csv')

In [0]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,daterep,day,month,year,cases,deaths,countriesandterritories,geoid,countryterritorycode,popdata2018,continentexp
0,,28,4,2020,172,0,Afghanistan,AF,AFG,37172386.0,Asia
1,,27,4,2020,68,10,Afghanistan,AF,AFG,37172386.0,Asia
2,,26,4,2020,112,4,Afghanistan,AF,AFG,37172386.0,Asia
3,,25,4,2020,70,1,Afghanistan,AF,AFG,37172386.0,Asia
4,,24,4,2020,105,2,Afghanistan,AF,AFG,37172386.0,Asia


In [0]:
# Count the missing values in each column.
df.isnull().sum()

daterep                    13623
day                            0
month                          0
year                           0
cases                          0
deaths                         0
countriesandterritories        0
geoid                         45
countryterritorycode         152
popdata2018                  126
continentexp                   0
dtype: int64

In [0]:
# (rows, columns) of the raw frame, for comparison with the null counts above.
df.shape

(13623, 11)

In [0]:
# 'daterep' is null in every row (see the null counts above); 'popdata2018' is
# not referenced by any later cell visible in this notebook.
df = df.drop(columns=['daterep', 'popdata2018'])

In [0]:
# Confirm that the two dropped columns are gone.
df.head()

Unnamed: 0,day,month,year,cases,deaths,countriesandterritories,geoid,countryterritorycode,continentexp
0,28,4,2020,172,0,Afghanistan,AF,AFG,Asia
1,27,4,2020,68,10,Afghanistan,AF,AFG,Asia
2,26,4,2020,112,4,Afghanistan,AF,AFG,Asia
3,25,4,2020,70,1,Afghanistan,AF,AFG,Asia
4,24,4,2020,105,2,Afghanistan,AF,AFG,Asia


In [0]:
# Shorten the verbose source column names. An explicit old -> new mapping is
# used instead of assigning a positional list, so a change in column order or
# count in the CSV cannot silently attach the wrong label to a column.
df = df.rename(columns={
    'countriesandterritories': 'country_terri',
    'geoid': 'geo_id',
    'countryterritorycode': 'country_code',
    'continentexp': 'continent',
})

In [0]:
# Combine the separate day / month / year columns into one 'dd-mm-YYYY' string.
date_parts = df[['day', 'month', 'year']]
df['date'] = pd.to_datetime(date_parts).dt.strftime('%d-%m-%Y')

In [0]:
# Check the renamed columns and the new 'date' column.
df.head()

Unnamed: 0,day,month,year,cases,deaths,country_terri,geo_id,country_code,continent,date
0,28,4,2020,172,0,Afghanistan,AF,AFG,Asia,28-04-2020
1,27,4,2020,68,10,Afghanistan,AF,AFG,Asia,27-04-2020
2,26,4,2020,112,4,Afghanistan,AF,AFG,Asia,26-04-2020
3,25,4,2020,70,1,Afghanistan,AF,AFG,Asia,25-04-2020
4,24,4,2020,105,2,Afghanistan,AF,AFG,Asia,24-04-2020


In [0]:
# Re-check missing values after dropping and renaming columns.
df.isnull().sum()

day                0
month              0
year               0
cases              0
deaths             0
country_terri      0
geo_id            45
country_code     152
continent          0
date               0
dtype: int64

In [0]:
# List the territories that have no country code. The stray `.keys` attribute
# access was removed: a NumPy array has no such attribute, so the cell raised
# AttributeError instead of showing the array.
df.loc[df['country_code'].isnull(), 'country_terri'].unique()

array(['Anguilla', 'Bonaire, Saint Eustatius and Saba',
       'Cases_on_an_international_conveyance_Japan',
       'Falkland_Islands_(Malvinas)', 'Western_Sahara'], dtype=object)

In [0]:
# Codes to assign to the territories listed above, in order: AIA, BQ, JPN, FLK, ESH

In [0]:
def fill_data(x):
  """Return the country code to use for a territory whose code is missing.

  Args:
    x: Territory name as it appears in the 'country_terri' column.

  Returns:
    The code for one of the five known territories, or None when the name is
    not recognised. Returning None (rather than a catch-all 'ESH') keeps an
    unexpected territory from being mislabelled as Western Sahara; `fillna`
    leaves such rows untouched.
  """
  # NOTE(review): 'BQ' is a two-letter code while the others are three-letter;
  # the ISO 3166-1 alpha-3 code for Bonaire, Sint Eustatius and Saba is 'BES'.
  # Kept as-is to preserve existing values -- confirm which form is wanted.
  missing_codes = {
      'Anguilla': 'AIA',
      'Bonaire, Saint Eustatius and Saba': 'BQ',
      'Cases_on_an_international_conveyance_Japan': 'JPN',
      'Falkland_Islands_(Malvinas)': 'FLK',
      'Western_Sahara': 'ESH',
  }
  return missing_codes.get(x)

In [0]:
# Look up a fallback code for every row, then use it only where the existing
# country code is missing (fillna leaves non-null values untouched).
fallback_codes = df['country_terri'].apply(fill_data)
df['country_code'] = df['country_code'].fillna(fallback_codes)

In [0]:
# Replace missing geo ids with the literal string 'NA'.
# NOTE(review): presumably these rows are Namibia, whose id 'NA' is parsed as
# a missing value by read_csv's default settings -- confirm against the data.
df['geo_id'] = df['geo_id'].fillna('NA')

In [0]:
# Verify that no missing values remain after the fills above.
df.isnull().sum()

day              0
month            0
year             0
cases            0
deaths           0
country_terri    0
geo_id           0
country_code     0
continent        0
date             0
dtype: int64

In [0]:
# The separate date parts are redundant now that 'date' holds the full date.
df = df.drop(columns=['day', 'month', 'year'])

In [0]:
# Parse the 'dd-mm-YYYY' strings into datetimes; dayfirst=True makes pandas
# read the leading number as the day rather than the month.
df['date'] = pd.to_datetime(df['date'], dayfirst=True)

In [0]:
# Index by date so the frame can be resampled by month / week further down.
df = df.set_index('date')

In [0]:
# Check the date-indexed frame.
df.head()

Unnamed: 0_level_0,cases,deaths,country_terri,geo_id,country_code,continent
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-28,172,0,Afghanistan,AF,AFG,Asia
2020-04-27,68,10,Afghanistan,AF,AFG,Asia
2020-04-26,112,4,Afghanistan,AF,AFG,Asia
2020-04-25,70,1,Afghanistan,AF,AFG,Asia
2020-04-24,105,2,Afghanistan,AF,AFG,Asia


In [0]:
# Total cases and deaths per continent. The columns are selected with a list
# (double brackets): selecting several groupby columns with a bare tuple is
# deprecated and fails on newer pandas releases.
group_conti = df.groupby('continent')[['cases', 'deaths']].sum().reset_index()

# Share of worldwide cases per continent; `pull` offsets the fourth slice.
fig = px.pie(group_conti, names='continent', values='cases',color_discrete_sequence=px.colors.sequential.RdBu)
fig.update_layout(title='Cases by Continent so far')
fig.update_traces(textposition='inside', textinfo='percent+label',pull=[0, 0, 0, 0.1])
fig.show()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [0]:
# Re-display the cleaned, date-indexed frame for reference.
df.head()

Unnamed: 0_level_0,cases,deaths,country_terri,geo_id,country_code,continent
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-28,172,0,Afghanistan,AF,AFG,Asia
2020-04-27,68,10,Afghanistan,AF,AFG,Asia
2020-04-26,112,4,Afghanistan,AF,AFG,Asia
2020-04-25,70,1,Afghanistan,AF,AFG,Asia
2020-04-24,105,2,Afghanistan,AF,AFG,Asia


In [0]:
# Grouped bars comparing total cases with deaths for each continent.
continent_names = group_conti['continent']
cases_trace = go.Bar(name='Total Cases', x=continent_names, y=group_conti['cases'], marker_color='grey')
deaths_trace = go.Bar(name='Deaths', x=continent_names, y=group_conti['deaths'], marker_color='red')

fig = go.Figure(data=[cases_trace, deaths_trace])
fig.update_layout(title='Cases and Deaths Comparison by Continent', barmode='group')
fig.show()

In [0]:
# Re-display the cleaned, date-indexed frame for reference.
df.head()

Unnamed: 0_level_0,cases,deaths,country_terri,geo_id,country_code,continent
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-28,172,0,Afghanistan,AF,AFG,Asia
2020-04-27,68,10,Afghanistan,AF,AFG,Asia
2020-04-26,112,4,Afghanistan,AF,AFG,Asia
2020-04-25,70,1,Afghanistan,AF,AFG,Asia
2020-04-24,105,2,Afghanistan,AF,AFG,Asia


In [0]:
# Total cases and deaths per country / territory. The columns are selected
# with a list because tuple-style selection on a groupby is deprecated and
# fails on newer pandas releases.
country_group = df.groupby('country_terri')[['cases', 'deaths']].sum().reset_index()


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



In [0]:
# The ten countries / territories with the most cumulative cases.
top_ten = country_group.nlargest(10, 'cases')

# One bar trace per metric, sharing the same country axis.
top_ten_names = top_ten['country_terri']
top_cases_trace = go.Bar(name='Cases', x=top_ten_names, y=top_ten['cases'], marker_color='lightblue')
top_deaths_trace = go.Bar(name='Deaths', x=top_ten_names, y=top_ten['deaths'], marker_color='indianred')

fig = go.Figure(data=[top_cases_trace, top_deaths_trace])
fig.update_layout(title='Cases and Deaths ratio in 10 most affected Nations')
fig.show()

In [0]:
# Re-display the cleaned, date-indexed frame for reference.
df.head()

Unnamed: 0_level_0,cases,deaths,country_terri,geo_id,country_code,continent
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-28,172,0,Afghanistan,AF,AFG,Asia
2020-04-27,68,10,Afghanistan,AF,AFG,Asia
2020-04-26,112,4,Afghanistan,AF,AFG,Asia
2020-04-25,70,1,Afghanistan,AF,AFG,Asia
2020-04-24,105,2,Afghanistan,AF,AFG,Asia


In [0]:
# Worldwide case totals per calendar month (m) and per week (w).
m = df['cases'].resample('M').sum()
w = df['cases'].resample('W').sum()

# The monthly totals again, as a one-column DataFrame.
months_sum = df['cases'].resample('M').sum().to_frame()

In [0]:
# Month number of every row's date, taken from the datetime index.
df.index.month

In [0]:
# Monthly worldwide totals of cases and deaths, each as a one-column DataFrame.
months_sum = pd.DataFrame(df.cases.resample('M').sum())
months_sum_deaths = pd.DataFrame(df.deaths.resample('M').sum())

# Month names for the x-axis, one per resampled period. They are derived from
# months_sum itself (the previous code read an undefined name `v`), so the
# labels always line up with the plotted totals.
months = [calendar.month_name[x] for x in months_sum.index.month.tolist()]

In [0]:
# Monthly worldwide cases. `y` is given the 'cases' column (a 1-D Series)
# rather than the whole one-column DataFrame, since Plotly Express expects a
# 1-D array-like here.
fig = px.line(x=months, y=months_sum['cases'], line_shape='spline',
              color_discrete_sequence=['indianred'])
fig.update_layout(title='Worldwide Cases Monthly Graph', xaxis_title='Month', yaxis_title='Positive cases')
fig.show()

In [0]:
# Monthly worldwide deaths. `y` is given the 'deaths' column (a 1-D Series)
# rather than the whole one-column DataFrame, since Plotly Express expects a
# 1-D array-like here.
fig = px.line(x=months, y=months_sum_deaths['deaths'], color_discrete_sequence=['red'])
fig.update_layout(title='Worldwide Deaths Monthly Graph', xaxis_title='Months', yaxis_title='Deaths')
fig.show()

In [0]:
# Cases and deaths per (date, country). 'date' is the name of the index level
# set earlier; the previous key 'dated' does not exist and raised KeyError.
# Columns are selected with a list because tuple-style selection is deprecated.
try_group = df.groupby(['date', 'country_terri'])[['cases', 'deaths']].sum().reset_index()

In [0]:
# Exploratory check: the deaths value from India's row with the largest daily
# case count.
try_group[try_group['country_terri']=='India'].nlargest(1, 'cases')['deaths']

pandas.core.series.Series

In [0]:
# Names of the ten countries / territories with the most cumulative cases,
# ordered from most to fewest.
top_ten_con = country_group.nlargest(10, 'cases')['country_terri'].tolist()

In [0]:
# Highest single-day case count and highest single-day death count for each
# of the ten most affected countries, in the same order as top_ten_con.
#
# Changes from the previous loop:
#   * each country's rows were filtered and ranked twice per iteration; one
#     grouped max now covers all countries.
#   * highest_deaths used to hold the deaths reported on the peak-*cases* day,
#     which is not the highest daily deaths that the variable name and the
#     chart built from it describe. It is now the per-country daily maximum.
daily_peaks = (
    try_group[try_group['country_terri'].isin(top_ten_con)]
    .groupby('country_terri')[['cases', 'deaths']]
    .max()
    .reindex(top_ten_con)  # restore the ranking order of top_ten_con
)

highest_cases = daily_peaks['cases'].tolist()
highest_deaths = daily_peaks['deaths'].tolist()

In [0]:
# Peak daily case counts, in the same order as top_ten_con.
highest_cases

[48529, 9222, 6557, 8719, 6294, 7578, 5138, 5275, 6361, 15141]

In [0]:
# Bar chart of each top-ten country's largest single-day case count.
peak_cases_layout = dict(
    title='Highest Daily Rise in Cases in Most Affected Nations',
    xaxis_title='Countries',
    yaxis_title='Cases',
)
fig = px.bar(x=top_ten_con, y=highest_cases, color_discrete_sequence=['grey'])
fig.update_layout(**peak_cases_layout)
fig.show()

In [0]:
# Bar chart of the death figures collected for the top-ten countries.
# The y-axis label previously read 'Cases' (copied from the cases chart).
fig = px.bar(x=top_ten_con, y=highest_deaths, color_discrete_sequence=['cyan'])
fig.update_layout(title='Highest Daily Rise in Deaths in Most Affected Nations',
                  xaxis_title='Countries',
                  yaxis_title='Deaths')
fig.show()