Load Libraries and display system info

In [1]:
# System info data
import os
import platform
from platform import python_version

In [2]:
# Import Libraries
import jupyterlab
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
plt.rcParams['figure.figsize'] = [15, 5]
from IPython import display
from ipywidgets import interact, widgets
import pycountry
import pycountry_convert

In [3]:
print("System")
print("os name: %s" % os.name)
print("system: %s" % platform.system())
print("release: %s" % platform.release())
print()
print("Python")
print("version: %s" % python_version())
print()
print("Python Packages")
print("jupterlab==%s" % jupyterlab.__version__)
print("pandas==%s" % pd.__version__)

System
os name: posix
system: Darwin
release: 19.4.0

Python
version: 3.7.3

Python Packages
jupterlab==1.0.2
pandas==0.24.2


## Load data for Cases, Deaths and Recovered

In [43]:
# Load Dataset
# Source: RamiKrispin GitHub
dataset_url = 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'

raw_data_all = pd.read_csv(dataset_url)

## Inspect & clean Dataset

In [44]:
# Print Information about dataset
raw_data_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68464 entries, 0 to 68463
Data columns (total 7 columns):
Province.State    20328 non-null object
Country.Region    68464 non-null object
Lat               68464 non-null float64
Long              68464 non-null float64
date              68464 non-null object
cases             68464 non-null int64
type              68464 non-null object
dtypes: float64(2), int64(1), object(4)
memory usage: 3.7+ MB


In [45]:
# Print first 5 rows of dataset
raw_data_all.head()

Unnamed: 0,Province.State,Country.Region,Lat,Long,date,cases,type
0,,Afghanistan,33.0,65.0,2020-01-22,0,confirmed
1,,Afghanistan,33.0,65.0,2020-01-23,0,confirmed
2,,Afghanistan,33.0,65.0,2020-01-24,0,confirmed
3,,Afghanistan,33.0,65.0,2020-01-25,0,confirmed
4,,Afghanistan,33.0,65.0,2020-01-26,0,confirmed


In [46]:
# Missing values per column
raw_data_all.isna().sum()

Province.State    48136
Country.Region        0
Lat                   0
Long                  0
date                  0
cases                 0
type                  0
dtype: int64

In [157]:
raw_data_all.isnull().sum().sum()

48136

In [318]:
coords = raw_data_all.groupby(['Lat', 'Long'])#.size()

In [321]:
raw_data_all.groupby(['Lat', 'Long']).size().reset_index()

Unnamed: 0,Lat,Long,0
0,-51.7963,-59.5236,264
1,-41.4545,145.9707,264
2,-40.9006,174.886,264
3,-38.4161,-63.6167,264
4,-37.8136,144.9631,264
5,-35.6751,-71.543,264
6,-35.4735,149.0124,264
7,-34.9285,138.6007,264
8,-33.8688,151.2093,264
9,-32.5228,-55.7658,264


In [273]:
grouped = raw_data_all.groupby(['Country.Region', 'type'])

In [279]:
#for name, group in grouped:
#    print(name)
#    print(group)
#    print()
    

In [276]:
germany = grouped.get_group(('Germany', 'confirmed')).aggregate(np.sum)

In [314]:
#pivot = raw_data_all.set_index(['date', 'Country.Region'])
#pivot = pivot.groupby(['Country.Region', 'date'])
pivot = raw_data_all.pivot_table(index=['date', 'Country.Region'], columns=['type'], values=['cases'])
pivot = pivot.unstack()

In [315]:
pivot.head()

Unnamed: 0_level_0,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases,cases
type,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,confirmed,...,recovered,recovered,recovered,recovered,recovered,recovered,recovered,recovered,recovered,recovered
Country.Region,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,United Kingdom,Uruguay,Uzbekistan,Venezuela,Vietnam,West Bank and Gaza,Western Sahara,Yemen,Zambia,Zimbabwe
date,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
2020-01-22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-24,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-01-26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [267]:
grouped.aggregate(np.sum)

Unnamed: 0_level_0,Unnamed: 1_level_0,Lat,Long,cases
date,Country.Region,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-22,Afghanistan,99.000000,195.000000,0
2020-01-22,Albania,123.459900,60.504900,0
2020-01-22,Algeria,84.101700,4.978800,0
2020-01-22,Andorra,127.518900,4.565400,0
2020-01-22,Angola,-33.608100,53.621700,0
2020-01-22,Antigua and Barbuda,51.182400,-185.389200,0
2020-01-22,Argentina,-115.248300,-190.850100,0
2020-01-22,Armenia,120.207300,135.114600,0
2020-01-22,Australia,-767.908500,3389.586900,0
2020-01-22,Austria,142.548600,43.650300,0


In [158]:
raw_data_all['cases'].isna()

0        False
1        False
2        False
3        False
4        False
5        False
6        False
7        False
8        False
9        False
10       False
11       False
12       False
13       False
14       False
15       False
16       False
17       False
18       False
19       False
20       False
21       False
22       False
23       False
24       False
25       False
26       False
27       False
28       False
29       False
30       False
31       False
32       False
33       False
34       False
35       False
36       False
37       False
38       False
39       False
40       False
41       False
42       False
43       False
44       False
45       False
46       False
47       False
48       False
49       False
50       False
51       False
52       False
53       False
54       False
55       False
56       False
57       False
58       False
59       False
60       False
61       False
62       False
63       False
64       False
65       False
66       F

In [47]:
# Datetime: convert 'date' column to datetime type
raw_data_all['date'] = pd.to_datetime(raw_data_all['date'])

In [66]:
# Print Information about dataset
raw_data_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68464 entries, 0 to 68463
Data columns (total 7 columns):
Province.State    20328 non-null object
Country.Region    68464 non-null object
Lat               68464 non-null float64
Long              68464 non-null float64
date              68464 non-null datetime64[ns]
cases             68464 non-null int64
type              68464 non-null object
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 3.7+ MB


## Transform Dataset

In [67]:
# Unique values in 'type'
raw_data_all['type'].unique()

array(['confirmed', 'death', 'recovered'], dtype=object)

In [230]:
#raw_data_all

In [82]:
# Create empty dictionary for dataframes
d = {}

In [236]:
for case_type in raw_data_all['type'].unique():
    df = raw_data_all[raw_data_all['type'] == case_type]
    df = df.rename(columns={'cases': 'daily_' + case_type})
    df = df.drop(columns=['type'])
    df = df.set_index(['Province.State', 'Country.Region', 'Lat', 'Long', 'date'])
    d['df_{}'.format(case_type)] = df
    

In [237]:
d.keys()

dict_keys(['df_confirmed', 'df_death', 'df_recovered'])

In [240]:
#len(d['df_confirmed']['Country.Region'].unique())

In [None]:
#len(d['df_death']['Country.Region'].unique())

In [None]:
#len(d['df_recovered']['Country.Region'].unique())

In [239]:
d['df_confirmed'].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,daily_confirmed
Province.State,Country.Region,Lat,Long,date,Unnamed: 5_level_1
,Afghanistan,33.0,65.0,2020-01-22,0
,Afghanistan,33.0,65.0,2020-01-23,0
,Afghanistan,33.0,65.0,2020-01-24,0
,Afghanistan,33.0,65.0,2020-01-25,0
,Afghanistan,33.0,65.0,2020-01-26,0


In [202]:
23232+23232+22000

68464

In [160]:
d['df_confirmed'][d['df_confirmed']['Country.Region'] == "Timor-Leste"]

Unnamed: 0,Province.State,Country.Region,Lat,Long,date,daily_confirmed
14344,,Timor-Leste,-8.874217,125.727539,2020-01-22,0
14345,,Timor-Leste,-8.874217,125.727539,2020-01-23,0
14346,,Timor-Leste,-8.874217,125.727539,2020-01-24,0
14347,,Timor-Leste,-8.874217,125.727539,2020-01-25,0
14348,,Timor-Leste,-8.874217,125.727539,2020-01-26,0
14349,,Timor-Leste,-8.874217,125.727539,2020-01-27,0
14350,,Timor-Leste,-8.874217,125.727539,2020-01-28,0
14351,,Timor-Leste,-8.874217,125.727539,2020-01-29,0
14352,,Timor-Leste,-8.874217,125.727539,2020-01-30,0
14353,,Timor-Leste,-8.874217,125.727539,2020-01-31,0


In [200]:
d['df_death'][d['df_death']['Country.Region'] == "Timor-Leste"]

Unnamed: 0,Province.State,Country.Region,Lat,Long,date,daily_death
37576,,Timor-Leste,-8.874217,125.727539,2020-01-22,0
37577,,Timor-Leste,-8.874217,125.727539,2020-01-23,0
37578,,Timor-Leste,-8.874217,125.727539,2020-01-24,0
37579,,Timor-Leste,-8.874217,125.727539,2020-01-25,0
37580,,Timor-Leste,-8.874217,125.727539,2020-01-26,0
37581,,Timor-Leste,-8.874217,125.727539,2020-01-27,0
37582,,Timor-Leste,-8.874217,125.727539,2020-01-28,0
37583,,Timor-Leste,-8.874217,125.727539,2020-01-29,0
37584,,Timor-Leste,-8.874217,125.727539,2020-01-30,0
37585,,Timor-Leste,-8.874217,125.727539,2020-01-31,0


In [201]:
d['df_recovered'][d['df_recovered']['Country.Region'] == "Timor-Leste"]

Unnamed: 0,Province.State,Country.Region,Lat,Long,date,daily_recovered
60896,,Timor-Leste,-8.8742,125.7275,2020-01-22,0
60897,,Timor-Leste,-8.8742,125.7275,2020-01-23,0
60898,,Timor-Leste,-8.8742,125.7275,2020-01-24,0
60899,,Timor-Leste,-8.8742,125.7275,2020-01-25,0
60900,,Timor-Leste,-8.8742,125.7275,2020-01-26,0
60901,,Timor-Leste,-8.8742,125.7275,2020-01-27,0
60902,,Timor-Leste,-8.8742,125.7275,2020-01-28,0
60903,,Timor-Leste,-8.8742,125.7275,2020-01-29,0
60904,,Timor-Leste,-8.8742,125.7275,2020-01-30,0
60905,,Timor-Leste,-8.8742,125.7275,2020-01-31,0


In [225]:
merged_df[merged_df['Country.Region'] == 'Timor-Leste']

Unnamed: 0,Province.State_confirmed,Country.Region_confirmed,Lat_confirmed,Long_confirmed,date_confirmed,daily_confirmed,Province.State_death,Country.Region_death,Lat_death,Long_death,date_death,daily_death,Province.State,Country.Region,Lat,Long,date,daily_recovered


In [162]:
raw_data_all.columns

Index(['Province.State', 'Country.Region', 'Lat', 'Long', 'date', 'cases',
       'type'],
      dtype='object')

In [171]:
d['df_confirmed'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23232 entries, 0 to 23231
Data columns (total 6 columns):
Province.State     7216 non-null object
Country.Region     23232 non-null object
Lat                23232 non-null float64
Long               23232 non-null float64
date               23232 non-null datetime64[ns]
daily_confirmed    23232 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 1.2+ MB


In [172]:
d['df_death'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23232 entries, 23232 to 46463
Data columns (total 6 columns):
Province.State    7216 non-null object
Country.Region    23232 non-null object
Lat               23232 non-null float64
Long              23232 non-null float64
date              23232 non-null datetime64[ns]
daily_death       23232 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 1.2+ MB


In [173]:
d['df_recovered'].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22000 entries, 46464 to 68463
Data columns (total 6 columns):
Province.State     5896 non-null object
Country.Region     22000 non-null object
Lat                22000 non-null float64
Long               22000 non-null float64
date               22000 non-null datetime64[ns]
daily_recovered    22000 non-null int64
dtypes: datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 1.2+ MB


In [163]:
joining_cols = ['Province.State', 'Country.Region', 'Lat', 'Long', 'date']

In [242]:
#merged_df = pd.merge(d['df_confirmed'], d['df_death'], how='outer', on=joining_cols)
#merged_df = d['df_confirmed'].join(d['df_death'],  how='left', lsuffix='_confirmed', rsuffix='_death')

merged_df = d['df_confirmed'].combine_first(d['df_death'])

In [244]:
#merged_df = pd.merge(merged_df, d['df_recovered'], how='left', left_on=joining_cols, right_on=joining_cols)
#merged_df = merged_df.join(d['df_recovered'],  how='left')

merged_df = merged_df.combine_first(d['df_recovered'])

In [245]:
merged_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,daily_confirmed,daily_death,daily_recovered
Province.State,Country.Region,Lat,Long,date,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0,0.0
,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0,0.0
,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0,0.0
,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0,0.0
,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0,0.0


In [247]:
merged_df.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,daily_confirmed,daily_death,daily_recovered
Province.State,Country.Region,Lat,Long,date,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,Timor-Leste,-8.8742,125.7275,2020-04-14,,,0.0
,Timor-Leste,-8.8742,125.7275,2020-04-15,,,0.0
,Timor-Leste,-8.8742,125.7275,2020-04-16,,,0.0
,Timor-Leste,-8.8742,125.7275,2020-04-17,,,0.0
,Timor-Leste,-8.8742,125.7275,2020-04-18,,,0.0


In [205]:
#merged_df = pd.merge(merged_df, d['df_recovered'], how='outer', on=joining_cols)

In [246]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 23584 entries, (nan, Afghanistan, 33.0, 65.0, 2020-01-22 00:00:00) to (nan, Timor-Leste, -8.8742, 125.7275, 2020-04-18 00:00:00)
Data columns (total 3 columns):
daily_confirmed    23232 non-null float64
daily_death        23232 non-null float64
daily_recovered    22000 non-null float64
dtypes: float64(3)
memory usage: 744.1+ KB


In [186]:
print(d['df_confirmed'].shape)
print(d['df_death'].shape)
print(d['df_recovered'].shape)

(23232, 6)
(23232, 6)
(22000, 6)


In [196]:
df_missing = d['df_confirmed'].merge(d['df_recovered'], how='outer', indicator=True).loc[lambda x: x['_merge'] == 'left_only']

In [198]:
df_missing.shape

(1584, 8)

In [207]:
# Missing values per column
merged_df.isna().sum().sum()

18656

In [195]:
print(len(merged_df['Country.Region'].unique()))
#print(merged_df['Country.Region'].unique())

185


In [123]:
print(len(raw_data_all['Country.Region'].unique()))

185


In [146]:
#m_list = list(merged_df['Country.Region'].unique())
m_set = set(merged_df['Country.Region'])
#r_list = list(raw_data_all['Country.Region'].unique())
r_set = set(raw_data_all['Country.Region'])

#diff = r_list.difference(m_list)
#type(m_list)
print(len(m_set))
print(len(r_set))
print(r_set - m_set)
#print(m_set.difference(r_set))

#rec_set = set(d['df_recovered']['Country.Region'])
#print(len(rec_set))

#print(m_set - rec_set)

185
185
set()


In [193]:
merged_df.head()

Unnamed: 0,Province.State,Country.Region,Lat,Long,date,daily_confirmed,daily_death,daily_recovered
0,,Afghanistan,33.0,65.0,2020-01-22,0.0,0.0,0.0
1,,Afghanistan,33.0,65.0,2020-01-23,0.0,0.0,0.0
2,,Afghanistan,33.0,65.0,2020-01-24,0.0,0.0,0.0
3,,Afghanistan,33.0,65.0,2020-01-25,0.0,0.0,0.0
4,,Afghanistan,33.0,65.0,2020-01-26,0.0,0.0,0.0


In [208]:
merged_df.tail()

Unnamed: 0,Province.State,Country.Region,Lat,Long,date,daily_confirmed,daily_death,daily_recovered
23579,,Timor-Leste,-8.8742,125.7275,2020-04-14,,,0.0
23580,,Timor-Leste,-8.8742,125.7275,2020-04-15,,,0.0
23581,,Timor-Leste,-8.8742,125.7275,2020-04-16,,,0.0
23582,,Timor-Leste,-8.8742,125.7275,2020-04-17,,,0.0
23583,,Timor-Leste,-8.8742,125.7275,2020-04-18,,,0.0


In [155]:
merged_df[merged_df['Country.Region'] == 'Germany']

Unnamed: 0,Province.State,Country.Region,Lat,Long,date,daily_confirmed,daily_death,daily_recovered
5456,,Germany,51.0,9.0,2020-01-22,0.0,0.0,0.0
5457,,Germany,51.0,9.0,2020-01-23,0.0,0.0,0.0
5458,,Germany,51.0,9.0,2020-01-24,0.0,0.0,0.0
5459,,Germany,51.0,9.0,2020-01-25,0.0,0.0,0.0
5460,,Germany,51.0,9.0,2020-01-26,0.0,0.0,0.0
5461,,Germany,51.0,9.0,2020-01-27,1.0,0.0,0.0
5462,,Germany,51.0,9.0,2020-01-28,3.0,0.0,0.0
5463,,Germany,51.0,9.0,2020-01-29,0.0,0.0,0.0
5464,,Germany,51.0,9.0,2020-01-30,0.0,0.0,0.0
5465,,Germany,51.0,9.0,2020-01-31,1.0,0.0,0.0


In [53]:
# Reset index on data frame & drop old index
#df_all = df_all.reset_index(drop=True)
#pivot = pivot.reset_index()

In [17]:
raw_data_all.columns

Index(['Country.Region', 'Province.State', 'Lat', 'Long', 'date', 'confirmed',
       'death', 'recovered'],
      dtype='object', name='type')

In [41]:
#pd.set_option('display.max_rows', raw_data_all.shape[0]+1)

In [None]:
raw_data_all['Cumulative Confirmed Cases'] = raw_data_all['confirmed'].cumsum()

In [None]:
df_with_coords = raw_data_all

In [None]:
group = raw_data_all.groupby(['Country.Region', 'Province.State', 'date', 'type'])['cases']

In [None]:
#group.head()

In [None]:
df_with_coords['cumulative'] = df_with_coords.groupby(['Country.Region', 'Province.State', 'date', 'type'])['cases'].cumsum()# \
                                            #.apply(lambda x: x.cumsum())

In [None]:
df_with_coords.tail(50)

In [None]:
raw_data_all.head()

## Clean up Provice.State

**Goal:** only Country should remain to specify location

In [None]:
# Create temporary data frame: Select all row where 'Province.State' == NaN
temp_df_notna = raw_data_all[pd.notna(raw_data_all['Province.State'])]
#
# temp_df_notna.head()

**Provinces:**
Sum up data in Provinces of China and Australia

In [None]:
#
temp_df_province = temp_df_notna[(temp_df_notna['Country.Region'] == 'China') | (temp_df_notna['Country.Region'] == 'Australia')]
#temp_df_province.head()
temp_df_province.info()

In [None]:
temp_df_province = temp_df_province.groupby(['Country.Region', 'date', 'type']) \
                                    .sum() \
                                    .reset_index()

In [None]:
temp_df_province.shape

**Rename former Colonies:** 
Select all data where Country is not China, Australia or Canada.
Delete original Country name column and replace it with Province names.

In [None]:
# Select all rows where Country is neither China, nor Australia or Canada
temp_df_colonies = temp_df_notna[(temp_df_notna['Country.Region'] != 'China') 
                                 & (temp_df_notna['Country.Region'] != 'Australia')
                                & (temp_df_notna['Country.Region'] != 'Canada')] \
                                .drop('Country.Region', axis=1) \
                                .rename(columns={'Province.State': 'Country.Region'})


In [None]:
temp_df_colonies.shape

**Province == NaN:**
Remove Province.State column from all rows where Country is not Canada

In [None]:
# Select all rows where Country != Canada
temp_df_not_canada = raw_data_all[raw_data_all['Country.Region'] != 'Canada']

# Select all rows where Province is NaN
temp_df_not_canada = temp_df_not_canada[temp_df_not_canada['Province.State'].isna()]

# Delete Province column
temp_df_not_canada = temp_df_not_canada.drop('Province.State', axis=1)

In [None]:
temp_df_not_canada.shape

**Canada:**

In [None]:
temp_df_canada = raw_data_all[raw_data_all['Country.Region'] == 'Canada']



In [None]:
temp_df_canada = temp_df_canada.groupby(['Country.Region', 'date', 'type']) \
                                    .sum() \
                                    .reset_index()

In [None]:
# Print all shapes
print("Shape states:")
print(temp_df_not_canada.shape)
print("Shape states:")
print(temp_df_canada.shape)
print("Shape states:")
print(temp_df_province.shape)
print("Shape states:")
print(temp_df_colonies.shape)

In [None]:
temp_df_not_canada.shape[0] + temp_df_canada.shape[0] + temp_df_province.shape[0] + temp_df_colonies.shape[0]

**Join temp dataframes together**

In [None]:
df_all = pd.concat([temp_df_not_canada, temp_df_province, temp_df_colonies, temp_df_canada], axis=0, sort=True)



In [None]:
df_all.shape

In [None]:
# Delete temporary data frames
# del temp_df_not_canada, temp_df_province, temp_df_colonies, temp_df_canada

---

In [None]:
# Reset index on data frame & drop old index
df_all = df_all.reset_index(drop=True)

In [None]:
df_all.head()

In [None]:
df_all.tail()

In [None]:
df_all.isna().sum()

In [None]:
#df_all[df_all['Country.Region'] == 'China']

In [None]:
# Fix coordinates in summed up countries
#df_all.loc[df_all['Country.Region'] == 'China', 'Long']
#df_all.loc[df_all['Country.Region'] == 'China', 'Lat']

#df_all.loc[df_all['Country.Region'] == 'China', 'Long']
#df_all.loc[df_all['Country.Region'] == 'China', 'Lat']

#df_all.loc[df_all['Country.Region'] == 'China', 'Long']
#df_all.loc[df_all['Country.Region'] == 'China', 'Lat']

In [None]:
def get_country_code(country):
    try:
    return pycountry.countries.lookup(country)

In [None]:
#df_all['Country.Region'].apply(get_country_code)

In [None]:
len(pycountry.countries)

list(pycountry.countries)[0]

In [None]:
from pycountry import countries

In [None]:
for c in countries:
    print(c)

In [None]:
import pycountry_convert as pc

country_code = pc.country_name_to_country_alpha2("China", cn_name_format="default")
print(country_code)
continent_name = pc.country_alpha2_to_continent_code(country_code)
print(continent_name)

---

In [None]:
# Reshape dataset: pivot to wide
# Index name
raw_data_all.index.name = 'index_corona' # naming the index column

# Index range = Number of observations
raw_data_all.index


In [None]:
# Fill NaN in Province.State with 'missing'
raw_data_all['Province.State'] = raw_data_all['Province.State'].fillna('missing')

In [None]:
df_all.index.get_level_values(2)

In [None]:
# Set indices
df_all = raw_data_all.set_index(['Country.Region', 'Province.State', 'date'])

In [None]:
df_all.tail()

In [None]:
df_all.info()

In [None]:
df_all

In [None]:
df_all.xs('Israel', level = 'Country.Region')

In [None]:
raw_data_all.describe()

In [None]:
# Select data where column 'Province.State' is non-null
df_provinces = raw_data_confirmed.loc[raw_data_confirmed['Province.State'].notna()]
#df_provinces2 = df_provinces[(df_provinces["Country.Region"] == "China") | (df_provinces["Country.Region"] == "Australia")]
#df_provinces2.groupby(['Country.Region', 'date', 'type']) #.agg({'cases': 'sum'})
#df_provinces2['cumulative'] = df_provinces2['cases'].cumsum()
df_provinces

In [None]:
df_provinces.info()
df_provinces2["Country.Region"] == "Australia"