## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Set style & figures inline
sns.set()
%matplotlib inline
#import chart_studio as py
#from plotly.grid_objs import Grid, Column
from plotly.tools import FigureFactory as FF
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
plt.rcParams['figure.figsize'] = [15, 5]
from IPython import display
from ipywidgets import interact, widgets
import pycountry
import pycountry_convert

In [2]:
from ast import literal_eval as make_tuple

## Load data for Cases, Deaths and Recovered

In [3]:
# Load Dataset
# Source: RamiKrispin GitHub
dataset_url = 'https://raw.githubusercontent.com/RamiKrispin/coronavirus-csv/master/coronavirus_dataset.csv'

raw_data_all = pd.read_csv(dataset_url)
raw_data_all.head()

Unnamed: 0,Province.State,Country.Region,Lat,Long,date,cases,type
0,,Afghanistan,33.0,65.0,2020-01-22,0,confirmed
1,,Afghanistan,33.0,65.0,2020-01-23,0,confirmed
2,,Afghanistan,33.0,65.0,2020-01-24,0,confirmed
3,,Afghanistan,33.0,65.0,2020-01-25,0,confirmed
4,,Afghanistan,33.0,65.0,2020-01-26,0,confirmed


---

In [4]:
# Data urls
base_url = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/'
confirmed_cases_data_url = base_url + 'time_series_covid19_confirmed_global.csv'
death_cases_data_url = base_url + 'time_series_covid19_deaths_global.csv'
recovery_cases_data_url = base_url+ 'time_series_covid19_recovered_global.csv'

# Import datasets as pandas dataframes
raw_data_confirmed = pd.read_csv(confirmed_cases_data_url)
raw_data_deaths = pd.read_csv(death_cases_data_url)
raw_data_recovered = pd.read_csv(recovery_cases_data_url)

In [5]:
# Group by region (also drop 'Lat', 'Long' as it doesn't make sense to sum them here)
confirmed_country = raw_data_confirmed.groupby(['Country/Region']).sum().drop(['Lat', 'Long'], axis=1)
#confirmed_country.head()

In [6]:
confirmed_country = confirmed_country.transpose()
#confirmed_country.head()

In [7]:
# Set index as DateTimeIndex
datetime_index = pd.DatetimeIndex(confirmed_country.index)
confirmed_country.set_index(datetime_index, inplace=True)
# Check out index
#confirmed_country.index

---

In [8]:
print('Size/Shape of the dataset: ', raw_data_all.shape)
print('\n')
print('Checking for missing values:')
print(raw_data_all.isnull().sum())
print('\n')
print('Checking data type of each column:')
print(raw_data_all.dtypes)

Size/Shape of the dataset:  (73910, 7)


Checking for missing values:
Province.State    51965
Country.Region        0
Lat                   0
Long                  0
date                  0
cases                 0
type                  0
dtype: int64


Checking data type of each column:
Province.State     object
Country.Region     object
Lat               float64
Long              float64
date               object
cases               int64
type               object
dtype: object


In [9]:
df_confirmed = raw_data_all[raw_data_all['type'] == 'confirmed']
#df_selected['cumulative'] = df_selected.groupby(['coords'])['cases'] \
#.apply(lambda x: x.cumsum())


In [10]:
print("Basic Information")
print("Totol number of countries with Disease Spread: ",len(raw_data_all["Country.Region"].unique()))

Basic Information
Totol number of countries with Disease Spread:  185


## Spread of COVID-19 around the World

[Solution for datetime in plot animation](https://github.com/plotly/plotly.py/issues/1737)

In [11]:
# Copy dataframe, as dfs are mutable
# https://stackoverflow.com/questions/27673231/why-should-i-make-a-copy-of-a-data-frame-in-pandas
df_coords = raw_data_all.copy()

#df_coords['coords'] = df_coords[['Long', 'Lat']].values.tolist()
df_coords['coords'] = tuple(list(zip(df_coords['Lat'], df_coords['Long'])))

# convert to string, otherwise not hashable & can't be used for comparison
# https://stackoverflow.com/questions/14535730/what-does-hashable-mean-in-python
df_coords['coords'] = df_coords['coords'].astype('str')
df_coords['coords'] = df_coords['coords'].apply(lambda x: make_tuple(x))

#make_tuple(df_coords['coords'][0])
#df_coords.drop(['Province.State', 'Country.Region'], axis=1, inplace=True)

In [12]:
df_coords.head()

Unnamed: 0,Province.State,Country.Region,Lat,Long,date,cases,type,coords
0,,Afghanistan,33.0,65.0,2020-01-22,0,confirmed,"(33.0, 65.0)"
1,,Afghanistan,33.0,65.0,2020-01-23,0,confirmed,"(33.0, 65.0)"
2,,Afghanistan,33.0,65.0,2020-01-24,0,confirmed,"(33.0, 65.0)"
3,,Afghanistan,33.0,65.0,2020-01-25,0,confirmed,"(33.0, 65.0)"
4,,Afghanistan,33.0,65.0,2020-01-26,0,confirmed,"(33.0, 65.0)"


In [13]:
print(df_coords.dtypes)

Province.State     object
Country.Region     object
Lat               float64
Long              float64
date               object
cases               int64
type               object
coords             object
dtype: object


In [14]:
selected_type = 'confirmed'

#print(df_coords.shape)
#print(df_coords[df_coords['type'] == selected_type].shape)

df_selected = df_coords[df_coords['type'] == selected_type]

#print(len(df_selected['coords'].unique()))

df_selected['cumulative'] = df_selected.groupby(['coords'])['cases'].apply(lambda x: x.cumsum())





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [17]:
df_selected.tail()

Unnamed: 0,Province.State,Country.Region,Lat,Long,date,cases,type,coords,cumulative
25075,Zhejiang,China,29.1832,120.0934,2020-04-21,0,confirmed,"(29.1832, 120.0934)",1268
25076,Zhejiang,China,29.1832,120.0934,2020-04-22,0,confirmed,"(29.1832, 120.0934)",1268
25077,Zhejiang,China,29.1832,120.0934,2020-04-23,0,confirmed,"(29.1832, 120.0934)",1268
25078,Zhejiang,China,29.1832,120.0934,2020-04-24,0,confirmed,"(29.1832, 120.0934)",1268
25079,Zhejiang,China,29.1832,120.0934,2020-04-25,0,confirmed,"(29.1832, 120.0934)",1268


In [20]:
df_selected[df_selected['cumulative'] == -1]

Unnamed: 0,Province.State,Country.Region,Lat,Long,date,cases,type,coords,cumulative
19471,Grand Princess,Canada,37.6489,-122.6655,2020-04-22,-14,confirmed,"(37.6489, -122.6655)",-1
19472,Grand Princess,Canada,37.6489,-122.6655,2020-04-23,0,confirmed,"(37.6489, -122.6655)",-1
19473,Grand Princess,Canada,37.6489,-122.6655,2020-04-24,0,confirmed,"(37.6489, -122.6655)",-1
19474,Grand Princess,Canada,37.6489,-122.6655,2020-04-25,0,confirmed,"(37.6489, -122.6655)",-1
22418,Nova Scotia,Canada,44.682,-63.7443,2020-04-24,-828,confirmed,"(44.681999999999995, -63.7443)",-1
22419,Nova Scotia,Canada,44.682,-63.7443,2020-04-25,0,confirmed,"(44.681999999999995, -63.7443)",-1


In [16]:
# TODO: update styles
# https://plotly.com/python-api-reference/generated/plotly.express.scatter_geo.html

fig = px.scatter_geo(df_selected,
                     lat='Lat', lon='Long',
                     size='cumulative',
                     animation_frame='date'
                    )

fig.update_layout(
    title_text = 'Spread of Coronavirus around the world'
)

fig.show()

ValueError: 
    Invalid element(s) received for the 'size' property of scattergeo.marker
        Invalid elements include: [-1]

    The 'size' property is a number and may be specified as:
      - An int or float in the interval [0, inf]
      - A tuple, list, or one-dimensional numpy array of the above

In [None]:
# Change animation speed
# https://community.plotly.com/t/how-to-slow-down-animation-in-plotly-express/31309/5

fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 300

## Clean Up Province vs Country

**Goal:** only Country should remain to specify location

In [None]:
# TODO: write cleaning function

In [None]:
# Create temporary data frame: Select all row where 'Province.State' == NaN
temp_df_notna = raw_data_all[pd.notna(raw_data_all['Province.State'])]

In [None]:
temp_df_notna.shape

**Provinces:**
Sum up data in Provinces of China and Australia

In [None]:
# Create temporary data frame
temp_df_province = temp_df_notna[(temp_df_notna['Country.Region'] == 'China') | (temp_df_notna['Country.Region'] == 'Australia')]
#temp_df_province.head()
temp_df_province.info()

## Cases on Choropleth Map

In [None]:
df_coords.head()

In [None]:
selected_type = 'confirmed'

#print(df_coords.shape)
#print(df_coords[df_coords['type'] == selected_type].shape)

df_choropleth = df_coords[df_coords['type'] == selected_type]

#print(len(df_selected['coords'].unique()))

df_choropleth['cumulative'] = df_choropleth.groupby(['Country.Region'])['cases'].apply(lambda x: x.cumsum())




In [None]:
# your color-scale
scl = [[0.0, '#ffffff'],[0.2, '#b4a8ce'],[0.4, '#8573a9'],
       [0.6, '#7159a3'],[0.8, '#5732a1'],[1.0, '#2c0579']] # purples

In [None]:
# min year in your dataset
date = df_choropleth['date'].min()

In [None]:
data_slider = []
for year in df_choropleth['date'].unique():
    df_segmented =  df_choropleth[(df_choropleth['date']== date)]

    for col in df_segmented.columns:
        df_segmented[col] = df_segmented[col].astype(str)

    data_each_day = dict(
                        type='choropleth',
                        locations = df_segmented['Country.Region'],
                        z=df_segmented['cumulative'].astype(float),
                        #locationmode='USA-states',
                        colorscale = scl,
                        colorbar= {'title':'# Cases'})

    data_slider.append(data_each_day)

In [None]:
#data_slider

In [None]:
steps = []
for i in range(len(data_slider)):
    step = dict(method='restyle',
                args=['visible', [False] * len(data_slider)],
                label='{}'.format(i + 1998))
    step['args'][1][i] = True
    steps.append(step)


In [None]:
sliders = [dict(active=0, pad={"t": 1}, steps=steps)]

In [None]:
df_choropleth[df_choropleth['date'] == df_choropleth['date'].max()].head()

In [None]:
df_choropleth[df_choropleth['date'] == df_choropleth['date'].max()]

In [None]:
#layout = dict(title ='UFO Sightings by State Since 1998', geo=dict(#scope='usa',
                       #projection={'type': 'equirectangular'}),
              #sliders=sliders)

#fig = dict(data=data_slider, layout=layout)
loc_series = df_choropleth[df_choropleth['date'] == df_choropleth['date'].max()]['cumulative']
fig = go.Figure(data=go.Choropleth(
    locations=df_choropleth[df_choropleth['date'] == df_choropleth['date'].max()]['Country.Region'],
    z=loc_series.astype(float),
    colorscale='Reds',
    colorbar={'title': '# Cases'}
))

fig.update_layout(
    title_text='COVID-19 Cumulative Cases',
    #geo_scope='usa',  # limite map scope to USA
)

fig.show()

In [None]:
import reverse_geocoder as rg 
import pprint

def reverse_geocode(coordinates):
    result = rg.search(coordinates)
    
    # result is a list containing ordered dictionary. 
    pprint.pprint(result)

In [None]:
df_coords.head()

In [None]:
type(df_coords['coords'][0])

In [None]:
reverse_geocode(df_coords['coords'][0])

In [None]:
from arcgis.geocoding import reverse_geocode

In [None]:
#results = reverse_geocode([2.2945, 48.8583])

## Descriptive Analysis

In [None]:
confirmed_country.head()

In [None]:
# Plot time series of several countries of interest
poi = ['China', 'US', 'Italy', 'France', 'Spain', 'Australia']
#confirmed_country[poi].plot(figsize=(20,10), linewidth=5, colormap='brg', fontsize=20);

In [None]:
confirmed_long = confirmed_country.reset_index().melt(id_vars='index', value_name='Confirmed').rename(columns={'index':'Day'})


confirmed_long.head()

In [None]:
import altair as alt
from altair_transform import transform_chart

In [None]:
# altair plot 
chart = alt.Chart(confirmed_long).mark_line().encode(
    x='Day',
    y='Confirmed',
    color='Country/Region')


In [None]:
new_chart = transform_chart(chart)
new_chart.data