In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# visualization
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from plotnine import *
import plotly.express as px
import folium


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.


# color pallette
cdr = ['#393e46', '#ff2e63', '#30e3ca'] # grey - red - blue
idr = ['#f8b400', '#ff2e63', '#30e3ca'] # yellow - red - blue

/kaggle/input/novel-corona-virus-2019-dataset/2019_nCoV_data.csv
/kaggle/input/novel-corona-virus-2019-dataset/time_series_covid_19_confirmed.csv
/kaggle/input/novel-corona-virus-2019-dataset/time_series_covid_19_recovered.csv
/kaggle/input/novel-corona-virus-2019-dataset/COVID19_open_line_list.csv
/kaggle/input/novel-corona-virus-2019-dataset/time_series_covid_19_deaths.csv
/kaggle/input/novel-corona-virus-2019-dataset/covid_19_data.csv
/kaggle/input/novel-corona-virus-2019-dataset/COVID19_line_list_data.csv


# Read Data

In [2]:
COVID19 = pd.read_csv("../input/novel-corona-virus-2019-dataset/2019_nCoV_data.csv")
COVID19.head()

Unnamed: 0,Sno,Date,Province/State,Country,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020 12:00:00,Anhui,China,01/22/2020 12:00:00,1.0,0.0,0.0
1,2,01/22/2020 12:00:00,Beijing,China,01/22/2020 12:00:00,14.0,0.0,0.0
2,3,01/22/2020 12:00:00,Chongqing,China,01/22/2020 12:00:00,6.0,0.0,0.0
3,4,01/22/2020 12:00:00,Fujian,China,01/22/2020 12:00:00,1.0,0.0,0.0
4,5,01/22/2020 12:00:00,Gansu,China,01/22/2020 12:00:00,0.0,0.0,0.0


# EDA

As you can see there are a lot of missing data Province/State attribute on data set.

In [3]:
COVID19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1719 entries, 0 to 1718
Data columns (total 8 columns):
Sno               1719 non-null int64
Date              1719 non-null object
Province/State    1257 non-null object
Country           1719 non-null object
Last Update       1719 non-null object
Confirmed         1719 non-null float64
Deaths            1719 non-null float64
Recovered         1719 non-null float64
dtypes: float64(3), int64(1), object(4)
memory usage: 107.6+ KB


In [4]:
# checking for missing value
COVID19.isna().sum()

Sno                 0
Date                0
Province/State    462
Country             0
Last Update         0
Confirmed           0
Deaths              0
Recovered           0
dtype: int64


## Cleaning Data

In [5]:
# replacing Mainland china with just China
COVID19['Country'] = COVID19['Country'].replace('Mainland China', 'China')

# filling missing values with NA
COVID19[['Province/State']] = COVID19[['Province/State']].fillna('NA')


## Countries which have been affected by the Coronavirus (2019-nCoV) untill now


In [6]:
# Countries affected

countries = COVID19['Country'].unique().tolist()
print(countries)

print("\nTotal countries affected by virus: ",len(countries))

['China', 'US', 'Japan', 'Thailand', 'South Korea', 'Hong Kong', 'Macau', 'Taiwan', 'Singapore', 'Philippines', 'Malaysia', 'Vietnam', 'Australia', 'Mexico', 'Brazil', 'France', 'Nepal', 'Canada', 'Cambodia', 'Sri Lanka', 'Ivory Coast', 'Germany', 'Finland', 'United Arab Emirates', 'India', 'Italy', 'Sweden', 'Russia', 'Spain', 'UK', 'Belgium', 'Others', 'Egypt']

Total countries affected by virus:  33


## Creating a dataframe with total number of confirmed cases for every country

In [7]:
Number_of_countries = len(COVID19['Country'].value_counts())


situation = pd.DataFrame(COVID19.groupby('Country')['Confirmed'].sum())
situation['Country'] = situation.index
situation.index=np.arange(1, Number_of_countries + 1)

global_cases = situation[['Country','Confirmed']]
global_cases.sort_values(by=['Confirmed'],ascending=False)


Unnamed: 0,Country,Confirmed
6,China,773500.0
20,Others,2161.0
23,Singapore,797.0
11,Hong Kong,681.0
15,Japan,591.0
29,Thailand,581.0
24,South Korea,444.0
28,Taiwan,320.0
17,Malaysia,303.0
1,Australia,284.0


## Creating a dataframe with total number of confirmed cases, deaths and recovered for every countrysides/provinces

In [8]:
provinces_situation = COVID19.groupby(['Country', 'Province/State'])['Confirmed', 'Deaths', 'Recovered'].max()
provinces_situation.style.background_gradient(cmap='viridis')



Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Deaths,Recovered
Country,Province/State,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Australia,,4,0,0
Australia,New South Wales,4,0,4
Australia,Queensland,5,0,0
Australia,South Australia,2,0,2
Australia,Victoria,4,0,4
Belgium,,1,0,1
Brazil,,0,0,0
Cambodia,,1,0,1
Canada,British Columbia,5,0,0
Canada,"London, ON",1,0,1


## Number of Confirmed and Deaths cases with visual methods

In [9]:
fig = px.bar(COVID19[['Country', 'Confirmed']].sort_values('Confirmed', ascending=False), 
             y="Confirmed", x="Country", color='Country', 
             log_y=True, template='ggplot2', title='Confirmed Cases')
fig.show()

fig = px.bar(COVID19[['Country', 'Deaths']].sort_values('Deaths', ascending=False), 
             y="Deaths", x="Country", color='Country', title='Deaths',
             log_y=True, template='ggplot2')
fig.show()

In [10]:
fig = px.choropleth(COVID19, locations="Country", 
                    locationmode='country names', color="Confirmed", 
                    hover_name="Country", range_color=[1,2000], 
                    color_continuous_scale=px.colors.diverging.Tealrose, 
                    title='Countries with Confirmed Cases')
fig.update(layout_coloraxis_showscale=False)
fig.show()

# ------------------------------------------------------------------------

fig = px.choropleth(COVID19[COVID19['Deaths']>0], 
                    locations="Country", locationmode='country names',
                    color="Deaths", hover_name="Country", 
                    range_color=[1,50], color_continuous_scale=px.colors.sequential.Viridis,
                    title='Countries with Deaths Reported')
fig.update(layout_coloraxis_showscale=False)
fig.show()




# Conclusion

   * If this tutorial is not enough you can check also
        https://www.kaggle.com/imdevskp/covid-19-analysis-viz-prediction-comparisons
        https://www.kaggle.com/parulpandey/wuhan-coronavirus-a-geographical-analysis/data
   * After this tutorial, my aim is to prepare 'kernel' which is connected to Deep Learning 'not clear' data set.
   * If you have any suggestions, please could you write for me? I wil be happy for comment and critics!
   * Thank you for your suggestion and votes ;)

