In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [48]:
# Load the 2019-nCoV case reports (the file name indicates 2020-01-21 .. 2020-02-06).
corona = pd.read_csv('datasets/2019_nCoV_20200121_20200206.csv')
# Preview a random handful of rows; the fixed seed keeps the displayed
# sample identical across re-runs of the notebook.
corona.sample(10, random_state=42)

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Suspected,Recovered,Death
1496,Illinois,United States,1/26/2020 23:00,1.0,,,
1535,Guizhou,Mainland China,1/26/2020 11:00 AM,5.0,,,
195,,Belgium,2/4/20 15:43,1.0,,0.0,0.0
729,Ontario,Canada,2/1/2020 18:12,3.0,,0.0,0.0
1473,Jiangxi,Mainland China,1/26/2020 23:00,48.0,,,
917,,South Korea,1/30/2020 21:30,6.0,,,
838,Guizhou,Mainland China,1/31/2020 14:00,29.0,,2.0,
1578,Inner Mongolia,Mainland China,1/25/2020 12:00 PM,7.0,,,
1537,Ningxia,Mainland China,1/26/2020 11:00 AM,4.0,,,
1257,New South Wales,Australia,1/28/2020 18:00,4.0,,,


In [49]:
def data_inv(df):
    """Print a quick inventory of a DataFrame.

    Writes to stdout, in order: the row count, the column count, the column
    index, the dtype of each column, and the number of missing values for
    every column that has at least one. Sections are separated by a line of
    30 dashes. Returns None.
    """
    print('Number of Rows: ', df.shape[0])
    print('Number of Columns: ', df.shape[1])

    # Only columns with at least one null are reported in the last section.
    null_totals = df.isnull().sum()
    sections = (
        ('Dataset Columns: \n', df.columns),
        ('Datatype of Each Column: \n', df.dtypes),
        ('Missing Rows in Each Column: \n', null_totals[null_totals > 0]),
    )
    for heading, payload in sections:
        print('-' * 30)
        print(heading)
        print(payload)
# Inventory the freshly loaded frame: size, columns, dtypes and null counts.
data_inv(corona)

Number of Rows:  1877
Number of Columns:  7
------------------------------
Dataset Columns: 

Index(['Province/State', 'Country/Region', 'Last Update', 'Confirmed',
       'Suspected', 'Recovered', 'Death'],
      dtype='object')
------------------------------
Datatype of Each Column: 

Province/State     object
Country/Region     object
Last Update        object
Confirmed         float64
Suspected         float64
Recovered         float64
Death             float64
dtype: object
------------------------------
Missing Rows in Each Column: 

Province/State     459
Confirmed           30
Suspected         1789
Recovered          876
Death              999
dtype: int64


## Dropping Columns
- 'Suspected' is missing in about 95% of rows (1789 of 1877), so the column is dropped.

In [50]:
# Remove the almost-empty 'Suspected' column. Reassigning instead of using
# inplace=True, together with errors='ignore', keeps this cell safe to
# re-run after the column is already gone (the in-place form raised KeyError).
corona = corona.drop(columns='Suspected', errors='ignore')
corona.head()

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Recovered,Death
0,Hubei,Mainland China,2/5/20 16:43,16678.0,538.0,479.0
1,Guangdong,Mainland China,2/5/20 13:23,895.0,49.0,0.0
2,Zhejiang,Mainland China,2/5/20 15:13,895.0,78.0,0.0
3,Henan,Mainland China,2/5/20 15:03,764.0,47.0,2.0
4,Hunan,Mainland China,2/5/20 15:23,661.0,54.0,0.0


## Change the Index
- No column is unique on its own, so we keep the default integer index. It may need to be reset later if any rows are removed.

In [51]:
# Report, column by column, whether all of its values are distinct
# (i.e. whether it could serve as an index).
for column in corona.columns:
    verdict = "is unique" if corona[column].is_unique else "is NOT unique"
    print(f"{column} {verdict}")

Province/State is NOT unique
Country/Region is NOT unique
Last Update is NOT unique
Confirmed is NOT unique
Recovered is NOT unique
Death is NOT unique


## Clean-up Fields
- 'Last Update' needs to be converted to a datetime type
- 'Mainland China' in 'Country/Region' needs to be changed to just 'China'

In [52]:
# Count how many rows each country/region contributes.
country_counts = corona['Country/Region'].value_counts()
country_counts

Mainland China          1028
United States            154
Australia                 81
Canada                    50
South Korea               34
Taiwan                    34
Thailand                  34
Hong Kong                 34
Japan                     34
Macau                     33
Singapore                 32
Vietnam                   32
Malaysia                  30
France                    30
Nepal                     29
Cambodia                  24
Sri Lanka                 23
Germany                   22
United Arab Emirates      18
Finland                   17
Philippines               16
India                     15
Italy                     14
Sweden                    13
UK                        13
Russia                    13
Spain                     12
Belgium                    3
Ivory Coast                2
Colombia                   1
Brazil                     1
Mexico                     1
Name: Country/Region, dtype: int64

In [53]:
# Relabel 'Mainland China' as 'China'. The replacement is restricted to the
# 'Country/Region' column: a frame-wide replace would also rewrite the same
# string wherever it appeared in any other column.
corona['Country/Region'] = corona['Country/Region'].replace('Mainland China', 'China')
corona['Country/Region'].value_counts().head()

China            1028
United States     154
Australia          81
Canada             50
Japan              34
Name: Country/Region, dtype: int64

In [54]:
# Number of distinct country/region labels after the relabelling above.
n_countries = corona['Country/Region'].nunique()
n_countries

32

In [59]:
# Treat missing case counts as zero and label missing provinces/states
# explicitly. One DataFrame-level fillna with a per-column mapping replaces
# the `corona[col].fillna(..., inplace=True)` pattern, which mutates a
# temporary Series: it triggers a chained-assignment warning on recent pandas
# and leaves `corona` unchanged once Copy-on-Write is enabled.
corona = corona.fillna({
    'Confirmed': 0,
    'Recovered': 0,
    'Death': 0,
    'Province/State': 'Unknown',
})
# Fixed seed so the preview is reproducible across re-runs.
corona.sample(5, random_state=42)

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Recovered,Death
75,Jiangxi,China,2/5/20 1:33,548.0,27.0,0.0
1390,Ningxia,China,1/27/2020 19:00,7.0,0.0,0.0
68,"Seattle, WA",United States,2/1/20 19:43,1.0,0.0,0.0
1831,Liaoning,China,1/22/2020 12:00,2.0,0.0,0.0
1444,Taiwan,Taiwan,1/27/2020 9:00,5.0,0.0,0.0


In [60]:
# 'Last Update' mixes several text layouts (e.g. '1/26/2020 23:00',
# '1/26/2020 11:00 AM', '2/4/20 15:43'). Parsing each value on its own avoids
# depending on one format inferred for the whole column, which newer pandas
# releases enforce and which raises on data like this. Values that are
# already timestamps pass through unchanged, so the cell can be re-run.
corona['Last Update'] = corona['Last Update'].apply(pd.to_datetime)
# Fixed seed so the preview is reproducible across re-runs.
corona.sample(10, random_state=42)

Unnamed: 0,Province/State,Country/Region,Last Update,Confirmed,Recovered,Death
7,Chongqing,China,2020-02-05 09:13:00,376.0,15.0,2.0
313,Unknown,South Korea,2020-02-02 02:23:00,15.0,0.0,0.0
494,Shanghai,China,2020-02-03 00:03:00,193.0,10.0,1.0
622,Anhui,China,2020-02-02 01:33:00,340.0,5.0,0.0
1080,Taiwan,Taiwan,2020-01-29 14:30:00,8.0,0.0,0.0
604,British Columbia,Canada,2020-02-01 18:12:00,1.0,0.0,0.0
620,Henan,China,2020-02-02 00:53:00,493.0,4.0,2.0
915,Unknown,Japan,2020-01-30 21:30:00,11.0,1.0,0.0
1489,Jilin,China,2020-01-26 23:00:00,6.0,0.0,0.0
439,Tianjin,China,2020-02-02 09:43:00,48.0,1.0,0.0


In [61]:
# Confirm the column types after the datetime conversion.
column_dtypes = corona.dtypes
column_dtypes

Province/State            object
Country/Region            object
Last Update       datetime64[ns]
Confirmed                float64
Recovered                float64
Death                    float64
dtype: object

## Renaming Columns
- 'Province/State' -> 'Division'
- 'Country/Region' -> 'Country'
- 'Last Update' -> 'Date'

In [67]:
# Shorter column names for the rest of the analysis.
new_names = {
    'Province/State': 'Division',
    'Country/Region': 'Country',
    'Last Update': 'Date',
}
# Reassign rather than mutate in place so the cell's effect is explicit.
corona = corona.rename(columns=new_names)
# Fixed seed so the preview is reproducible across re-runs.
corona.sample(5, random_state=42)

Unnamed: 0,Division,Country,Date,Confirmed,Recovered,Death
1833,Ningxia,China,2020-01-22 12:00:00,1.0,0.0,0.0
1532,Hong Kong,Hong Kong,2020-01-26 11:00:00,8.0,0.0,0.0
527,Ontario,Canada,2020-02-01 18:12:00,3.0,0.0,0.0
749,"Seattle, WA",United States,2020-02-01 19:43:00,1.0,0.0,0.0
1780,Hunan,China,2020-01-23 12:00:00,9.0,0.0,0.0


### EDA

In [69]:
# Put the reports in chronological order before exploring them.
corona = corona.sort_values('Date')
corona.head()

Unnamed: 0,Division,Country,Date,Confirmed,Recovered,Death
1876,Washington,United States,2020-01-21,1.0,0.0,0.0
1850,Shanghai,China,2020-01-21,9.0,0.0,0.0
1851,Yunnan,China,2020-01-21,1.0,0.0,0.0
1852,Beijing,China,2020-01-21,10.0,0.0,0.0
1853,Taiwan,Taiwan,2020-01-21,1.0,0.0,0.0


In [76]:
# Summary statistics for every column, not only the numeric ones.
summary = corona.describe(include='all')
summary

Unnamed: 0,Division,Country,Date,Confirmed,Recovered,Death
count,1877,1877,1877,1877.0,1877.0,1877.0
unique,57,32,207,,,
top,Unknown,China,2020-01-25 12:00:00,,,
freq,459,1028,86,,,
first,,,2020-01-21 00:00:00,,,
last,,,2020-02-05 16:43:00,,,
mean,,,,162.968034,4.599893,3.491209
std,,,,1023.304837,29.317235,31.794978
min,,,,0.0,0.0,0.0
25%,,,,2.0,0.0,0.0


In [84]:
# Group the rows by country, then preview the rows belonging to China.
gb = corona.groupby(by='Country')
china_rows = gb.get_group('China')
china_rows.head()

Unnamed: 0,Division,Country,Date,Confirmed,Recovered,Death
1850,Shanghai,China,2020-01-21,9.0,0.0,0.0
1851,Yunnan,China,2020-01-21,1.0,0.0,0.0
1852,Beijing,China,2020-01-21,10.0,0.0,0.0
1854,Jilin,China,2020-01-21,0.0,0.0,0.0
1855,Sichuan,China,2020-01-21,2.0,0.0,0.0
