In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.utils.data as Data
%matplotlib inline

# Read the data
The first step of any data science project is to explore the data and understand what it contains, because every later decision depends on it.
We start by loading the CSV file with pandas and inspecting its contents.

In [64]:
# Load the case records from the CSV file and preview the first five rows
# (five is the default size of DataFrame.head()).
df = pd.read_csv('2019_nCoV_data.csv')
df.head()

Unnamed: 0,Sno,Date,Province/State,Country,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020 12:00:00,Anhui,China,2020-01-22 12:00:00,1.0,0.0,0.0
1,2,01/22/2020 12:00:00,Beijing,China,2020-01-22 12:00:00,14.0,0.0,0.0
2,3,01/22/2020 12:00:00,Chongqing,China,2020-01-22 12:00:00,6.0,0.0,0.0
3,4,01/22/2020 12:00:00,Fujian,China,2020-01-22 12:00:00,1.0,0.0,0.0
4,5,01/22/2020 12:00:00,Gansu,China,2020-01-22 12:00:00,0.0,0.0,0.0


In [65]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Sno,Confirmed,Deaths,Recovered
count,700.0,700.0,700.0,700.0
mean,350.5,141.224286,3.077143,3.407143
std,202.21688,859.834237,27.759694,22.586377
min,1.0,0.0,0.0,0.0
25%,175.75,2.0,0.0,0.0
50%,350.5,8.0,0.0,0.0
75%,525.25,55.25,0.0,1.0
max,700.0,13522.0,414.0,396.0


In [66]:
# Number of missing values in each column.
df.isna().sum()

Sno                 0
Date                0
Province/State    164
Country             0
Last Update         0
Confirmed           0
Deaths              0
Recovered           0
dtype: int64

# Find the regions with the most confirmed cases
We will take a look at the places where the number of confirmed cases of this deadly virus is above the dataset mean,
both country-wise and province/state-wise.

In [68]:
# Rows whose confirmed count exceeds the dataset-wide mean.
# The mean is computed from the data rather than pasted in from the describe()
# output, so the filter stays correct if the CSV is refreshed.
df[df['Confirmed'] > df['Confirmed'].mean()]

Unnamed: 0,Sno,Date,Province/State,Country,Last Update,Confirmed,Deaths,Recovered
13,14,01/22/2020 12:00:00,Hubei,China,2020-01-22 12:00:00,444.0,0.0,0.0
51,52,01/23/2020 12:00:00,Hubei,Mainland China,2020-01-23 12:00:00,444.0,17.0,28.0
83,84,01/24/2020 12:00:00,Hubei,Mainland China,2020-01-24 12:00:00,549.0,24.0,31.0
124,125,01/25/2020 22:00:00,Hubei,Mainland China,2020-01-25 22:00:00,1052.0,52.0,42.0
168,169,01/26/2020 23:00:00,Hubei,Mainland China,2020-01-26 23:00:00,1423.0,76.0,44.0
...,...,...,...,...,...,...,...,...
642,643,02/03/2020 21:40:00,Beijing,Mainland China,2020-03-02 09:23:00,212.0,1.0,12.0
643,644,02/03/2020 21:40:00,Shanghai,Mainland China,2020-04-02 00:13:00,208.0,1.0,10.0
644,645,02/03/2020 21:40:00,Fujian,Mainland China,2020-03-02 11:33:00,179.0,0.0,1.0
645,646,02/03/2020 21:40:00,Heilongjiang,Mainland China,2020-04-02 00:53:00,155.0,2.0,2.0


In [69]:
# Distinct countries that have at least one record above the mean confirmed count.
# The threshold is computed instead of hard-coded, and row filter + column
# selection happen in a single .loc call.
df.loc[df['Confirmed'] > df['Confirmed'].mean(), 'Country'].unique()

array(['China', 'Mainland China'], dtype=object)

In [70]:
# Distinct provinces/states that have at least one record above the mean
# confirmed count; the threshold is computed instead of hard-coded.
df.loc[df['Confirmed'] > df['Confirmed'].mean(), 'Province/State'].unique()

array(['Hubei', 'Guangdong', 'Zhejiang', 'Henan', 'Hunan', 'Anhui',
       'Chongqing', 'Jiangxi', 'Shandong', 'Sichuan', 'Jiangsu',
       'Shanghai', 'Beijing', 'Fujian', 'Heilongjiang', 'Shaanxi'],
      dtype=object)

In [71]:
# Mean confirmed count per province/state, ordered from highest to lowest.
regionwise = (
    df.groupby("Province/State")["Confirmed"]
    .mean()
    .reset_index()
    .sort_values("Confirmed", ascending=False)
)
regionwise

Unnamed: 0,Province/State,Confirmed
20,Hubei,4730.615385
55,Zhejiang,345.923077
12,Guangdong,305.461538
18,Henan,255.384615
21,Hunan,234.230769
0,Anhui,178.076923
25,Jiangxi,164.153846
9,Chongqing,154.923077
44,Sichuan,123.230769
41,Shandong,121.615385


In [72]:
# Summary statistics of the per-province mean confirmed counts.
regionwise.describe()

Unnamed: 0,Confirmed
count,56.0
mean,135.165301
std,630.472891
min,0.75
25%,1.96875
50%,12.307692
75%,68.711538
max,4730.615385


In [73]:
# Provinces/states whose mean confirmed count is above the average of all
# per-province means. The threshold is computed from `regionwise` instead of
# being copied from the describe() output, so it cannot go stale.
regionwise_upmean = regionwise[regionwise['Confirmed'] > regionwise['Confirmed'].mean()]
regionwise_upmean

Unnamed: 0,Province/State,Confirmed
20,Hubei,4730.615385
55,Zhejiang,345.923077
12,Guangdong,305.461538
18,Henan,255.384615
21,Hunan,234.230769
0,Anhui,178.076923
25,Jiangxi,164.153846
9,Chongqing,154.923077


In [None]:
# Bar chart of the provinces/states in `regionwise_upmean`, also saved to Corona.png.
# Uses the explicit figure/axes interface and labels the plot so it can be read
# on its own. The default figure size is kept (figsize is given in inches, not pixels).
fig, ax = plt.subplots()
ax.bar(regionwise_upmean['Province/State'].values, regionwise_upmean['Confirmed'].values)
ax.set_xlabel('Province/State')
ax.set_ylabel('Mean confirmed cases')
ax.set_title('Provinces/States with above-average confirmed cases')
ax.tick_params(axis='x', rotation=45)  # keep the province names from overlapping
fig.tight_layout()
fig.savefig('Corona.png')

In [74]:
# Provinces/states whose mean confirmed count is below the average of all
# per-province means. The threshold is computed from `regionwise` instead of
# being copied from the describe() output, so it cannot go stale.
regionwise_downmean = regionwise[regionwise['Confirmed'] < regionwise['Confirmed'].mean()]
regionwise_downmean

Unnamed: 0,Province/State,Confirmed
44,Sichuan,123.230769
41,Shandong,121.615385
24,Jiangsu,118.615385
3,Beijing,101.461538
42,Shanghai,95.692308
10,Fujian,79.153846
13,Guangxi,65.230769
40,Shaanxi,56.923077
16,Hebei,54.0
54,Yunnan,49.0


In [75]:
# Country label recorded on each Hubei row (row filter and column pick in one .loc).
df.loc[df['Province/State'] == 'Hubei', 'Country']

13              China
51     Mainland China
83     Mainland China
124    Mainland China
168    Mainland China
215    Mainland China
267    Mainland China
319    Mainland China
375    Mainland China
434    Mainland China
497    Mainland China
564    Mainland China
631    Mainland China
Name: Country, dtype: object

# Analysis of the Data
From the analysis above we can infer the following points:
* <mark>Hubei</mark> in China is the State/Province most affected by this virus, with an average confirmed case count of about <mark>4730.615385</mark>
* There are <mark>8 Provinces/States in China</mark> whose average confirmed case count is above the regional mean; these are the ones most affected by the coronavirus.
* Most European states/provinces are less affected by this virus.
* Tibet is the least affected State/Province in our data set.

In [76]:
# Countries of the rows that have no Province/State value.
df.loc[df['Province/State'].isna(), 'Country']

35           Japan
36        Thailand
37     South Korea
73           Japan
74        Thailand
          ...     
689        Finland
691          Nepal
692          Spain
693      Sri Lanka
694         Sweden
Name: Country, Length: 164, dtype: object