# **<center><span style="color:#fd7b12;">Imports  </span></center>**

In [1]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
# from pandas_datareader import wb
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from wordcloud import WordCloud
from plotly.subplots import make_subplots

# **<center><span style="color:#fd7b12;">Data Wrangling  </span></center>**

### Importation du dataset

In [2]:
report = pd.read_csv('covid_19_clean_complete.csv',parse_dates=['Date'])
report.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Afghanistan,33.0,65.0,2020-01-22,0,0,0
1,,Albania,41.1533,20.1683,2020-01-22,0,0,0
2,,Algeria,28.0339,1.6596,2020-01-22,0,0,0
3,,Andorra,42.5063,1.5218,2020-01-22,0,0,0
4,,Angola,-11.2027,17.8739,2020-01-22,0,0,0


### Création de la variable cas actif

In [3]:
report['Active'] = report['Confirmed'] - report['Deaths'] - report['Recovered']
report.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active
0,,Afghanistan,33.0,65.0,2020-01-22,0,0,0,0
1,,Albania,41.1533,20.1683,2020-01-22,0,0,0,0
2,,Algeria,28.0339,1.6596,2020-01-22,0,0,0,0
3,,Andorra,42.5063,1.5218,2020-01-22,0,0,0,0
4,,Angola,-11.2027,17.8739,2020-01-22,0,0,0,0


### Remplir les valeurs manquantes

In [4]:
report[['Province/State']] = report[['Province/State']].fillna('')
report.isnull().sum().sum()

0

### Dataset relatif au dernier enregistrement

In [5]:
last_report = report[report['Date'] == max(report['Date'])]
last_report.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active
26136,,Afghanistan,33.0,65.0,2020-04-30,2171,64,260,1847
26137,,Albania,41.1533,20.1683,2020-04-30,773,31,470,272
26138,,Algeria,28.0339,1.6596,2020-04-30,4006,450,1779,1777
26139,,Andorra,42.5063,1.5218,2020-04-30,745,42,468,235
26140,,Angola,-11.2027,17.8739,2020-04-30,27,2,7,18


### Dataset relatif au dernier enregistrement/pays

In [6]:
countries_last_report = last_report.groupby('Country/Region')['Confirmed', 'Deaths', 'Recovered', 'Active'].sum().reset_index()
countries_last_report.head()

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active
0,Afghanistan,2171,64,260,1847
1,Albania,773,31,470,272
2,Algeria,4006,450,1779,1777
3,Andorra,745,42,468,235
4,Angola,27,2,7,18


In [7]:
patients = pd.read_csv('COVID19_line_list_data.csv',parse_dates=['exposure_start','exposure_end'])
patients = patients[['country','gender','age','exposure_start','exposure_end','death','recovered','symptom']]
patients.sample(5)

Unnamed: 0,country,gender,age,exposure_start,exposure_end,death,recovered,symptom
484,Singapore,male,64.0,NaT,NaT,0,0,
598,South Korea,male,75.0,NaT,NaT,0,0,
150,China,female,55.0,NaT,NaT,0,0,
966,Italy,male,35.0,NaT,2020-02-03,0,0,
654,South Korea,male,36.0,NaT,NaT,2/25/2020,0,


In [8]:
patients["recovered"].unique()

array(['0', '1', '02/12/20', '1/15/2020', '12/30/1899', '02/08/20',
       '2/14/2020', '02/04/20', '2/18/2020', '02/05/20', '2/17/2020',
       '02/09/20', '2/15/2020', '2/27/2020', '2/19/2020', '2/20/2020',
       '1/17/2020', '02/07/20', '2/21/2020', '2/23/2020', '02/11/20',
       '2/22/2020', '2/16/2020', '2/24/2020', '2/26/2020', '2/25/2020',
       '02/06/20', '2/28/2020', '1/30/2020', '2/13/2020', '02/02/20',
       '1/31/2020'], dtype=object)

In [9]:
patients_recovered = patients.query('recovered!="0"')

In [10]:
patients_recovered['cure_time'] = patients_recovered['exposure_end'] - patients_recovered['exposure_start']
patients_recovered.sample(5)

Unnamed: 0,country,gender,age,exposure_start,exposure_end,death,recovered,symptom,cure_time
526,Singapore,male,35.0,NaT,NaT,0,2/25/2020,,NaT
428,Malaysia,male,11.0,NaT,NaT,0,2/14/2020,,NaT
427,Malaysia,female,65.0,NaT,NaT,0,2/14/2020,,NaT
495,Singapore,male,35.0,NaT,NaT,0,2/24/2020,,NaT
457,Singapore,female,56.0,NaT,2020-01-19,0,2/18/2020,,NaT


In [11]:
patients_deceased = patients.query('death!="0"')
patients_deceased.head()

Unnamed: 0,country,gender,age,exposure_start,exposure_end,death,recovered,symptom
48,China,male,61.0,NaT,NaT,1,0,
49,China,male,69.0,NaT,NaT,1,0,
50,China,male,89.0,NaT,NaT,1,0,
51,China,male,89.0,NaT,NaT,1,0,
52,China,male,66.0,NaT,NaT,1,0,


In [12]:
patients_deceased['time_to_death'] = patients_deceased['exposure_end'] - patients_deceased['exposure_start']
patients_deceased.sample(5)

Unnamed: 0,country,gender,age,exposure_start,exposure_end,death,recovered,symptom,time_to_death
262,Japan,female,85.0,NaT,NaT,2/13/2020,0,fatigue,NaT
102,China,male,79.0,NaT,NaT,1,0,,NaT
291,Japan,male,85.0,NaT,NaT,2/26/2020,0,"fever, pneumonia",NaT
100,China,female,76.0,NaT,NaT,1,0,,NaT
62,China,male,86.0,NaT,NaT,1,0,,NaT


In [13]:
patients_deceased['time_to_death'].unique()

array([          'NaT', 172800000000000], dtype='timedelta64[ns]')

In [14]:
patients_deceased['time_to_death'].value_counts()

2 days    1
Name: time_to_death, dtype: int64

<a id="2"></a>
# **<center><span style="color:#fd7b12;">EDA  </span></center>**

In [15]:
last_report.nunique()

Province/State     81
Country/Region    187
Lat               255
Long              258
Date                1
Confirmed         227
Deaths            102
Recovered         192
Active            188
dtype: int64

In [16]:
last_report.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active
26136,,Afghanistan,33.0,65.0,2020-04-30,2171,64,260,1847
26137,,Albania,41.1533,20.1683,2020-04-30,773,31,470,272
26138,,Algeria,28.0339,1.6596,2020-04-30,4006,450,1779,1777
26139,,Andorra,42.5063,1.5218,2020-04-30,745,42,468,235
26140,,Angola,-11.2027,17.8739,2020-04-30,27,2,7,18


In [17]:
report.Date.max()

Timestamp('2020-04-30 00:00:00')

In [18]:
last_report[["Confirmed", "Deaths", "Recovered", "Active"]].sum().to_frame()

Unnamed: 0,0
Confirmed,3256841
Deaths,233388
Recovered,986389
Active,2037064


Les dix pays ayant enregistrer le plus de cas

In [19]:
last_report.sort_values(by = "Confirmed",ascending = False)

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered,Active
26361,,US,37.0902,-95.7129,2020-04-30,1069424,62996,153947,852481
26337,,Spain,40.0000,-4.0000,2020-04-30,213435,24543,112050,76842
26273,,Italy,43.0000,12.0000,2020-04-30,205463,27967,75945,101551
26359,,United Kingdom,55.3781,-3.4360,2020-04-30,171253,26771,0,144482
26252,,France,46.2276,2.2137,2020-04-30,165764,24376,49476,91912
...,...,...,...,...,...,...,...,...,...
26384,Anguilla,United Kingdom,18.2206,-63.0686,2020-04-30,3,0,3,0
26393,Saint Pierre and Miquelon,France,46.8852,-56.3159,2020-04-30,1,0,0,1
26398,,Comoros,-11.6455,43.3333,2020-04-30,1,0,0,1
26214,Tibet,China,31.6927,88.0924,2020-04-30,1,0,1,0


In [20]:
top10 = (last_report.groupby('Country/Region')['Confirmed'].sum().sort_values(ascending=False)).head(10).reset_index()
top10

Unnamed: 0,Country/Region,Confirmed
0,US,1069424
1,Spain,213435
2,Italy,205463
3,United Kingdom,172481
4,France,167299
5,Germany,163009
6,Turkey,120204
7,Russia,106498
8,Iran,94640
9,Brazil,87187


In [22]:
fig = px.bar(top10,x='Country/Region',y='Confirmed',text_auto=True,title='Nombre de cas par pays',color='Confirmed')
fig.update(layout_coloraxis_showscale=False)
fig.show()