# Import Libraries

In [4]:
import pandas as pd
import numpy as np

# Data Description
* dateRep: The date of the reported data
* day: The day of the report (integer, e.g., 1-31).
* month: The month of the report (integer, e.g., 1-12).
* year: The year of the report (integer, e.g., 2020).
* cases: The number of new confirmed cases reported on this date.
* deaths: The number of new deaths reported on this date.
* countriesAndTerritories: The name of the country or territory to which the data belongs.
* geoId: A short geographical identifier (code) for the country or territory.
* countryterritoryCode: The ISO 3166-1 alpha-3 code representing the country or territory.
* popData2018: The population of the country or territory as of 2018.


# Import Data

In [5]:
data=pd.read_csv('ECDC_COVID_19.csv')

In [6]:
data

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018
0,2020-04-08,8,4,2020,30,4,Afghanistan,AF,AFG,37172386.0
1,2020-04-07,7,4,2020,38,0,Afghanistan,AF,AFG,37172386.0
2,2020-04-06,6,4,2020,29,2,Afghanistan,AF,AFG,37172386.0
3,2020-04-05,5,4,2020,35,1,Afghanistan,AF,AFG,37172386.0
4,2020-04-04,4,4,2020,0,0,Afghanistan,AF,AFG,37172386.0
...,...,...,...,...,...,...,...,...,...,...
9508,2020-03-25,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0
9509,2020-03-24,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14439018.0
9510,2020-03-23,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0
9511,2020-03-22,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0


## Basic Informations

In [7]:
data.columns

Index(['dateRep', 'day', 'month', 'year', 'cases', 'deaths',
       'countriesAndTerritories', 'geoId', 'countryterritoryCode',
       'popData2018'],
      dtype='object')

In [8]:
data.shape

(9513, 10)

In [9]:
data.size

95130

In [10]:
data.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018
0,2020-04-08,8,4,2020,30,4,Afghanistan,AF,AFG,37172386.0
1,2020-04-07,7,4,2020,38,0,Afghanistan,AF,AFG,37172386.0
2,2020-04-06,6,4,2020,29,2,Afghanistan,AF,AFG,37172386.0
3,2020-04-05,5,4,2020,35,1,Afghanistan,AF,AFG,37172386.0
4,2020-04-04,4,4,2020,0,0,Afghanistan,AF,AFG,37172386.0


In [11]:
data.tail()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018
9508,2020-03-25,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0
9509,2020-03-24,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14439018.0
9510,2020-03-23,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0
9511,2020-03-22,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0
9512,2020-03-21,21,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9513 entries, 0 to 9512
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   dateRep                  9513 non-null   object 
 1   day                      9513 non-null   int64  
 2   month                    9513 non-null   int64  
 3   year                     9513 non-null   int64  
 4   cases                    9513 non-null   int64  
 5   deaths                   9513 non-null   int64  
 6   countriesAndTerritories  9513 non-null   object 
 7   geoId                    9488 non-null   object 
 8   countryterritoryCode     9324 non-null   object 
 9   popData2018              9370 non-null   float64
dtypes: float64(1), int64(5), object(4)
memory usage: 743.3+ KB


In [13]:
data.describe()

Unnamed: 0,day,month,year,cases,deaths,popData2018
count,9513.0,9513.0,9513.0,9513.0,9513.0,9370.0
mean,15.548828,2.591822,2019.992957,146.314517,8.564911,64311960.0
std,9.427045,1.284901,0.083631,1141.114073,71.611262,201564200.0
min,1.0,1.0,2019.0,-9.0,0.0,1000.0
25%,7.0,2.0,2020.0,0.0,0.0,3545883.0
50%,16.0,3.0,2020.0,0.0,0.0,10627160.0
75%,24.0,3.0,2020.0,13.0,0.0,42723140.0
max,31.0,12.0,2020.0,34272.0,2004.0,1392730000.0


## Checking null values

In [14]:
data.isnull().sum()

dateRep                      0
day                          0
month                        0
year                         0
cases                        0
deaths                       0
countriesAndTerritories      0
geoId                       25
countryterritoryCode       189
popData2018                143
dtype: int64

## Checking uniquenes

In [15]:
data.nunique()

dateRep                    100
day                         31
month                        5
year                         2
cases                      728
deaths                     222
countriesAndTerritories    204
geoId                      203
countryterritoryCode       199
popData2018                199
dtype: int64

## Checking duplicates

In [16]:
data.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9508    False
9509    False
9510    False
9511    False
9512    False
Length: 9513, dtype: bool

In [17]:
data.duplicated().sum()

0

No duplicate data

## Convert datatype

In [18]:
data.columns

Index(['dateRep', 'day', 'month', 'year', 'cases', 'deaths',
       'countriesAndTerritories', 'geoId', 'countryterritoryCode',
       'popData2018'],
      dtype='object')

In [19]:
data['dateRep']=pd.to_datetime(data['dateRep'])

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9513 entries, 0 to 9512
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   dateRep                  9513 non-null   datetime64[ns]
 1   day                      9513 non-null   int64         
 2   month                    9513 non-null   int64         
 3   year                     9513 non-null   int64         
 4   cases                    9513 non-null   int64         
 5   deaths                   9513 non-null   int64         
 6   countriesAndTerritories  9513 non-null   object        
 7   geoId                    9488 non-null   object        
 8   countryterritoryCode     9324 non-null   object        
 9   popData2018              9370 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(5), object(3)
memory usage: 743.3+ KB


## Feature Engineering

In [21]:
data['Day_name']=data['dateRep'].dt.day_name()

In [22]:
data['Month_name']=data['dateRep'].dt.month_name()

In [23]:
data.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
0,2020-04-08,8,4,2020,30,4,Afghanistan,AF,AFG,37172386.0,Wednesday,April
1,2020-04-07,7,4,2020,38,0,Afghanistan,AF,AFG,37172386.0,Tuesday,April
2,2020-04-06,6,4,2020,29,2,Afghanistan,AF,AFG,37172386.0,Monday,April
3,2020-04-05,5,4,2020,35,1,Afghanistan,AF,AFG,37172386.0,Sunday,April
4,2020-04-04,4,4,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,April


In [24]:
data['countriesAndTerritories'].unique()

array(['Afghanistan', 'Albania', 'Andorra', 'Algeria', 'Angola',
       'Anguilla', 'Antigua_and_Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia',
       'Bonaire, Saint Eustatius and Saba', 'Bosnia_and_Herzegovina',
       'Botswana', 'Brazil', 'British_Virgin_Islands',
       'Brunei_Darussalam', 'Bulgaria', 'Burkina_Faso', 'Burundi',
       'Cambodia', 'Cameroon', 'Canada', 'Cape_Verde',
       'Cases_on_an_international_conveyance_Japan', 'Cayman_Islands',
       'Central_African_Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Congo', 'Costa_Rica', 'Cote_dIvoire', 'Croatia', 'Cuba',
       'CuraÃ§ao', 'Cyprus', 'Czechia',
       'Democratic_Republic_of_the_Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican_Republic', 'Ecuador', 'Egypt',
       'El_Salvador', 'Equatorial_Guinea', 'Eritrea', 'Es

In [25]:
data['countriesAndTerritories'].value_counts()

Italy                                100
Belgium                              100
China                                100
Canada                               100
Czechia                              100
                                    ... 
Sierra_Leone                           8
Bonaire, Saint Eustatius and Saba      7
Malawi                                 6
Falkland_Islands_(Malvinas)            5
South_Sudan                            3
Name: countriesAndTerritories, Length: 204, dtype: int64

In [26]:
data['countriesAndTerritories'].sort_values()

0       Afghanistan
65      Afghanistan
64      Afghanistan
63      Afghanistan
62      Afghanistan
           ...     
9495       Zimbabwe
9494       Zimbabwe
9511       Zimbabwe
9502       Zimbabwe
9512       Zimbabwe
Name: countriesAndTerritories, Length: 9513, dtype: object

In [27]:
data[['countriesAndTerritories','cases']].sort_values(by='cases',ascending=False)

Unnamed: 0,countriesAndTerritories,cases
9191,United_States_of_America,34272
9192,United_States_of_America,32425
9188,United_States_of_America,30613
9189,United_States_of_America,30561
9193,United_States_of_America,28819
...,...,...
5885,Monaco,0
5884,Monaco,0
5883,Monaco,0
5882,Monaco,0


In [28]:
data.columns

Index(['dateRep', 'day', 'month', 'year', 'cases', 'deaths',
       'countriesAndTerritories', 'geoId', 'countryterritoryCode',
       'popData2018', 'Day_name', 'Month_name'],
      dtype='object')

## isin()

## Filter the dataset to include only the rows where the countriesAndTerritories column is 'Taiwan'?

In [29]:
tw=data[data['countriesAndTerritories'].isin(['Taiwan'])]
tw

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
8593,2020-04-08,8,4,2020,3,0,Taiwan,TW,TWN,23780452.0,Wednesday,April
8594,2020-04-07,7,4,2020,10,0,Taiwan,TW,TWN,23780452.0,Tuesday,April
8595,2020-04-06,6,4,2020,0,0,Taiwan,TW,TWN,23780452.0,Monday,April
8596,2020-04-05,5,4,2020,8,0,Taiwan,TW,TWN,23780452.0,Sunday,April
8597,2020-04-04,4,4,2020,16,0,Taiwan,TW,TWN,23780452.0,Saturday,April
...,...,...,...,...,...,...,...,...,...,...,...,...
8686,2020-01-04,4,1,2020,0,0,Taiwan,TW,TWN,23780452.0,Saturday,January
8687,2020-01-03,3,1,2020,0,0,Taiwan,TW,TWN,23780452.0,Friday,January
8688,2020-01-02,2,1,2020,0,0,Taiwan,TW,TWN,23780452.0,Thursday,January
8689,2020-01-01,1,1,2020,0,0,Taiwan,TW,TWN,23780452.0,Wednesday,January


In [30]:
wed=data[data['Day_name'].isin(['Wednesday'])]
wed.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
0,2020-04-08,8,4,2020,30,4,Afghanistan,AF,AFG,37172386.0,Wednesday,April
7,2020-04-01,1,4,2020,25,0,Afghanistan,AF,AFG,37172386.0,Wednesday,April
14,2020-03-25,25,3,2020,2,0,Afghanistan,AF,AFG,37172386.0,Wednesday,March
21,2020-03-18,18,3,2020,1,0,Afghanistan,AF,AFG,37172386.0,Wednesday,March
25,2020-03-11,11,3,2020,3,0,Afghanistan,AF,AFG,37172386.0,Wednesday,March


## Filter the dataset to include only rows where the 'countriesAndTerritories' column is either 'United_States_of_America' or 'India'?

In [31]:
filt_data=data[data['countriesAndTerritories'].isin(['United_States_of_America','India'])]
filt_data.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
4000,2020-04-08,8,4,2020,773,35,India,IN,IND,1352617000.0,Wednesday,April
4001,2020-04-07,7,4,2020,354,5,India,IN,IND,1352617000.0,Tuesday,April
4002,2020-04-06,6,4,2020,693,32,India,IN,IND,1352617000.0,Monday,April
4003,2020-04-05,5,4,2020,472,9,India,IN,IND,1352617000.0,Sunday,April
4004,2020-04-04,4,4,2020,601,12,India,IN,IND,1352617000.0,Saturday,April


In [32]:
filt_data['countriesAndTerritories'].unique()

array(['India', 'United_States_of_America'], dtype=object)

## Filter the DataFrame to include only the rows where the month column is either January or February?

In [33]:
data['Month_name'].unique()

array(['April', 'March', 'February', 'January', 'December'], dtype=object)

In [34]:
filt_month=data[data['Month_name'].isin(['January','February'])]
filt_month.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
29,2020-02-29,29,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,February
30,2020-02-28,28,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Friday,February
31,2020-02-27,27,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Thursday,February
32,2020-02-26,26,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Wednesday,February
33,2020-02-25,25,2,2020,1,0,Afghanistan,AF,AFG,37172386.0,Tuesday,February


## Select rows where the geoId is not in a list of excluded codes (excluded_codes = ['US', 'IN', 'BR'])?

In [35]:
data.columns

Index(['dateRep', 'day', 'month', 'year', 'cases', 'deaths',
       'countriesAndTerritories', 'geoId', 'countryterritoryCode',
       'popData2018', 'Day_name', 'Month_name'],
      dtype='object')

In [36]:
data['geoId'].unique()

array(['AF', 'AL', 'AD', 'DZ', 'AO', 'AI', 'AG', 'AR', 'AM', 'AW', 'AU',
       'AT', 'AZ', 'BS', 'BH', 'BD', 'BB', 'BY', 'BE', 'BZ', 'BJ', 'BM',
       'BT', 'BO', 'BQ', 'BA', 'BW', 'BR', 'VG', 'BN', 'BG', 'BF', 'BI',
       'KH', 'CM', 'CA', 'CV', 'JPG11668', 'KY', 'CF', 'TD', 'CL', 'CN',
       'CO', 'CG', 'CR', 'CI', 'HR', 'CU', 'CW', 'CY', 'CZ', 'CD', 'DK',
       'DJ', 'DM', 'DO', 'EC', 'EG', 'SV', 'GQ', 'ER', 'EE', 'SZ', 'ET',
       'FK', 'FO', 'FJ', 'FI', 'FR', 'PF', 'GA', 'GM', 'GE', 'DE', 'GH',
       'GI', 'EL', 'GL', 'GD', 'GU', 'GT', 'GG', 'GN', 'GW', 'GY', 'HT',
       'VA', 'HN', 'HU', 'IS', 'IN', 'ID', 'IR', 'IQ', 'IE', 'IM', 'IL',
       'IT', 'JM', 'JP', 'JE', 'JO', 'KZ', 'KE', 'XK', 'KW', 'KG', 'LA',
       'LV', 'LB', 'LR', 'LY', 'LI', 'LT', 'LU', 'MG', 'MW', 'MY', 'MV',
       'ML', 'MT', 'MR', 'MU', 'MX', 'MD', 'MC', 'MN', 'ME', 'MS', 'MA',
       'MZ', 'MM', nan, 'NP', 'NL', 'NC', 'NZ', 'NI', 'NE', 'NG', 'MK',
       'MP', 'NO', 'OM', 'PK', 'PS', 'PA', 'PG', 'PY

In [37]:
excluded_codes = ['US', 'IN', 'BR']

In [38]:
ex_code=data[~ data['geoId'].isin(excluded_codes)]
ex_code.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
0,2020-04-08,8,4,2020,30,4,Afghanistan,AF,AFG,37172386.0,Wednesday,April
1,2020-04-07,7,4,2020,38,0,Afghanistan,AF,AFG,37172386.0,Tuesday,April
2,2020-04-06,6,4,2020,29,2,Afghanistan,AF,AFG,37172386.0,Monday,April
3,2020-04-05,5,4,2020,35,1,Afghanistan,AF,AFG,37172386.0,Sunday,April
4,2020-04-04,4,4,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,April


# Filter out rows where the countriesAndTerritories column contains 'Germany', 'France', or 'Italy'?

In [39]:
excluded_countries = ['Germany', 'France', 'Italy']
ex_country=data[~data['countriesAndTerritories'].isin(excluded_countries)]

In [40]:
ex_country.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
0,2020-04-08,8,4,2020,30,4,Afghanistan,AF,AFG,37172386.0,Wednesday,April
1,2020-04-07,7,4,2020,38,0,Afghanistan,AF,AFG,37172386.0,Tuesday,April
2,2020-04-06,6,4,2020,29,2,Afghanistan,AF,AFG,37172386.0,Monday,April
3,2020-04-05,5,4,2020,35,1,Afghanistan,AF,AFG,37172386.0,Sunday,April
4,2020-04-04,4,4,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,April


In [41]:
data['countriesAndTerritories'].unique()

array(['Afghanistan', 'Albania', 'Andorra', 'Algeria', 'Angola',
       'Anguilla', 'Antigua_and_Barbuda', 'Argentina', 'Armenia', 'Aruba',
       'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain',
       'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Benin',
       'Bermuda', 'Bhutan', 'Bolivia',
       'Bonaire, Saint Eustatius and Saba', 'Bosnia_and_Herzegovina',
       'Botswana', 'Brazil', 'British_Virgin_Islands',
       'Brunei_Darussalam', 'Bulgaria', 'Burkina_Faso', 'Burundi',
       'Cambodia', 'Cameroon', 'Canada', 'Cape_Verde',
       'Cases_on_an_international_conveyance_Japan', 'Cayman_Islands',
       'Central_African_Republic', 'Chad', 'Chile', 'China', 'Colombia',
       'Congo', 'Costa_Rica', 'Cote_dIvoire', 'Croatia', 'Cuba',
       'CuraÃ§ao', 'Cyprus', 'Czechia',
       'Democratic_Republic_of_the_Congo', 'Denmark', 'Djibouti',
       'Dominica', 'Dominican_Republic', 'Ecuador', 'Egypt',
       'El_Salvador', 'Equatorial_Guinea', 'Eritrea', 'Es

# reset_index()

## Sort index of filt_month data

In [42]:
filt_month=filt_month.reset_index()

In [43]:
filt_month

Unnamed: 0,index,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
0,29,2020-02-29,29,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,February
1,30,2020-02-28,28,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Friday,February
2,31,2020-02-27,27,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Thursday,February
3,32,2020-02-26,26,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Wednesday,February
4,33,2020-02-25,25,2,2020,1,0,Afghanistan,AF,AFG,37172386.0,Tuesday,February
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4015,9467,2020-01-05,5,1,2020,0,0,Vietnam,VN,VNM,95540395.0,Sunday,January
4016,9468,2020-01-04,4,1,2020,0,0,Vietnam,VN,VNM,95540395.0,Saturday,January
4017,9469,2020-01-03,3,1,2020,0,0,Vietnam,VN,VNM,95540395.0,Friday,January
4018,9470,2020-01-02,2,1,2020,0,0,Vietnam,VN,VNM,95540395.0,Thursday,January


* Here you can see two index 1st one is initial one and 2 nd one is that we built right now .
* So i need to make my current index as my original one .  





In [44]:
filt_month=data[data['Month_name'].isin(['January','February'])]
filt_month.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
29,2020-02-29,29,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,February
30,2020-02-28,28,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Friday,February
31,2020-02-27,27,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Thursday,February
32,2020-02-26,26,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Wednesday,February
33,2020-02-25,25,2,2020,1,0,Afghanistan,AF,AFG,37172386.0,Tuesday,February


In [45]:
filt_month=filt_month.reset_index(drop=True)

In [46]:
filt_month

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
0,2020-02-29,29,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,February
1,2020-02-28,28,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Friday,February
2,2020-02-27,27,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Thursday,February
3,2020-02-26,26,2,2020,0,0,Afghanistan,AF,AFG,37172386.0,Wednesday,February
4,2020-02-25,25,2,2020,1,0,Afghanistan,AF,AFG,37172386.0,Tuesday,February
...,...,...,...,...,...,...,...,...,...,...,...,...
4015,2020-01-05,5,1,2020,0,0,Vietnam,VN,VNM,95540395.0,Sunday,January
4016,2020-01-04,4,1,2020,0,0,Vietnam,VN,VNM,95540395.0,Saturday,January
4017,2020-01-03,3,1,2020,0,0,Vietnam,VN,VNM,95540395.0,Friday,January
4018,2020-01-02,2,1,2020,0,0,Vietnam,VN,VNM,95540395.0,Thursday,January


## Customizing Index

In [47]:
filt_month1=filt_month.set_index('countriesAndTerritories')

In [48]:
filt_month1

Unnamed: 0_level_0,dateRep,day,month,year,cases,deaths,geoId,countryterritoryCode,popData2018,Day_name,Month_name
countriesAndTerritories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Afghanistan,2020-02-29,29,2,2020,0,0,AF,AFG,37172386.0,Saturday,February
Afghanistan,2020-02-28,28,2,2020,0,0,AF,AFG,37172386.0,Friday,February
Afghanistan,2020-02-27,27,2,2020,0,0,AF,AFG,37172386.0,Thursday,February
Afghanistan,2020-02-26,26,2,2020,0,0,AF,AFG,37172386.0,Wednesday,February
Afghanistan,2020-02-25,25,2,2020,1,0,AF,AFG,37172386.0,Tuesday,February
...,...,...,...,...,...,...,...,...,...,...,...
Vietnam,2020-01-05,5,1,2020,0,0,VN,VNM,95540395.0,Sunday,January
Vietnam,2020-01-04,4,1,2020,0,0,VN,VNM,95540395.0,Saturday,January
Vietnam,2020-01-03,3,1,2020,0,0,VN,VNM,95540395.0,Friday,January
Vietnam,2020-01-02,2,1,2020,0,0,VN,VNM,95540395.0,Thursday,January


## Displaying changes in original dataset

In [49]:
filt_month.set_index('countriesAndTerritories',inplace=True)

In [50]:
filt_month

Unnamed: 0_level_0,dateRep,day,month,year,cases,deaths,geoId,countryterritoryCode,popData2018,Day_name,Month_name
countriesAndTerritories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Afghanistan,2020-02-29,29,2,2020,0,0,AF,AFG,37172386.0,Saturday,February
Afghanistan,2020-02-28,28,2,2020,0,0,AF,AFG,37172386.0,Friday,February
Afghanistan,2020-02-27,27,2,2020,0,0,AF,AFG,37172386.0,Thursday,February
Afghanistan,2020-02-26,26,2,2020,0,0,AF,AFG,37172386.0,Wednesday,February
Afghanistan,2020-02-25,25,2,2020,1,0,AF,AFG,37172386.0,Tuesday,February
...,...,...,...,...,...,...,...,...,...,...,...
Vietnam,2020-01-05,5,1,2020,0,0,VN,VNM,95540395.0,Sunday,January
Vietnam,2020-01-04,4,1,2020,0,0,VN,VNM,95540395.0,Saturday,January
Vietnam,2020-01-03,3,1,2020,0,0,VN,VNM,95540395.0,Friday,January
Vietnam,2020-01-02,2,1,2020,0,0,VN,VNM,95540395.0,Thursday,January


## Filter out data countriesAndTerritories is afganistan

In [51]:
df=data[data['countriesAndTerritories'].isin(['Afghanistan'])]
df

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
0,2020-04-08,8,4,2020,30,4,Afghanistan,AF,AFG,37172386.0,Wednesday,April
1,2020-04-07,7,4,2020,38,0,Afghanistan,AF,AFG,37172386.0,Tuesday,April
2,2020-04-06,6,4,2020,29,2,Afghanistan,AF,AFG,37172386.0,Monday,April
3,2020-04-05,5,4,2020,35,1,Afghanistan,AF,AFG,37172386.0,Sunday,April
4,2020-04-04,4,4,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,April
...,...,...,...,...,...,...,...,...,...,...,...,...
85,2020-01-04,4,1,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,January
86,2020-01-03,3,1,2020,0,0,Afghanistan,AF,AFG,37172386.0,Friday,January
87,2020-01-02,2,1,2020,0,0,Afghanistan,AF,AFG,37172386.0,Thursday,January
88,2020-01-01,1,1,2020,0,0,Afghanistan,AF,AFG,37172386.0,Wednesday,January


In [52]:
df.nunique()

dateRep                    90
day                        31
month                       5
year                        2
cases                      19
deaths                      4
countriesAndTerritories     1
geoId                       1
countryterritoryCode        1
popData2018                 1
Day_name                    7
Month_name                  5
dtype: int64

## Setting one or more columns to index

In [53]:
df1=df.set_index(['countriesAndTerritories','month'])

In [54]:
df1

Unnamed: 0_level_0,Unnamed: 1_level_0,dateRep,day,year,cases,deaths,geoId,countryterritoryCode,popData2018,Day_name,Month_name
countriesAndTerritories,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Afghanistan,4,2020-04-08,8,2020,30,4,AF,AFG,37172386.0,Wednesday,April
Afghanistan,4,2020-04-07,7,2020,38,0,AF,AFG,37172386.0,Tuesday,April
Afghanistan,4,2020-04-06,6,2020,29,2,AF,AFG,37172386.0,Monday,April
Afghanistan,4,2020-04-05,5,2020,35,1,AF,AFG,37172386.0,Sunday,April
Afghanistan,4,2020-04-04,4,2020,0,0,AF,AFG,37172386.0,Saturday,April
Afghanistan,...,...,...,...,...,...,...,...,...,...,...
Afghanistan,1,2020-01-04,4,2020,0,0,AF,AFG,37172386.0,Saturday,January
Afghanistan,1,2020-01-03,3,2020,0,0,AF,AFG,37172386.0,Friday,January
Afghanistan,1,2020-01-02,2,2020,0,0,AF,AFG,37172386.0,Thursday,January
Afghanistan,1,2020-01-01,1,2020,0,0,AF,AFG,37172386.0,Wednesday,January


# pivot_table()

*   The pivot function in Pandas is used to reshape or transform data by pivoting the values of one column into multiple columns.
*   pivot is used to transform long-form data to wide-form
* It's particularly useful when you have long-form data (e.g., data in a tidy format with rows for each observation) and want to convert it into wide-form data with a column for each unique value in another column.



In [55]:
data.pivot_table(index='countriesAndTerritories',columns='Month_name',values='cases',aggfunc='sum')


Month_name,April,December,February,January,March
countriesAndTerritories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,226.0,0.0,1.0,0.0,140.0
Albania,160.0,,,,223.0
Algeria,957.0,0.0,1.0,0.0,510.0
Andorra,175.0,,,,370.0
Angola,10.0,,,,7.0
...,...,...,...,...,...
Uzbekistan,355.0,,,,149.0
Venezuela,31.0,,,,135.0
Vietnam,22.0,0.0,11.0,5.0,213.0
Zambia,4.0,,,,35.0


## Maximum number of cases in each month in countries

In [56]:
Max_cas=data.pivot_table(index='countriesAndTerritories',columns='Month_name',values='cases',aggfunc='max')

In [57]:
Max_cas

Month_name,April,December,February,January,March
countriesAndTerritories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,43.0,0.0,1.0,0.0,33.0
Albania,29.0,,,,28.0
Algeria,314.0,0.0,1.0,0.0,104.0
Andorra,38.0,,,,43.0
Angola,4.0,,,,3.0
...,...,...,...,...,...
Uzbekistan,107.0,,,,29.0
Venezuela,11.0,,,,48.0
Vietnam,6.0,0.0,2.0,3.0,54.0
Zambia,3.0,,,,12.0


## Total death in each month in countries

In [58]:
data.pivot_table(index='countriesAndTerritories',columns='Month_name',values='deaths',aggfunc='sum')

Month_name,April,December,February,January,March
countriesAndTerritories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,7.0,0.0,0.0,0.0,4.0
Albania,10.0,,,,12.0
Algeria,163.0,0.0,0.0,0.0,31.0
Andorra,14.0,,,,8.0
Angola,0.0,,,,2.0
...,...,...,...,...,...
Uzbekistan,0.0,,,,2.0
Venezuela,4.0,,,,3.0
Vietnam,0.0,0.0,0.0,0.0,0.0
Zambia,1.0,,,,0.0


# melt()


* it’s a tool for reshaping data, turning columns into rows, thus ‘melting’ the data structure.
* This process is particularly useful when dealing with wide datasets that you wish to tidy for analysis.
* The melt function takes multiple columns and condenses them into key-value pairs, making the data more accessible and easier to work with.
  



In [59]:
m1=data.melt(id_vars=['countriesAndTerritories','dateRep'],value_vars=['cases','deaths'],var_name='Cases_death',value_name='count')
m1

Unnamed: 0,countriesAndTerritories,dateRep,Cases_death,count
0,Afghanistan,2020-04-08,cases,30
1,Afghanistan,2020-04-07,cases,38
2,Afghanistan,2020-04-06,cases,29
3,Afghanistan,2020-04-05,cases,35
4,Afghanistan,2020-04-04,cases,0
...,...,...,...,...
19021,Zimbabwe,2020-03-25,deaths,0
19022,Zimbabwe,2020-03-24,deaths,1
19023,Zimbabwe,2020-03-23,deaths,0
19024,Zimbabwe,2020-03-22,deaths,0


In [60]:
m2=data.melt(id_vars=['countriesAndTerritories','year'],value_vars=['cases','deaths'],var_name=['cases&death'],value_name='sum')
m2

Unnamed: 0,countriesAndTerritories,year,cases&death,sum
0,Afghanistan,2020,cases,30
1,Afghanistan,2020,cases,38
2,Afghanistan,2020,cases,29
3,Afghanistan,2020,cases,35
4,Afghanistan,2020,cases,0
...,...,...,...,...
19021,Zimbabwe,2020,deaths,0
19022,Zimbabwe,2020,deaths,1
19023,Zimbabwe,2020,deaths,0
19024,Zimbabwe,2020,deaths,0


In [61]:
m3=data.melt(id_vars=['countriesAndTerritories','year','dateRep'],value_vars=['cases','deaths'],var_name=['cases&death'],value_name='values')
m3

Unnamed: 0,countriesAndTerritories,year,dateRep,cases&death,values
0,Afghanistan,2020,2020-04-08,cases,30
1,Afghanistan,2020,2020-04-07,cases,38
2,Afghanistan,2020,2020-04-06,cases,29
3,Afghanistan,2020,2020-04-05,cases,35
4,Afghanistan,2020,2020-04-04,cases,0
...,...,...,...,...,...
19021,Zimbabwe,2020,2020-03-25,deaths,0
19022,Zimbabwe,2020,2020-03-24,deaths,1
19023,Zimbabwe,2020,2020-03-23,deaths,0
19024,Zimbabwe,2020,2020-03-22,deaths,0


# apply()


*  The apply() method is one of the most common methods of data preprocessing.
* It simplifies applying a function on each row or column in a pandas DataFrame.




In [62]:
data

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
0,2020-04-08,8,4,2020,30,4,Afghanistan,AF,AFG,37172386.0,Wednesday,April
1,2020-04-07,7,4,2020,38,0,Afghanistan,AF,AFG,37172386.0,Tuesday,April
2,2020-04-06,6,4,2020,29,2,Afghanistan,AF,AFG,37172386.0,Monday,April
3,2020-04-05,5,4,2020,35,1,Afghanistan,AF,AFG,37172386.0,Sunday,April
4,2020-04-04,4,4,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,April
...,...,...,...,...,...,...,...,...,...,...,...,...
9508,2020-03-25,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0,Wednesday,March
9509,2020-03-24,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14439018.0,Tuesday,March
9510,2020-03-23,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0,Monday,March
9511,2020-03-22,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0,Sunday,March


In [63]:
def add_columns(data):
  sum=data['cases']+data['deaths']
  return sum

In [64]:
data['death_cases']=data.apply(lambda x:add_columns(x),axis=1)

In [65]:
data.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name,death_cases
0,2020-04-08,8,4,2020,30,4,Afghanistan,AF,AFG,37172386.0,Wednesday,April,34
1,2020-04-07,7,4,2020,38,0,Afghanistan,AF,AFG,37172386.0,Tuesday,April,38
2,2020-04-06,6,4,2020,29,2,Afghanistan,AF,AFG,37172386.0,Monday,April,31
3,2020-04-05,5,4,2020,35,1,Afghanistan,AF,AFG,37172386.0,Sunday,April,36
4,2020-04-04,4,4,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,April,0


In [66]:
data['death_add']=data['deaths'].apply(lambda x: x+5 )

In [67]:
data

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name,death_cases,death_add
0,2020-04-08,8,4,2020,30,4,Afghanistan,AF,AFG,37172386.0,Wednesday,April,34,9
1,2020-04-07,7,4,2020,38,0,Afghanistan,AF,AFG,37172386.0,Tuesday,April,38,5
2,2020-04-06,6,4,2020,29,2,Afghanistan,AF,AFG,37172386.0,Monday,April,31,7
3,2020-04-05,5,4,2020,35,1,Afghanistan,AF,AFG,37172386.0,Sunday,April,36,6
4,2020-04-04,4,4,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,April,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9508,2020-03-25,25,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0,Wednesday,March,0,5
9509,2020-03-24,24,3,2020,0,1,Zimbabwe,ZW,ZWE,14439018.0,Tuesday,March,1,6
9510,2020-03-23,23,3,2020,0,0,Zimbabwe,ZW,ZWE,14439018.0,Monday,March,0,5
9511,2020-03-22,22,3,2020,1,0,Zimbabwe,ZW,ZWE,14439018.0,Sunday,March,1,5


In [68]:
def mul_columns(data):
  mul=data['cases']*2
  return mul

In [69]:
data['cases_mul']=data.apply(lambda x:mul_columns(x),axis=1)

In [70]:
data.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name,death_cases,death_add,cases_mul
0,2020-04-08,8,4,2020,30,4,Afghanistan,AF,AFG,37172386.0,Wednesday,April,34,9,60
1,2020-04-07,7,4,2020,38,0,Afghanistan,AF,AFG,37172386.0,Tuesday,April,38,5,76
2,2020-04-06,6,4,2020,29,2,Afghanistan,AF,AFG,37172386.0,Monday,April,31,7,58
3,2020-04-05,5,4,2020,35,1,Afghanistan,AF,AFG,37172386.0,Sunday,April,36,6,70
4,2020-04-04,4,4,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,April,0,5,0


In [71]:
data.drop(columns=['death_add','cases_mul','death_cases'],inplace=True,axis=1)

In [72]:
data.head()

Unnamed: 0,dateRep,day,month,year,cases,deaths,countriesAndTerritories,geoId,countryterritoryCode,popData2018,Day_name,Month_name
0,2020-04-08,8,4,2020,30,4,Afghanistan,AF,AFG,37172386.0,Wednesday,April
1,2020-04-07,7,4,2020,38,0,Afghanistan,AF,AFG,37172386.0,Tuesday,April
2,2020-04-06,6,4,2020,29,2,Afghanistan,AF,AFG,37172386.0,Monday,April
3,2020-04-05,5,4,2020,35,1,Afghanistan,AF,AFG,37172386.0,Sunday,April
4,2020-04-04,4,4,2020,0,0,Afghanistan,AF,AFG,37172386.0,Saturday,April


# Groupby


*   The groupby function in pandas is used to split the data into groups based on some criteria, apply a function to each group independently, and then combine the results.
* This is often summarized by the "split-apply-combine" pattern.
  



## Reported cases in each country

In [73]:
data.groupby('countryterritoryCode')['cases'].sum()

countryterritoryCode
ABW      74
AFG     367
AGO      17
ALB     383
AND     545
       ... 
VNM     251
XKX     184
ZAF    1749
ZMB      39
ZWE      10
Name: cases, Length: 199, dtype: int64

## Reported deaths in each country

In [74]:
data.groupby('countryterritoryCode')['deaths'].sum()

countryterritoryCode
ABW     0
AFG    11
AGO     2
ALB    22
AND    22
       ..
VNM     0
XKX     5
ZAF    13
ZMB     1
ZWE     1
Name: deaths, Length: 199, dtype: int64

In [75]:
data.groupby(['Month_name','countryterritoryCode'])['deaths'].sum()

Month_name  countryterritoryCode
April       ABW                      0
            AFG                      7
            AGO                      0
            ALB                     10
            AND                     14
                                    ..
March       VNM                      0
            XKX                      1
            ZAF                      3
            ZMB                      0
            ZWE                      1
Name: deaths, Length: 588, dtype: int64

In [76]:
data.groupby(['countryterritoryCode','Month_name']).agg({'cases':'sum','deaths':'max'})

Unnamed: 0_level_0,Unnamed: 1_level_0,cases,deaths
countryterritoryCode,Month_name,Unnamed: 2_level_1,Unnamed: 3_level_1
ABW,April,24,0
ABW,March,50,0
AFG,April,226,4
AFG,December,0,0
AFG,February,1,0
...,...,...,...
ZAF,March,1326,2
ZMB,April,4,1
ZMB,March,35,0
ZWE,April,3,0


In [77]:
data.groupby(['countryterritoryCode','Month_name'])['cases'].agg(['max','min','sum'])

Unnamed: 0_level_0,Unnamed: 1_level_0,max,min,sum
countryterritoryCode,Month_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ABW,April,7,0,24
ABW,March,22,0,50
AFG,April,43,0,226
AFG,December,0,0,0
AFG,February,1,0,1
...,...,...,...,...
ZAF,March,243,0,1326
ZMB,April,3,0,4
ZMB,March,12,0,35
ZWE,April,1,0,3


In [78]:
data.columns

Index(['dateRep', 'day', 'month', 'year', 'cases', 'deaths',
       'countriesAndTerritories', 'geoId', 'countryterritoryCode',
       'popData2018', 'Day_name', 'Month_name'],
      dtype='object')

In [80]:
data['month'].corr(data['deaths'])

0.08492557728251982