In [1]:
import os

import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read in the data.
# The raw-data directory can be overridden through the COVID19_RAW_DATA_DIR
# environment variable so the notebook is not tied to a single machine; when
# the variable is unset, the original local directory is used unchanged.
filename = 'owid-covid-data.csv'
path = os.environ.get(
    'COVID19_RAW_DATA_DIR',
    'C:/Users/Matth/git/DataAnalysisWorkbooks/Covid19/Data/Raw_data/',
)

# os.path.join accepts the directory with or without a trailing separator.
data = pd.read_csv(os.path.join(path, filename))

Dealing with missing data: checking, removing, and replacing

In [3]:
# Count the missing (NULL/NaN) entries in every column
null_mask = data.isna()
null_mask.sum()

iso_code                                        0
continent                                    9202
location                                        0
date                                            0
total_cases                                  2629
                                            ...  
human_development_index                     27361
excess_mortality_cumulative_absolute       147461
excess_mortality_cumulative                147461
excess_mortality                           147461
excess_mortality_cumulative_per_million    147461
Length: 67, dtype: int64

In [24]:
# What percentage of the new_cases / excess_mortality entries are NULL?
# The mean of the boolean NULL mask is the fraction of NULL entries, so each
# column only has to be scanned once.
for column in ('new_cases', 'excess_mortality'):
    print(data[column].isnull().mean() * 100)

1.754477880742657
96.57225187465208


In [17]:
# Drop the NULL entries from a single column.
# dropna() returns a new Series; the original dataframe is left unchanged.
excess_mortality = data['excess_mortality']
excess_mortality.dropna()

1381       2.88
1412       1.31
1442       4.04
1473       7.00
1503      10.22
          ...  
146884    -4.48
146914    19.01
146945    25.95
146976    40.99
147006    58.90
Name: excess_mortality, Length: 5234, dtype: float64

In [21]:
# DataFrame.dropna() returns a new dataframe; none of these calls modify `data`,
# and only the result of the last expression is displayed by the notebook.

# Default behaviour (how='any'): drop every row that contains at least one NULL value
data.dropna(how='any')

# Drop only the rows in which ALL the columns are NULL
data.dropna(how='all')

# thresh counts NON-NULL values: keep only the rows that have at least 3
# non-NULL entries (it is not a limit on the number of NULL entries)
data.dropna(thresh=3)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-02-24,5.0,5.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
1,AFG,Asia,Afghanistan,2020-02-25,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
2,AFG,Asia,Afghanistan,2020-02-26,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
3,AFG,Asia,Afghanistan,2020-02-27,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
4,AFG,Asia,Afghanistan,2020-02-28,5.0,0.0,,,,,...,,,37.746,0.5,64.83,0.511,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152690,ZWE,Africa,Zimbabwe,2022-01-02,214214.0,0.0,1409.000,5017.0,0.0,18.000,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
152691,ZWE,Africa,Zimbabwe,2022-01-03,216087.0,1873.0,1519.714,5047.0,30.0,19.857,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
152692,ZWE,Africa,Zimbabwe,2022-01-04,217678.0,1591.0,1447.143,5078.0,31.0,19.714,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,
152693,ZWE,Africa,Zimbabwe,2022-01-05,219057.0,1379.0,1644.143,5092.0,14.0,21.714,...,1.6,30.7,36.791,1.7,61.49,0.571,,,,


In [26]:
# Fill the missing entries of a column with a constant (here zero).
# fillna() returns a new Series, so `data` itself is not modified.
data['excess_mortality'].fillna(value=0)

# Fill the missing entries with the mean of the column instead
new_cases_mean = data['new_cases'].mean()
print(new_cases_mean)
data['new_cases'].fillna(value=new_cases_mean)

8396.612681313993


0            5.0
1            0.0
2            0.0
3            0.0
4            0.0
           ...  
152690       0.0
152691    1873.0
152692    1591.0
152693    1379.0
152694    1121.0
Name: new_cases, Length: 152695, dtype: float64

Cleaning data (non-missing)

In [31]:
# Tally how often each distinct value occurs: for discrete data, misspelled or
# unexpected labels stand out immediately in this table.
continent = data['continent']
continent.value_counts()

# A badly labeled continent (say, Afrida) could then be mapped to the correct label:
# data['continent'].replace({'Afrida' : 'Africa'})

Africa           36227
Europe           34121
Asia             33390
North America    22892
South America     8828
Oceania           8035
Name: continent, dtype: int64

In [42]:
# Look for extreme values: select the rows reporting more than 2,000,000 new cases
is_extreme = data['new_cases'] > 2000000
data.loc[is_extreme]

# The rows returned all belong to aggregate locations (World, High income)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,female_smokers,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
61322,OWID_HIC,,High income,2022-01-03,138802320.0,2251178.0,1342873.0,1946767.0,3528.0,3141.0,...,,,,,,,,,,
61323,OWID_HIC,,High income,2022-01-04,140932631.0,2130311.0,1490556.286,1951086.0,4319.0,3123.286,...,,,,,,,,,,
61324,OWID_HIC,,High income,2022-01-05,142962071.0,2029440.0,1567338.143,1955965.0,4879.0,3120.0,...,,,,,,,,,,
150736,OWID_WRL,,World,2022-01-03,292722587.0,2522558.0,1594957.857,5449850.0,5865.0,5911.0,...,6.434,34.635,60.13,2.705,72.58,0.737,,,,
150737,OWID_WRL,,World,2022-01-04,295263477.0,2540890.0,1771742.857,5457604.0,7754.0,5979.857,...,6.434,34.635,60.13,2.705,72.58,0.737,,,,
150738,OWID_WRL,,World,2022-01-05,297770440.0,2506963.0,1886491.143,5465352.0,7748.0,5960.857,...,6.434,34.635,60.13,2.705,72.58,0.737,,,,
150739,OWID_WRL,,World,2022-01-06,300290277.0,2519837.0,1964838.714,5472566.0,6663.0,5855.429,...,6.434,34.635,60.13,2.705,72.58,0.737,,,,


In [48]:
# Flag repeated values in a single column: an entry is True when the same
# value has already appeared in an earlier row.
data['continent'].duplicated(keep='first')

# Note: data.duplicated() (on the whole dataframe) flags rows whose columns are all identical.

# keep='first' (the default) leaves the first occurrence unflagged. To leave the last
# occurrence unflagged instead, or to flag every occurrence of a repeated value:
# data['continent'].duplicated(keep = 'last')  # or keep=False

0         False
1          True
2          True
3          True
4          True
          ...  
152690     True
152691     True
152692     True
152693     True
152694     True
Name: continent, Length: 152695, dtype: bool

In [56]:
# String cleaning

# Break every date string into its parts at the '-' delimiter.
# Accessors: .str for string (object) columns, .dt for datetime columns, .cat for categorical columns.
# Adding ", expand=True" returns the parts as separate columns instead of lists.
dates = data['date']
dates.str.split(pat='-')

0         [2020, 02, 24]
1         [2020, 02, 25]
2         [2020, 02, 26]
3         [2020, 02, 27]
4         [2020, 02, 28]
               ...      
152690    [2022, 01, 02]
152691    [2022, 01, 03]
152692    [2022, 01, 04]
152693    [2022, 01, 05]
152694    [2022, 01, 06]
Name: date, Length: 152695, dtype: object

In [57]:
# Replace the '-' delimiter with '/'.
# regex=False treats '-' as an explicit literal: the default value of `regex`
# differs between pandas versions (True before 2.0, False from 2.0 on), so
# spelling it out keeps the behaviour the same on every version.
data['date'].str.replace('-', '/', regex=False)

0         2020/02/24
1         2020/02/25
2         2020/02/26
3         2020/02/27
4         2020/02/28
             ...    
152690    2022/01/02
152691    2022/01/03
152692    2022/01/04
152693    2022/01/05
152694    2022/01/06
Name: date, Length: 152695, dtype: object