# Phase 1: Data Acquisition and Cleaning

## Project Foundation:

- The README identifies the dataset and explains its suitability and the justification for choosing it.

In [1]:
# Import Dependencies

import pandas as pd
import numpy as np

In [4]:
# Read the COVID-19 CSV into the DataFrame that the cells below inspect and clean.
# The path is relative, so the file must sit in the notebook's working directory.
data = "./owid-covid-data.csv"
covid_df = pd.read_csv(filepath_or_buffer=data)

## Data Cleaning:

### Raw Data Examination

Before cleaning, we systematically examine the raw COVID-19 dataset to identify potential issues.

The next five code cells do the following:

1. **Check dataset shape** – number of rows and columns.  
2. **Check column data types** – numeric, categorical, and dates.  
3. **Check for missing values** – which columns have nulls and how many.  
4. **Check basic statistics** – min, max, mean, etc., to spot outliers.  
5. **Preview a few rows** – to detect inconsistencies or unexpected values.


In [8]:
# 1. Dataset shape: unpack into (rows, columns) and print them as a tuple
row_count, column_count = covid_df.shape
print("Dataset shape:", (row_count, column_count))

Dataset shape: (429435, 67)


In [9]:
# 2. Column data types: the dtype pandas currently holds for each column
column_dtypes = covid_df.dtypes
print("\nColumn Data Types:\n", column_dtypes)


Column Data Types:
 iso_code                                    object
continent                                   object
location                                    object
date                                        object
total_cases                                float64
                                            ...   
population                                   int64
excess_mortality_cumulative_absolute       float64
excess_mortality_cumulative                float64
excess_mortality                           float64
excess_mortality_cumulative_per_million    float64
Length: 67, dtype: object


In [10]:
# 3. Count of missing values per column
#    Only columns with at least one null are printed.
missing_values = covid_df.isnull().sum()
has_missing = missing_values > 0
print("\nMissing Values:\n", missing_values[has_missing])


Missing Values:
 continent                                   26525
total_cases                                 17631
new_cases                                   19276
new_cases_smoothed                          20506
total_deaths                                17631
                                            ...  
human_development_index                    110308
excess_mortality_cumulative_absolute       416024
excess_mortality_cumulative                416024
excess_mortality                           416024
excess_mortality_cumulative_per_million    416024
Length: 63, dtype: int64


In [11]:
# 4. Basic statistics for numeric columns
#    describe() reports count, mean, std, min, quartiles and max per column.
numeric_summary = covid_df.describe()
print("\nSummary Statistics:\n", numeric_summary)


Summary Statistics:
         total_cases     new_cases  new_cases_smoothed  total_deaths  \
count  4.118040e+05  4.101590e+05        4.089290e+05  4.118040e+05   
mean   7.365292e+06  8.017360e+03        8.041026e+03  8.125957e+04   
std    4.477582e+07  2.296649e+05        8.661611e+04  4.411901e+05   
min    0.000000e+00  0.000000e+00        0.000000e+00  0.000000e+00   
25%    6.280750e+03  0.000000e+00        0.000000e+00  4.300000e+01   
50%    6.365300e+04  0.000000e+00        1.200000e+01  7.990000e+02   
75%    7.582720e+05  0.000000e+00        3.132900e+02  9.574000e+03   
max    7.758668e+08  4.423623e+07        6.319461e+06  7.057132e+06   

          new_deaths  new_deaths_smoothed  total_cases_per_million  \
count  410608.000000        409378.000000            411804.000000   
mean       71.852139            72.060828            112096.199420   
std      1368.322990           513.636565            162240.412405   
min         0.000000             0.000000                 

In [12]:
# 5. Preview first 5 rows (the bare expression is rendered as the cell output)
covid_df.head(n=5)

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-05,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
1,AFG,Asia,Afghanistan,2020-01-06,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
2,AFG,Asia,Afghanistan,2020-01-07,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
3,AFG,Asia,Afghanistan,2020-01-08,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,
4,AFG,Asia,Afghanistan,2020-01-09,0.0,0.0,,0.0,0.0,,...,,37.75,0.5,64.83,0.51,41128772,,,,


In [14]:
# Print the column Index so the available field names can be looked up
column_names = covid_df.columns
print(column_names)


Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

### Data cleaning

In [15]:
# Check for negative numbers in key columns.
# For each listed column, count the rows whose value is < 0. NaN compares
# False against 0, so missing values are not counted as negatives.
cols_to_check = ['new_cases', 'new_deaths']
for col in cols_to_check:
    if col in covid_df.columns:
        neg_count = (covid_df[col] < 0).sum()
        print(f"{col} negative values:", neg_count)
    else:
        # Report the skip explicitly: silently ignoring a missing or misspelled
        # column would make the check look clean when it never actually ran.
        print(f"{col} not found in dataset; negative-value check skipped")

new_cases negative values: 0
new_deaths negative values: 0


In [16]:
# Convert the 'date' column to pandas datetimes, then re-print the dtypes
# so the change of type can be confirmed in the output.
covid_df["date"] = pd.to_datetime(covid_df["date"])
updated_dtypes = covid_df.dtypes
print("\nColumn Data Types:\n", updated_dtypes)


Column Data Types:
 iso_code                                           object
continent                                          object
location                                           object
date                                       datetime64[ns]
total_cases                                       float64
                                                ...      
population                                          int64
excess_mortality_cumulative_absolute              float64
excess_mortality_cumulative                       float64
excess_mortality                                  float64
excess_mortality_cumulative_per_million           float64
Length: 67, dtype: object
