In [1]:
!pip install pandas



In [4]:
import pandas as pd

import numpy as np


load the data

In [8]:
df3 = pd.read_csv(
    "C:\\Users\\varle\\Downloads\\Python heackathon Data\\schema_3_ontario_final.csv"
)

df3.head()

Unnamed: 0,month,fsa,probable,vulnerable,fever_chills_shakes,cough,shortness_of_breath,any_medical_conditions,travel_outside_canada,contact_with_illness,...,ethnicity,sex,needs,age_1,mental_health_impact,travel_work_school,self_isolating,media_channels,financial_obligations_impact,tobacco_usage
0,April,M4P,n,n,n,n,n,y,n,n,...,,,noneOfTheAbove,<26,,,,,,
1,April,M9P,n,n,n,n,n,y,n,n,...,caucasian,f,food,<26,noImpact,stoppedTravelling,y,twitter;tv;reddit,,n
2,April,K0L,n,n,n,n,n,y,n,n,...,,,noneOfTheAbove,45-64,,,,,,
3,April,N9E,n,n,n,n,n,y,n,n,...,,,noneOfTheAbove,45-64,,,,,,
4,April,L2N,n,n,n,n,n,y,n,n,...,,,financialSupport,<26,,,,,,


Quick sanity checks

In [9]:
df3.shape
df3.columns
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15534 entries, 0 to 15533
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   month                         15534 non-null  object
 1   fsa                           15534 non-null  object
 2   probable                      15534 non-null  object
 3   vulnerable                    15534 non-null  object
 4   fever_chills_shakes           15534 non-null  object
 5   cough                         15534 non-null  object
 6   shortness_of_breath           15534 non-null  object
 7   any_medical_conditions        15534 non-null  object
 8   travel_outside_canada         15534 non-null  object
 9   contact_with_illness          15534 non-null  object
 10  contact_in_household          159 non-null    object
 11  tested                        15534 non-null  object
 12  covid_results_date            421 non-null    object
 13  covid_positive  

Reasoning

info() → missing values & data types

describe(include="all") → category frequency


Remove duplicates

In [None]:
initial_rows = len(df3)

df3 = df3.drop_duplicates()

print(f"Removed {initial_rows - len(df3)} duplicate rows")

Standardize column names

Cleaner names = easier modeling later.

In [14]:
df3.columns = (
    df3.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

Identify missing values properly

In [16]:
missing_summary = df3.isna().mean().sort_values(ascending=False)
missing_summary

contact_in_household            0.987654
ethnicity                       0.976803
covid_results_date              0.967288
covid_positive                  0.961862
symptoms                        0.937643
financial_obligations_impact    0.928285
conditions                      0.891641
media_channels                  0.558937
tobacco_usage                   0.334985
self_isolating                  0.334906
travel_work_school              0.334827
sex                             0.334670
mental_health_impact            0.334277
travel_outside_canada           0.000000
contact_with_illness            0.000000
fever_chills_shakes             0.000000
vulnerable                      0.000000
probable                        0.000000
fsa                             0.000000
month                           0.000000
any_medical_conditions          0.000000
shortness_of_breath             0.000000
cough                           0.000000
tested                          0.000000
needs           

Reasoning:

Columns with >40–50% missing often:

Were optional survey questions

Should be treated as "unknown" not dropped

Normalize binary yes/no columns

First identify them:

In [18]:
binary_cols = [
    col for col in df3.columns
    if set(df3[col].dropna().unique()).issubset({"y", "n"})
]
binary_cols

['probable',
 'vulnerable',
 'fever_chills_shakes',
 'cough',
 'shortness_of_breath',
 'any_medical_conditions',
 'travel_outside_canada',
 'contact_with_illness',
 'contact_in_household',
 'tested',
 'self_isolating']

Convert:

In [19]:
df3[binary_cols] = df3[binary_cols].replace({"y": 1, "n": 0})

  df3[binary_cols] = df3[binary_cols].replace({"y": 1, "n": 0})


Handle missing values 
Rule of thumb for survey data:

Binary columns → missing = 0 (not reported)

Categorical columns → "unknown"

In [21]:
# Binary columns
df3[binary_cols] = df3[binary_cols].fillna(0)

# Remaining categorical columns
categorical_cols = df3.select_dtypes(include="object").columns

df3[categorical_cols] = df3[categorical_cols].fillna("unknown")


: Clean age column (age_1)

Typical problem: bucket strings like <26, 26-40, >60

In [22]:
df3["age_1"].value_counts()

age_1
45-64    5054
26-44    4300
>65      1795
<26      1568
Name: count, dtype: int64

Optional: map to ordinal values (if needed)

In [24]:
age_map = {
    "<26": 1,
    "26-40": 2,
    "41-60": 3,
    ">60": 4,
    "unknown": np.nan
}

df3["age_group"] = df3["age_1"].map(age_map)

Normalize sex & ethnicity

In [25]:
df3["sex"] = (
    df3["sex"]
    .str.lower()
    .replace({
        "male": "male",
        "female": "female",
        "other": "other",
        "unknown": "unknown"
    })
)

cleaning ethnicity
Inspect raw values

In [26]:
df3["ethnicity"].value_counts(dropna=False)

ethnicity
unknown                                                 12422
caucasian;hispanic/latino                                  21
east asian;caucasian                                       21
caucasian;firstNations                                     19
caucasian;other                                            18
                                                        ...  
firstNations;inuit;black/african;caucasian                  1
caucasian;east asian;other                                  1
black/african;south asian;east asian;caucasian              1
caucasian;premieres nations                                 1
black/african;south asian;east asian;hispanic/latino        1
Name: count, Length: 63, dtype: int64

In [None]:
Normalize ethnicity values

In [None]:
download clean file

In [6]:
output_path = r"C:\Users\varle\Downloads\cleaned_schema_3_ontario.csv"

df3.to_csv(output_path, index=False)

print("Cleaned file saved successfully to Downloads folder")

Cleaned file saved successfully to Downloads folder
