# Data Type Corrections for Medical Dataset

This notebook implements Task 1 of the assignment, focusing on examining and correcting data types in the medical dataset.

## Import Required Libraries

In [21]:
import os
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from pandas.api.types import CategoricalDtype

# Suppress every warning for the rest of the session. This is a global filter, so
# deprecation notices from pandas and other libraries are hidden as well.
warnings.filterwarnings("ignore")

## Load and Display Initial Data

In [None]:
# Load the dataset.
# The workbook location can be overridden through the MEDICAL_DATASET_PATH
# environment variable so the notebook runs on machines other than the author's;
# when the variable is unset the original absolute path is used unchanged.
DATA_PATH = os.environ.get(
    "MEDICAL_DATASET_PATH",
    "C:\\Users\\galle\\Documents\\www\\qut\\assignment1\\medical-dataset.xlsx",
)
df = pd.read_excel(DATA_PATH)

# Show the dtypes pandas inferred on load, before any correction is applied.
print("--- Initial Data Types ---")
df.dtypes

## Task 1: Data Type Corrections

We'll now implement the necessary data type corrections for each variable in the dataset.

### 1. Date Conversion

In [23]:
def convert_date(date_str):
    """Parse a single survey-date value into a pandas Timestamp.

    The value is tried against three explicit formats, in order:
    ``%Y-%m-%d %H:%M:%S``, ``%d/%m/%Y`` and ``%Y-%m-%d``. If none of them
    matches, pandas' own format inference is used as a last resort.

    Parameters
    ----------
    date_str : object
        Raw cell value (typically a string or an already-parsed datetime).

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed timestamp, or ``NaT`` when the value is missing or cannot
        be interpreted as a date.
    """
    if pd.isna(date_str):
        return pd.NaT

    # Only parsing failures are swallowed; a bare ``except`` would also trap
    # KeyboardInterrupt/SystemExit and make a long ``apply`` uninterruptible.
    parse_errors = (ValueError, TypeError, OverflowError)

    for fmt in ('%Y-%m-%d %H:%M:%S', '%d/%m/%Y', '%Y-%m-%d'):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except parse_errors:
            continue

    # Last resort: let pandas infer the format. The deprecated
    # ``infer_datetime_format`` flag is intentionally not passed.
    try:
        return pd.to_datetime(date_str)
    except parse_errors:
        return pd.NaT

# Parse every raw survey_date entry element-wise, then store the parsed column back.
parsed_survey_dates = df['survey_date'].apply(convert_date)
df['survey_date'] = parsed_survey_dates

### 2. Age - Convert to Ordered Categorical

In [None]:
# Ordered age bands: nine ten-year bands ('0_10' ... '80_90') plus an open-ended top band.
age_order = [f"{low}_{low + 10}" for low in range(0, 90, 10)] + ['90_plus']
age_dtype = CategoricalDtype(categories=age_order, ordered=True)

# Fold the "100_110" label into "90_plus" before casting; any label that is not
# listed in age_order becomes missing when cast to the categorical dtype.
df['age'] = df['age'].replace("100_110", "90_plus").astype(age_dtype)

# Show the category order and the per-band counts (kept in category order).
print("Age Categories:")
print(df['age'].cat.categories)
print("\nAge Value Counts:")
df['age'].value_counts(sort=False)

### 3. Gender - Convert to Nominal Categorical

In [None]:
# Gender has no natural ordering, so it is stored as an unordered categorical.
gender_as_category = df['gender'].astype('category')
df['gender'] = gender_as_category
print("Gender Categories:")
df['gender'].value_counts()

### 4. Region and Country - Convert to Categorical with 'Unknown'

In [None]:
for col in ('region', 'country'):
    # Cast to categorical and make sure an explicit 'Unknown' level exists,
    # because fillna on a categorical only accepts existing categories.
    as_category = df[col].astype('category')
    if 'Unknown' not in as_category.cat.categories:
        as_category = as_category.cat.add_categories('Unknown')

    # Missing entries are labelled 'Unknown' rather than left as NaN.
    df[col] = as_category.fillna('Unknown')

    print(f"\n{col.capitalize()} Categories:")
    print(df[col].value_counts())

### 5. Height and Weight - Convert to Integer

In [None]:
# Nullable Int64 (capital "I") keeps missing values as <NA> instead of forcing floats.
for measurement in ('height', 'weight'):
    df[measurement] = df[measurement].astype('Int64')

print("Height Summary:", df['height'].describe(), sep="\n")
print("\nWeight Summary:", df['weight'].describe(), sep="\n")

### 6. BMI - Keep as Float

In [None]:
# BMI is a continuous measure; make the 64-bit float dtype explicit.
bmi_as_float = df['bmi'].astype(np.float64)
df['bmi'] = bmi_as_float
print("BMI Summary:")
df['bmi'].describe()

### 7. Blood Type - Convert to Categorical with 'Unknown'

In [None]:
# Cast to categorical, guaranteeing an 'Unknown' level exists before filling gaps
# (fillna on a categorical only accepts values that are already categories).
blood_type_cat = df['blood_type'].astype('category')
if 'Unknown' not in blood_type_cat.cat.categories:
    blood_type_cat = blood_type_cat.cat.add_categories('Unknown')
df['blood_type'] = blood_type_cat.fillna('Unknown')

print("Blood Type Categories:")
df['blood_type'].value_counts()

### 8. Insurance - Convert to Boolean with NA for Unknown

In [None]:
# Map the text answers onto True / False / <NA> and store them with pandas'
# nullable 'boolean' dtype. Without the final cast the column stays 'object'.
# The dtype guard makes the cell safe to re-run: mapping an already-converted
# column would turn every True/False into a missing value.
if not pd.api.types.is_bool_dtype(df['insurance']):
    insurance_map = {'Yes': True, 'No': False, 'Unknown': pd.NA}
    # Answers outside the mapping also end up as <NA>.
    df['insurance'] = df['insurance'].map(insurance_map).astype('boolean')

print("Insurance Value Counts:")
# dropna=False keeps the <NA> ("Unknown") count visible in the summary.
df['insurance'].value_counts(dropna=False)

### 9. Income - Convert to Categorical

In [None]:
# Income is stored as an unordered categorical (no explicit level order is set).
income_as_category = df['income'].astype('category')
df['income'] = income_as_category
print("Income Categories:")
df['income'].value_counts()

### 10. Smoking - Convert to Categorical with Bins

In [None]:
# Raw smoking code -> reporting label. The three "quit*" codes collapse into a
# single 'Former Smoker' bin, and the -1 sentinel is relabelled 'Unknown'.
smoking_bins = {
    **dict.fromkeys(('quit0', 'quit5', 'quit10'), 'Former Smoker'),
    'vape': 'Vape User',
    'yesheavy': 'Heavy Smoker',
    'light': 'Light Smoker',
    'medium': 'Medium Smoker',
    'never': 'Never Smoked',
    -1: 'Unknown',
}

# Relabel, then store as an unordered categorical in one chained step.
df['smoking'] = df['smoking'].replace(smoking_bins).astype('category')

print("Smoking Categories:")
df['smoking'].value_counts()

### 11. Alcohol - Convert to Categorical with Unknown

In [None]:
# Relabel the -1 sentinel as 'Unknown', then store as a categorical.
# Note: astype('category') yields an unordered categorical.
df['alcohol'] = df['alcohol'].replace(-1, 'Unknown').astype('category')

print("Alcohol Categories:")
df['alcohol'].value_counts()

### 12. Cocaine - Convert to Categorical with Unknown

In [None]:
# Relabel the -1 sentinel as 'Unknown', then store as an unordered categorical.
df['cocaine'] = df['cocaine'].replace(-1, 'Unknown').astype('category')

print("Cocaine Categories:")
df['cocaine'].value_counts()

### 13. Contact Counts - Convert to Int64

In [None]:
# Both count columns use the nullable Int64 dtype so missing values stay <NA>.
for count_column in ('contacts_count', 'public_transport_count'):
    df[count_column] = df[count_column].astype('Int64')

print("Contacts Count Summary:", df['contacts_count'].describe(), sep="\n")
print("\nPublic Transport Count Summary:", df['public_transport_count'].describe(), sep="\n")

### 14. Working - Convert to Categorical

In [None]:
# Working status is stored as an unordered categorical.
working_as_category = df['working'].astype('category')
df['working'] = working_as_category
print("Working Status Categories:")
df['working'].value_counts()

### 15. Worried - Convert to Integer

In [None]:
# Nullable integer dtype: keeps whole-number values while allowing <NA>.
worried_as_int = df['worried'].astype(pd.Int64Dtype())
df['worried'] = worried_as_int
print("Worried Level Summary:")
df['worried'].describe()

### 16. Health Conditions - Convert to Boolean

In [None]:
# Indicator columns to be stored as booleans.
health_conditions = [
    'covid19_positive', 'covid19_symptoms', 'covid19_contact',
    'asthma', 'kidney_disease', 'liver_disease', 'compromised_immune',
    'heart_disease', 'lung_disease', 'diabetes', 'hiv_positive',
    'other_chronic', 'nursing_home', 'health_worker'
]

# Use pandas' nullable 'boolean' dtype rather than plain 'bool':
# astype('bool') converts NaN to True, which would silently turn every missing
# answer into a positive flag. With 'boolean', missing values remain <NA>, and
# values that are not bool-like raise instead of being coerced to True.
for col in health_conditions:
    df[col] = df[col].astype('boolean')

print("Health Conditions Summary:")
for col in health_conditions:
    print(f"\n{col}:")
    # dropna=False so any missing answers show up in the counts.
    print(df[col].value_counts(dropna=False))

### 17. Risk Values - Convert to Float and Normalize

In [None]:
# Cast both risk columns to float and divide by 100 (presumably percentages
# being rescaled to fractions - confirm against the data dictionary).
# NOTE: not idempotent - re-running this cell divides the values by 100 again.
for risk_column in ('risk_infection', 'risk_mortality'):
    df[risk_column] = df[risk_column].astype('float64') / 100

print("Risk Infection Summary:", df['risk_infection'].describe(), sep="\n")
print("\nRisk Mortality Summary:", df['risk_mortality'].describe(), sep="\n")

## Final Data Types Check

In [None]:
# Show the dtype of every column after the conversions performed in the cells above.
print("Final Data Types:")
df.dtypes