
# Introduction
In this document, we attempt to establish a direct correlation between the risk of COVID-19 infection, mortality, and smoking by exploring the Nexoid COVID-19 medical dataset. 
Additionally, we identify other insights and relationships between features that provide a more nuanced overview of the subject matter, including the impact of pre-existing health conditions and lifestyle factors on the risk of infection and the risk of mortality.

In [301]:
# Import Required Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
from pandas.api.types import CategoricalDtype
from scipy.stats import chi2_contingency

warnings.filterwarnings("ignore")

In [None]:
# Read the raw survey export into the working DataFrame
df = pd.read_csv("medical-dataset.csv")

# Show the dtypes pandas inferred on load, before any corrections are applied
print("--- Initial Data Types ---", df.dtypes, sep="\n")

# Task 1 - Data Type Corrections for Medical Dataset

### Insurance correction

In [None]:
# Quick look at the raw 'insurance' column before it is converted
insurance_raw = df['insurance']

# dtype and number of missing entries
print("Data type of 'insurance' column:", insurance_raw.dtype)
print("Number of missing values in 'insurance' column:", insurance_raw.isnull().sum())

# Sample both ends of the column
for caption, sample in (
    ("First few rows of 'insurance' column:", insurance_raw.head()),
    ("Last few rows of 'insurance' column:", insurance_raw.tail()),
):
    print(caption)
    print(sample)

In [None]:
# Frequency of each raw insurance response (shown as the cell output)
insurance_counts = df['insurance'].value_counts()
insurance_counts

### Date Conversion

In [305]:
def convert_date(date_str):
    """Parse a single raw date value into a pandas ``Timestamp``.

    The explicit formats ``%Y-%m-%d %H:%M:%S``, ``%d/%m/%Y`` and ``%Y-%m-%d``
    are tried in that order; if none of them matches, pandas' general-purpose
    parser is used as a last resort.

    Parameters
    ----------
    date_str : object
        The raw value to parse (typically a string read from the CSV).

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed timestamp, or ``NaT`` when the value is missing or cannot
        be parsed by any of the strategies above.
    """
    if pd.isna(date_str):
        return pd.NaT

    # Only parsing failures are treated as "try the next strategy". A bare
    # ``except:`` would also swallow KeyboardInterrupt and genuine bugs.
    parse_errors = (ValueError, TypeError, OverflowError)

    for fmt in ('%Y-%m-%d %H:%M:%S', '%d/%m/%Y', '%Y-%m-%d'):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except parse_errors:
            continue

    # Fallback: let pandas infer the format. The deprecated
    # ``infer_datetime_format`` flag is not needed for this.
    try:
        return pd.to_datetime(date_str)
    except parse_errors:
        return pd.NaT

# Parse every raw survey_date entry element-wise with convert_date
df['survey_date'] = df['survey_date'].map(convert_date)


# ##

### Region and Country - Convert to Categorical with 'Unknown'

In [None]:
# Cast the location columns to categoricals, replacing missing entries with
# an explicit 'Unknown' level.
for location_col in ('region', 'country'):
    as_category = df[location_col].astype('category')
    if 'Unknown' not in as_category.cat.categories:
        # Register the label as a category so it can be used as a fill value
        as_category = as_category.cat.add_categories('Unknown')
    df[location_col] = as_category.fillna('Unknown')

    print(f"\n{location_col.capitalize()} Categories:")
    print(df[location_col].value_counts())

### Drop longitude and latitude due to no meaningful significance

In [307]:
# Remove the IP-derived coordinate columns from the working frame
df = df.drop(columns=['ip_longitude', 'ip_latitude'])

### Age - Convert to Ordered Categorical

In [None]:
# Fold the '100_110' bracket into '90_plus' before fixing the category set
df['age'] = df['age'].replace({"100_110": "90_plus"})

# Ten-year brackets '0_10' ... '80_90', followed by the open-ended top bracket
age_order = [f'{lower}_{lower + 10}' for lower in range(0, 90, 10)] + ['90_plus']
age_dtype = CategoricalDtype(categories=age_order, ordered=True)
df['age'] = df['age'].astype(age_dtype)

# Display age categories and their counts (in bracket order, not by frequency)
print("Age Categories:")
print(df['age'].cat.categories)
print("\nAge Value Counts:")
df['age'].value_counts(sort=False)

### Gender

In [None]:
# Store gender as a categorical and show how often each label occurs
gender_as_category = df['gender'].astype('category')
df['gender'] = gender_as_category
print("Gender Categories:")
df['gender'].value_counts()

### Height and Weight - Convert to Integer

In [None]:
# Nullable Int64 keeps missing measurements as <NA> rather than forcing floats
for measurement in ('height', 'weight'):
    df[measurement] = df[measurement].astype('Int64')

for caption, measurement in (("Height Summary:", 'height'),
                             ("\nWeight Summary:", 'weight')):
    print(caption)
    print(df[measurement].describe())


### BMI - Keep as Float

 ### Blood Type - Convert to Categorical with 'Unknown'

In [None]:
# Make the float64 dtype of BMI explicit, then summarise it
bmi_as_float = df['bmi'].astype('float64')
df['bmi'] = bmi_as_float
print("BMI Summary:")
df['bmi'].describe()

In [None]:
# Store blood type as a categorical, labelling missing entries 'Unknown'
blood_type = df['blood_type'].astype('category')
if 'Unknown' not in blood_type.cat.categories:
    # Register the label as a category so it can be used as a fill value
    blood_type = blood_type.cat.add_categories('Unknown')
df['blood_type'] = blood_type.fillna('Unknown')

print("Blood Type Categories:")
df['blood_type'].value_counts()

### Insurance - Convert to Boolean

In [None]:
# Convert to Boolean with NA for Blank and Unknown
df['insurance'] = df['insurance'].str.title()
df['insurance'] = df['insurance'].map({'Yes': True, 'No': False, 'Blank': pd.NA, 'Unknown': pd.NA})
print("Insurance Value Counts:")
df['insurance'].value_counts()

### Income - Convert to Categorical

In [None]:
# Store income as a categorical and show how often each level occurs
income_as_category = df['income'].astype('category')
df['income'] = income_as_category
print("Income Categories:")
df['income'].value_counts()


# ### 10. Smoking - Convert to Categorical with Bins

### Smoking to bins

In [None]:
# Raw smoking code -> reporting label. The three 'quit*' codes collapse into
# one 'Former Smoker' bin, and the -1 sentinel is labelled 'Unknown'.
smoking_bins = {
    **dict.fromkeys(('quit0', 'quit5', 'quit10'), 'Former Smoker'),
    'vape': 'Vape User',
    'yesheavy': 'Heavy Smoker',
    'light': 'Light Smoker',
    'medium': 'Medium Smoker',
    'never': 'Never Smoked',
    -1: 'Unknown',
}

# Relabel, then store the result as a categorical
df['smoking'] = df['smoking'].replace(smoking_bins).astype('category')

print("Smoking Categories:")
df['smoking'].value_counts()

### Alcohol - Convert to Categorical

In [None]:
# Relabel the -1 sentinel as 'Unknown', then store alcohol as a categorical
df['alcohol'] = df['alcohol'].replace(-1, 'Unknown').astype('category')

print("Alcohol Categories:")
df['alcohol'].value_counts()


# ### 12. Cocaine - Convert to Categorical with Unknown

In [None]:
# Relabel the -1 sentinel as 'Unknown', then store cocaine as a categorical
df['cocaine'] = df['cocaine'].replace(-1, 'Unknown').astype('category')

print("Cocaine Categories:")
df['cocaine'].value_counts()

### Contact Counts - Convert to Int64

In [None]:
# Both count columns become nullable integers (missing values stay <NA>)
for count_col in ('contacts_count', 'public_transport_count'):
    df[count_col] = df[count_col].astype('Int64')

for caption, count_col in (("Contacts Count Summary:", 'contacts_count'),
                           ("\nPublic Transport Count Summary:", 'public_transport_count')):
    print(caption)
    print(df[count_col].describe())

### Working - Convert to Categorical

In [None]:
# Store working status as a categorical and show the level frequencies
working_as_category = df['working'].astype('category')
df['working'] = working_as_category
print("Working Status Categories:")
df['working'].value_counts()

### Worried - Convert to Integer

In [None]:
# Store the worried score as a nullable integer and summarise it
worried_as_int = df['worried'].astype('Int64')
df['worried'] = worried_as_int
print("Worried Level Summary:")
df['worried'].describe()

### Health Conditions - Convert to Boolean

In [None]:
# Flag columns to be stored as booleans
health_conditions = [
    'covid19_positive', 'covid19_symptoms', 'covid19_contact',
    'asthma', 'kidney_disease', 'liver_disease', 'compromised_immune',
    'heart_disease', 'lung_disease', 'diabetes', 'hiv_positive',
    'other_chronic', 'nursing_home', 'health_worker'
]

for col in health_conditions:
    # A plain astype('bool') treats NaN as truthy, so a missing answer would be
    # recorded as a positive flag. Remember which rows are missing, cast the
    # rest by truthiness as before, then restore the gaps as <NA> using the
    # nullable 'boolean' dtype.
    missing = df[col].isna()
    df[col] = df[col].astype('bool').astype('boolean').mask(missing, pd.NA)

print("Health Conditions Summary:")
for col in health_conditions:
    print(f"\n{col}:")
    print(df[col].value_counts())

### Risk Values - Convert to Float and Normalize

In [None]:
# Cast both risk columns to float and divide by 100
# (presumably percentages -> fractions; confirm against the data dictionary)
for risk_col in ('risk_infection', 'risk_mortality'):
    df[risk_col] = df[risk_col].astype('float64') / 100

for caption, risk_col in (("Risk Infection Summary:", 'risk_infection'),
                          ("\nRisk Mortality Summary:", 'risk_mortality')):
    print(caption)
    print(df[risk_col].describe())

### Final Data Types Check

In [None]:
# Show the dtypes after all conversions (rendered as the cell output)
print("Final Data Types:")
final_dtypes = df.dtypes
final_dtypes


# Task 2 - Data Preparation - Identify issues and clean/transform

### Identify Skewness in Numerical Variables

In [None]:
# Skewness of every numeric column, printed and collected by column name
numerical_cols = df.select_dtypes(include=[np.number]).columns
skewness_results = {}

for col in numerical_cols:
    if col not in df.columns:
        continue
    skewness_results[col] = df[col].skew()
    print(f"{col}: {skewness_results[col]:.3f}")

# Visualize distributions: one histogram per variable on a 4x2 grid
hist_columns = ['height', 'weight', 'bmi', 'contacts_count',
                'public_transport_count', 'worried',
                'risk_infection', 'risk_mortality']

plt.figure(figsize=(15, 10))
for position, col in enumerate(hist_columns, start=1):
    panel = plt.subplot(4, 2, position)
    df[col].hist(bins=30)  # draws on the subplot just made current
    panel.set_title(f'Distribution of {col} (Skewness: {df[col].skew():.3f})')
    panel.set_xlabel(col)
    panel.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

### Identify Missing Values and Errors

In [None]:
# Count missing entries per column and report only the columns that have any
missing_values = df.isnull().sum()
has_missing = missing_values > 0
print(missing_values[has_missing])

### Visualise the missing data...

In [None]:
# Heatmap of the null mask: rows are records, columns are variables
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

In [None]:
# The same missing-value information as per-column counts and a bar chart
null_totals = df.isnull().sum()
print(null_totals)

# Plotting missing values: only columns with at least one missing value
missing_counts = null_totals[null_totals > 0]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=missing_counts.index, y=missing_counts.values, ax=ax)
plt.xticks(rotation=45, ha='right')
ax.set_title('Count of Missing Values per Column')
ax.set_ylabel('Number of Missing Values')
plt.tight_layout()
plt.show()

### Potential Anomalies in Numerical Data

In [None]:
# Flag values outside the 1.5 * IQR fences for each variable and draw a box
# plot of the same variable.
for col in ('height', 'weight', 'bmi', 'public_transport_count'):
    print(f"\n{col} - Outliers based on IQR:")
    values = df[col]

    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile
    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    beyond_fences = (values < low_fence) | (values > high_fence)
    outliers = values[beyond_fences]

    print(f"Number of outliers: {len(outliers)}")
    if len(outliers) > 0:
        print(f"Range of outliers: {outliers.min()} to {outliers.max()}")

    # Add a box plot for visualization
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=values)
    plt.title(f"Box plot of {col}")
    plt.xlabel(col)
    plt.show()

### Fill missing Risk of Infection percentages with mean values within age brackets

In [None]:
# Calculate the mean 'risk_infection' for each 'age' group, aligned row-by-row
# with df. `observed=False` pins the current handling of empty age categories
# instead of relying on a default that pandas has announced will change.
age_group_means = df.groupby('age', observed=False)['risk_infection'].transform('mean')

# Fill missing 'risk_infection' values with the calculated age group means.
# Assign the result back rather than calling fillna(inplace=True) on
# df['risk_infection']: that chained in-place call operates on an intermediate
# object and is not guaranteed to update df (it never does under Copy-on-Write).
df['risk_infection'] = df['risk_infection'].fillna(age_group_means)

# Fill remaining NaNs (e.g. rows with no usable age-group mean) with the
# global mean of 'risk_infection'
global_mean_risk_infection = df['risk_infection'].mean()
df['risk_infection'] = df['risk_infection'].fillna(global_mean_risk_infection)

# Verify that there are no more missing risk_infection values
print(f"Missing 'risk_infection' values after processing: {df['risk_infection'].isnull().sum()}")
print(df.groupby('age', observed=False)['risk_infection'].mean().head())

### Fill Missing BMI Values with Median

In [None]:
# Impute missing BMI values with the column median.
bmi_median = df['bmi'].median()

# Assign the result back instead of calling fillna(inplace=True) on df['bmi']:
# the chained in-place form acts on an intermediate object and is not
# guaranteed to update df (it never does under Copy-on-Write).
df['bmi'] = df['bmi'].fillna(bmi_median)

print(f"Missing BMI values filled with median: {bmi_median}")
print(f"Number of remaining NaNs in BMI: {df['bmi'].isnull().sum()}")

In [None]:
# Histogram with a KDE overlay for each variable, laid out on a 2x3 grid
numerical_cols = ['height', 'weight', 'bmi', 'risk_infection', 'risk_mortality']

plt.figure(figsize=(15, 10))

for slot, col in enumerate(numerical_cols, start=1):
    panel = plt.subplot(2, 3, slot)
    pretty_name = col.replace("_", " ").title()
    sns.histplot(df[col].dropna(), kde=True, ax=panel)
    panel.set_title(f'Distribution of {pretty_name}')
    panel.set_xlabel(pretty_name)
    panel.set_ylabel('Frequency')
plt.tight_layout()
plt.show()

# Task 3 - Data Mining and Feature Selection

### Calculate infection rates by smoking category

In [None]:
# Cross-tabulate smoking status (rows) against COVID-19 test result (columns)
contingency_table = pd.crosstab(df["smoking"], df["covid19_positive"])

print("--- COVID-19 Infection Analysis by Smoking Status ---\n")

# Overall share of positive vs negative records
print("--- COVID-19 Positive Distribution ---")
covid_dist = df["covid19_positive"].value_counts(normalize=True) * 100
for label, flag in (("Positive", True), ("Negative", False)):
    print(f"{label}: {covid_dist[flag]:.1f}%")

# Share of records in each smoking category
print("\n--- Smoking Status Distribution ---")
smoking_dist = df["smoking"].value_counts(normalize=True) * 100
for category, percentage in smoking_dist.items():
    print(f"{category}: {percentage:.1f}%")

# Calculate infection rates by smoking category: each row of the table is
# divided by its own total, giving row-wise percentages
row_totals = contingency_table.sum(axis=1)
infection_rates = contingency_table.div(row_totals, axis=0) * 100

infection_summary = pd.DataFrame({
    'Total_Cases': contingency_table.sum(axis=1),
    'COVID_Positive': contingency_table[True],
    'Infection_Rate_%': infection_rates[True].round(2),
})
infection_summary = infection_summary.sort_values('Infection_Rate_%', ascending=False)

print("--- COVID-19 Infection Rates by Smoking Status ---")
print(infection_summary.to_string())

### Visualise relationship

In [None]:

# Visualize the relationship: raw counts on the left, infection rate per
# smoking category on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Stacked bar chart
contingency_table.plot(kind="bar", stacked=True, ax=ax1,
                       color=['lightblue', 'coral'], alpha=0.8)
ax1.set_title("COVID-19 Cases by Smoking Status", fontweight='bold', fontsize=12)
ax1.set_ylabel("Number of Cases")
ax1.set_xlabel("Smoking Status")
ax1.legend(['Negative', 'Positive'], title='COVID-19 Status')
ax1.tick_params(axis='x', rotation=45)

# Infection rate bar chart
infection_summary['Infection_Rate_%'].plot(kind='bar', ax=ax2,
                                           color='crimson', alpha=0.7)
ax2.set_title("COVID-19 Infection Rate by Smoking Status", fontweight='bold', fontsize=12)
ax2.set_ylabel("Infection Rate (%)")
ax2.set_xlabel("Smoking Status")
ax2.tick_params(axis='x', rotation=45)

# Add sample size annotations above each bar.
# The columns are read directly instead of via iterrows(): iterrows() upcasts
# each mixed int/float row to float, which rendered the label as "n=123.0".
# The explicit int() matches the annotation style of the age-group chart.
for i, (total, rate) in enumerate(zip(infection_summary['Total_Cases'],
                                      infection_summary['Infection_Rate_%'])):
    ax2.annotate(f'n={int(total)}',
                 xy=(i, rate),
                 xytext=(0, 3), textcoords='offset points',
                 ha='center', fontsize=9)

plt.tight_layout()
plt.show()

### Statistical significance testing - COVID-19 and Smoking Relationship

If the chi-squared test below confirms a significant association, smoking status should be retained as a feature in our models, potentially with category consolidation for groups with small sample sizes.

In [None]:
# Chi-squared test of independence on the smoking x COVID-19 contingency table
chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

print("--- Statistical Analysis ---")
print(f"Chi-squared statistic: {chi2_stat:.4f}")
print(f"p-value: {p_value:.6f}")
print(f"Degrees of freedom: {dof}")

# Report the outcome at the 5% significance level
if p_value < 0.05:
    print("Statistically significant relationship detected")
else:
    # The message was previously cut off mid-word ("relationshi")
    print("No statistically significant relationship detected")

### Find other relevant correlations between feature pairs in data set

In [None]:

# Candidate variables for the correlation heatmap. 'age_numeric' is listed
# here but is only kept if the column already exists in df when this cell
# runs; the filter below drops any name that is not a column.
key_vars = [
    'age_numeric', 'height', 'weight', 'bmi',
    'contacts_count', 'public_transport_count',
    'worried', 'risk_infection', 'risk_mortality', 'covid19_positive',
    'asthma', 'heart_disease', 'diabetes', 'compromised_immune',
]

# Filter to available variables
available_key_vars = [name for name in key_vars if name in df.columns]
key_corr_matrix = df[available_key_vars].corr()

# Visualize correlation matrix
plt.figure(figsize=(12, 10))

print(f"Correlation matrix includes {len(available_key_vars)} key variables")

# Hide the strict upper triangle: it mirrors the lower one
mask = np.triu(np.ones_like(key_corr_matrix, dtype=bool), k=1)
heatmap_options = dict(
    mask=mask, annot=True, cmap='RdBu_r', center=0,
    square=True, linewidths=0.5, fmt='.3f',
    cbar_kws={"shrink": 0.8},
)
sns.heatmap(key_corr_matrix, **heatmap_options)

plt.title('Correlation Matrix: Key Variables', fontweight='bold', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

### Age-Stratified Analysis: COVID-19 Risk Patterns

To better understand risk factors, we examine how COVID-19 infection rates vary across age groups and smoking categories.

In [None]:
# Create age_numeric if it doesn't already exist
if 'age_numeric' not in df.columns:
    # Each age bracket is represented by its midpoint (95 for '90_plus')
    age_mapping = {
        '0_10': 5, '10_20': 15, '20_30': 25, '30_40': 35, '40_50': 45,
        '50_60': 55, '60_70': 65, '70_80': 75, '80_90': 85, '90_plus': 95
    }
    # Mapping a categorical column yields another categorical, not numbers.
    # Cast to float so the column is genuinely numeric (NaN where age is
    # missing) and behaves as such in the correlation analyses.
    df['age_numeric'] = df['age'].map(age_mapping).astype('float64')
    print("Created age_numeric variable for analysis")
else:
    print("Using existing age_numeric variable")

# Analyze infection rates by age group.
# `observed=False` pins the current handling of empty age categories instead
# of relying on a default that pandas has announced will change.
age_analysis = df.groupby('age', observed=False).agg({
    'covid19_positive': ['count', 'sum', 'mean'],
    'risk_mortality': ['mean', 'std']
}).round(4)

# Flatten the two-level column index produced by agg()
age_analysis.columns = ['total_cases', 'covid_positive', 'infection_rate',
                        'mean_mortality_risk', 'std_mortality_risk']
age_analysis['infection_rate_pct'] = (age_analysis['infection_rate'] * 100).round(2)

print("--- Age-Stratified COVID-19 Analysis ---")
print(age_analysis.to_string())

### Visualize age-related patterns

In [None]:

# Left panel: infection rate per age group. Right panel: mean mortality risk
# per age group with +/- one standard deviation error bars.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# COVID-19 infection rates by age
bars1 = ax1.bar(age_analysis.index, age_analysis['infection_rate_pct'],
                color='crimson', alpha=0.7)
ax1.set_title('COVID-19 Infection Rate by Age Group', fontweight='bold')
ax1.set_xlabel('Age Group')
ax1.set_ylabel('Infection Rate (%)')
ax1.tick_params(axis='x', rotation=45)

# Add sample size annotations just above each bar
group_sizes = age_analysis['total_cases']
group_rates = age_analysis['infection_rate_pct']
for bar_position, (total, rate) in enumerate(zip(group_sizes, group_rates)):
    ax1.annotate(f'n={int(total)}', xy=(bar_position, rate), xytext=(0, 3),
                 textcoords='offset points', ha='center', fontsize=9)

# Mortality risk by age
tick_positions = range(len(age_analysis))
ax2.errorbar(x=tick_positions, y=age_analysis['mean_mortality_risk'],
             yerr=age_analysis['std_mortality_risk'], fmt='o-', capsize=5,
             color='navy', linewidth=2, markersize=8)
ax2.set_title('Mortality Risk by Age Group', fontweight='bold')
ax2.set_xlabel('Age Group')
ax2.set_ylabel('Mortality Risk (Mean ± SD)')
ax2.set_xticks(tick_positions)
ax2.set_xticklabels(age_analysis.index, rotation=45)
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

### Correlation Analysis: COVID-19 Impact on Variable Relationships

This part examines how variable correlations differ between COVID-positive and COVID-negative populations to understand the disease's impact on risk factor relationships.

In [None]:

# Variables to compare between the COVID-positive and COVID-negative groups
analysis_vars = [
    'age_numeric', 'height', 'weight', 'bmi', 'contacts_count',
    'public_transport_count', 'worried', 'risk_mortality',
    'asthma', 'heart_disease', 'diabetes', 'compromised_immune',
    'health_worker', 'nursing_home',
]

# Keep only the names that are actual columns of df
available_analysis_vars = [name for name in analysis_vars if name in df.columns]

# Split the cleaned dataframe by COVID status
is_positive = df['covid19_positive'] == True
is_negative = df['covid19_positive'] == False
covid_positive_subset = df[is_positive][available_analysis_vars]
covid_negative_subset = df[is_negative][available_analysis_vars]

print("--- Correlation Analysis: COVID Impact ---")
print(f"COVID-positive cases: {len(covid_positive_subset)}")
print(f"COVID-negative cases: {len(covid_negative_subset)}")
print(f"Available variables: {', '.join(available_analysis_vars)}")

In [None]:
# Correlation matrix of each subgroup, and their element-wise difference
corr_positive = covid_positive_subset.corr()
corr_negative = covid_negative_subset.corr()
corr_difference = corr_positive - corr_negative

# Heatmap of the difference (COVID+ minus COVID-)
plt.figure(figsize=(12, 10))
difference_heatmap_options = dict(
    annot=True, cmap='RdBu_r', center=0,
    square=True, linewidths=0.5, fmt='.3f',
    cbar_kws={"label": "Correlation Difference\n(COVID+ minus COVID-)"},
)
sns.heatmap(corr_difference, **difference_heatmap_options)

plt.title('Impact of COVID-19 on Variable Correlations\n(Positive - Negative Difference)',
          fontweight='bold', fontsize=14)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Identify largest correlation changes. Only the strict upper triangle is
# kept so that each variable pair is considered once.
mask = np.triu(np.ones_like(corr_difference, dtype=bool), k=1)
upper_triangle = corr_difference.where(mask)

# Collect every pair whose correlation differs by more than 0.1 in magnitude
largest_changes = [
    {
        'Variable_Pair': f"{idx} - {col}",
        'Correlation_Change': upper_triangle.loc[idx, col],
    }
    for col in upper_triangle.columns
    for idx in upper_triangle.index
    if not pd.isna(upper_triangle.loc[idx, col])
    and abs(upper_triangle.loc[idx, col]) > 0.1
]

if largest_changes:
    # Order by absolute size of the change, largest first
    changes_df = pd.DataFrame(largest_changes).sort_values(
        'Correlation_Change', key=abs, ascending=False
    )
    print("--- Largest Correlation Changes Due to COVID-19 ---")
    print(changes_df.to_string(index=False))