# PREDICTIVE ANALYSIS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score


# 1. LOAD THE DATASETS



In [3]:
# Cleaned workbooks to load. The paths are relative, so they are resolved
# against the directory the kernel is running in.
files = [
    'Team8_PyMasters_Cleaned_data1.xlsx',
    'Team8_PyMasters_Cleaned_data2.xlsx',
    'Team8_PyMasters_Cleaned_data3.xlsx',
]

# Read every workbook with pandas' default read_excel settings, keeping the
# frames in the same order as `files`.
dfs = list(map(pd.read_excel, files))

# Individual handles for the three frames (unpacking fails if the number of
# files above is changed without updating this line).
df1, df2, df3 = dfs

# 2. DATA COMBINATION & CLEANING

# Define common clinical features for modeling

In [7]:

# Columns kept from every dataset for modelling. 'probable' is later split off
# as the target; the remaining nine columns are used as features.
common_cols = [
    'month',
    'probable',
    'vulnerable',
    'fever_chills_shakes',
    'cough',
    'shortness_of_breath',
    'any_medical_conditions',
    'travel_outside_canada',
    'contact_with_illness',
    'age_binary',
]

def prepare_and_align(df, cols):
    """Return a copy of ``df`` restricted to ``cols`` with incomplete rows removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied first and never modified.
    cols : list
        Column labels the result must contain, in this order.

    Returns
    -------
    pandas.DataFrame
        Frame with exactly the columns in ``cols``. Any column from ``cols``
        that ``df`` lacks is created and filled with the constant 0. Rows that
        contain a NaN in any of the selected columns are dropped; the
        surviving rows keep their original index labels.
    """
    aligned = df.copy()

    # Columns requested but not present are treated as "absent" and set to 0.
    # NOTE(review): this applies to every requested column, so a frame lacking
    # a target or a non-binary column would get it zero-filled as well --
    # confirm that this is intended for all entries passed in `cols`.
    absent = [name for name in cols if name not in aligned.columns]
    for name in absent:
        aligned[name] = 0

    # Keep only the requested columns (in the requested order), then discard
    # any row that still has a missing value.
    subset = aligned.loc[:, cols]
    return subset.dropna()

# Combine all datasets vertically.
# Each aligned frame keeps the row labels of its source file, so stacking them
# as-is would leave repeated index labels in df_model and make label-based
# lookups (e.g. df_model.loc[i]) return several rows. ignore_index=True gives
# the combined frame a fresh 0..n-1 index; row contents and order are unchanged.
df_model = pd.concat(
    [prepare_and_align(d, common_cols) for d in dfs],
    axis=0,
    ignore_index=True,
)

# Final safety check: fill any stray NaNs with 0 for binary columns.
# prepare_and_align already drops rows containing NaN and every frame has the
# same columns, so this is expected to be a no-op.
df_model = df_model.fillna(0)


# 3. MODEL PREPARATION

In [10]:
# Feature matrix and target for the full combined dataset.
X = df_model.drop(columns='probable')
y = df_model['probable']

# Cap the modelling set at 50,000 rows to keep training time manageable
# (KNN and the neural network are the expensive ones). Smaller datasets are
# used in full; larger ones are down-sampled with a fixed seed.
if len(df_model) <= 50000:
    X_final, y_final = X, y
else:
    df_sample = df_model.sample(50000, random_state=42)
    X_final = df_sample.drop(columns='probable')
    y_final = df_sample['probable']

# 80/20 split; stratifying on the target keeps the class proportions the same
# in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_final,
    y_final,
    test_size=0.2,
    random_state=42,
    stratify=y_final,
)



# 4. BUILDING ROBUST PIPELINES (Handles NaNs automatically)

In [13]:

# 'month' is one-hot encoded; every other feature column is handled by the
# numeric branch.
categorical_features = ['month']
numeric_features = [col for col in X.columns if col not in categorical_features]

# Categorical branch: missing values become the literal category 'Unknown',
# then each category gets its own indicator column. Categories not seen
# during fit are ignored at transform time rather than raising an error.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric branch: missing values become 0, then every column is standardised.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler()),
])

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# The three classifiers to compare, keyed by the label used when reporting.
models = {
    "Logistic Regression": LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
    ),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Neural Network": MLPClassifier(
        hidden_layer_sizes=(32, 16),
        max_iter=500,
        random_state=42,
    ),
}



# 5. EXECUTION AND EVALUATION

In [16]:
for name, clf in models.items():
    # Chain the shared preprocessing step with this classifier and train the
    # whole pipeline on the training partition (fit returns the pipeline).
    model_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', clf),
    ]).fit(X_train, y_train)

    # Score on the held-out partition and report overall accuracy followed by
    # the per-class precision / recall / F1 table.
    preds = model_pipeline.predict(X_test)
    print(f"\n--- {name} Performance ---")
    print(f"Accuracy: {accuracy_score(y_test, preds):.4f}")
    print(classification_report(y_test, preds))




--- Logistic Regression Performance ---
Accuracy: 0.9989
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9548
           1       0.98      1.00      0.99       452

    accuracy                           1.00     10000
   macro avg       0.99      1.00      0.99     10000
weighted avg       1.00      1.00      1.00     10000


--- KNN Performance ---
Accuracy: 0.9990
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9548
           1       0.98      1.00      0.99       452

    accuracy                           1.00     10000
   macro avg       0.99      1.00      0.99     10000
weighted avg       1.00      1.00      1.00     10000


--- Neural Network Performance ---
Accuracy: 0.9990
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      9548
           1       0.98      1.00      0.99       452

    accuracy                 