# PREDICTIVE ANALYSIS

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [6]:
df1=pd.read_excel("cleaned_schema_1_data_Final.xlsx")
df2=pd.read_excel("cleaned_schema_2_data_Final.xlsx")
df3=pd.read_excel("cleaned_schema_3_data_Final.xlsx")

In [8]:
df1.columns

Index(['month', 'fsa', 'probable', 'vulnerable', 'is_most_recent',
       'fever_chills_shakes', 'cough', 'shortness_of_breath',
       'any_medical_conditions', 'travel_outside_canada',
       'contact_with_illness', 'age_1', 'age_binary'],
      dtype='object')

In [10]:
df2.columns

Index(['month', 'fsa', 'probable', 'vulnerable', 'is_most_recent',
       'fever_chills_shakes', 'cough', 'shortness_of_breath',
       'any_medical_conditions', 'travel_outside_canada',
       'contact_with_illness', 'symptoms', 'conditions', 'ethnicity', 'sex',
       'needs', 'age_1_26-44', 'age_1_45-64', 'age_1_<26', 'age_1_>65'],
      dtype='object')

In [12]:
df3.columns

Index(['month', 'fsa', 'probable', 'vulnerable', 'fever_chills_shakes',
       'cough', 'shortness_of_breath', 'any_medical_conditions',
       'travel_outside_canada', 'contact_with_illness', 'contact_in_household',
       'tested', 'covid_results_date', 'covid_positive', 'conditions',
       'ethnicity', 'sex', 'needs', 'mental_health_impact',
       'travel_work_school', 'self_isolating', 'media_channels',
       'financial_obligations_impact', 'tobacco_usage', 'symp_chills',
       'symp_cough', 'symp_diarrhea', 'symp_diarrhee', 'symp_fever',
       'symp_lossOfSmellTaste', 'symp_none', 'symp_noneOfTheAbove',
       'symp_other', 'symp_runnyNose', 'symp_shakes', 'symp_shortnessOfBreath',
       'symp_soreThroat', 'symp_stomachPainCramps', 'age_1_26-44',
       'age_1_45-64', 'age_1_<26', 'age_1_>65'],
      dtype='object')

In [35]:
#1. Feature Selection
# We exclude 'probable' (it's our target) and 'is_most_recent' (usually metadata)
feature_cols = [
    'month', 'vulnerable', 'fever_chills_shakes', 'cough', 
    'shortness_of_breath', 'any_medical_conditions', 
    'travel_outside_canada', 'contact_with_illness', 'age_binary'
]

In [37]:
# 2. Data Preprocessing (One-Hot Encoding)
# This converts categorical text like 'month' into numbers the model can use
X = pd.get_dummies(df1[feature_cols], drop_first=True)
y = df1['probable']

In [39]:
# 3. Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [41]:
# 4. Initialize Model with Class Balancing
# 'class_weight=balanced' is the magic fix for your 0.00 recall issue!
model = LogisticRegression(class_weight='balanced', max_iter=1000)
model.fit(X_train, y_train)

In [43]:
# 5. Evaluation
y_pred = model.predict(X_test)
print("Model Performance")
print(classification_report(y_test, y_pred))

--- Balanced Model Performance ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     68215
           1       1.00      1.00      1.00      3469

    accuracy                           1.00     71684
   macro avg       1.00      1.00      1.00     71684
weighted avg       1.00      1.00      1.00     71684



In [55]:
# 1. Prepare Features (X) and Target (y)
# We use get_dummies to handle categorical 'age_1' data
features = ['shortness_of_breath', 'any_medical_conditions', 'tobacco_usage', 'age_1_26-44',
       'age_1_45-64', 'age_1_<26', 'age_1_>65']
X = pd.get_dummies(df3[features], drop_first=True)
y = df3['probable']

# 2. Split data into Training (80%) and Testing (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Initialize and Fit the Model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# 4. Predictions
y_pred = log_reg.predict(X_test)
y_prob = log_reg.predict_proba(X_test)[:, 1] # Probability of being 'probable'

# 5. Output Results
print("--- Model Performance ---")
print(classification_report(y_test, y_pred))

--- Model Performance ---
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      3064
           1       0.00      0.00      0.00        43

    accuracy                           0.99      3107
   macro avg       0.49      0.50      0.50      3107
weighted avg       0.97      0.99      0.98      3107



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
