# Predicting COVID-19 Severity

# Step 1: Import Libraries
## This code block imports essential libraries used throughout the notebook.

In [None]:
import pickle  # For saving models

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

# Plot defaults for the whole notebook: 14x8 figures, fonts scaled to 0.9.
figure_defaults = {'figure.figsize': (14, 8)}
sns.set(rc=figure_defaults, font_scale=0.9)

# Step 2: Load and Explore Dataset
## Load the dataset from CSV and perform initial exploration to understand the structure of the data.


In [None]:
# Mount Google Drive inside the Colab runtime; the dataset path used by the
# loading cell lives under this mount point.
from google.colab import drive

mount_point = '/content/drive'
drive.mount(mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the severity dataset CSV from the mounted Drive folder into a DataFrame.
file_path = "/content/drive/MyDrive/BINA PROJECT/DATA/Filtered_COVID_Severity_Dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Quick look at the data: dimensions, then the first rows and summary statistics.
n_rows_cols = df.shape
print("Data Shape:", n_rows_cols)
display(df.head(), df.describe())

Data Shape: (237600, 23)


Unnamed: 0,Fever,Tiredness,Dry-Cough,Difficulty-in-Breathing,Sore-Throat,None_Sympton,Pains,Nasal-Congestion,Runny-Nose,Diarrhea,...,Age_20-24,Age_25-59,Age_60+,Gender_Female,Gender_Male,Gender_Transgender,Contact_Dont-Know,Contact_No,Contact_Yes,Severity
0,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,1,0,0,0,1,0
1,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,1,0,0,1,0,0
2,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,1,0,1,0,0,0
3,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,1,0,0,0,1,1
4,1,1,1,1,1,0,1,1,1,1,...,0,0,0,0,1,0,0,1,0,1


Unnamed: 0,Fever,Tiredness,Dry-Cough,Difficulty-in-Breathing,Sore-Throat,None_Sympton,Pains,Nasal-Congestion,Runny-Nose,Diarrhea,...,Age_20-24,Age_25-59,Age_60+,Gender_Female,Gender_Male,Gender_Transgender,Contact_Dont-Know,Contact_No,Contact_Yes,Severity
count,237600.0,237600.0,237600.0,237600.0,237600.0,237600.0,237600.0,237600.0,237600.0,237600.0,...,237600.0,237600.0,237600.0,237600.0,237600.0,237600.0,237600.0,237600.0,237600.0,237600.0
mean,0.3125,0.5,0.5625,0.5,0.3125,0.0625,0.363636,0.545455,0.545455,0.363636,...,0.2,0.2,0.2,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333,1.0
std,0.463513,0.500001,0.496079,0.500001,0.463513,0.242062,0.481047,0.497931,0.497931,0.481047,...,0.400001,0.400001,0.400001,0.471406,0.471406,0.471406,0.471406,0.471406,0.471406,0.816498
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.5,1.0,0.5,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


# Step 3: Prepare Features and Target
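## Select the predictor columns and the target column, then check the predictors for missing values. The age and gender variables are already one-hot encoded in the dataset, so no further encoding is needed.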


In [None]:
# Predictor columns grouped by kind. The concatenation order below is the
# column order of X.
symptom_cols = [
    'Fever', 'Tiredness', 'Dry-Cough', 'Difficulty-in-Breathing',
    'Sore-Throat', 'Pains', 'Nasal-Congestion', 'Runny-Nose', 'Diarrhea',
]
gender_cols = ['Gender_Male', 'Gender_Female', 'Gender_Transgender']
age_cols = ['Age_0-9', 'Age_10-19', 'Age_20-24', 'Age_25-59', 'Age_60+']
indicators = symptom_cols + gender_cols + age_cols

# Feature matrix (selected indicator columns) and the Severity target.
X = df[indicators]
y = df['Severity']

In [None]:
# Report how many entries are missing in each feature column.
# This cell only inspects the data; it does not impute or drop anything.
print("Checking for Missing Values:")
missing_per_feature = X.isna().sum()
print(missing_per_feature)

Checking for Missing Values:
Fever                      0
Tiredness                  0
Dry-Cough                  0
Difficulty-in-Breathing    0
Sore-Throat                0
Pains                      0
Nasal-Congestion           0
Runny-Nose                 0
Diarrhea                   0
Gender_Male                0
Gender_Female              0
Gender_Transgender         0
Age_0-9                    0
Age_10-19                  0
Age_20-24                  0
Age_25-59                  0
Age_60+                    0
dtype: int64


# Step 4: Split Data
## Divide the dataset into training and testing subsets to evaluate model performance.


In [None]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions aligned between the two subsets; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
print("Training and testing split completed.")

Training and testing split completed.


# Step 5: Define Pipelines and Hyperparameter Tuning
## Create machine learning pipelines for different models and set up hyperparameter tuning using GridSearchCV.


In [None]:
def _build_pipeline(classifier):
    """Return an impute -> scale -> classify pipeline around ``classifier``.

    Every call creates its own SimpleImputer and StandardScaler, so the
    resulting pipelines do not share any step instances.
    """
    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
        ('classifier', classifier),
    ])


# One candidate pipeline per model type, keyed by the same names that
# param_grids uses.
pipelines = {
    "RandomForest": _build_pipeline(RandomForestClassifier(random_state=42)),
    "LogisticRegression": _build_pipeline(LogisticRegression(random_state=42, max_iter=200)),
    "DecisionTree": _build_pipeline(DecisionTreeClassifier(random_state=42)),
}

In [None]:
# Hyperparameter search space for each pipeline, keyed by pipeline name.
# The "classifier__" prefix routes a parameter to the pipeline step named
# 'classifier'.
param_grids = dict(
    RandomForest=dict(
        classifier__n_estimators=[50, 100, 200],
        classifier__max_depth=[None, 10, 20],
        classifier__min_samples_split=[2, 5],
        classifier__min_samples_leaf=[1, 2, 5],
    ),
    LogisticRegression=dict(
        classifier__penalty=['l1', 'l2'],
        classifier__C=[0.01, 0.1, 1, 10],
        classifier__solver=['liblinear', 'saga'],
    ),
    DecisionTree=dict(
        classifier__criterion=['gini', 'entropy'],
        classifier__max_depth=[None, 5, 10, 20],
        classifier__min_samples_split=[2, 5, 10],
    ),
)

# Step 6: Model Training
## Tune each pipeline with GridSearchCV (5-fold stratified cross-validation) and keep the best estimator of each model type in memory for evaluation.


In [None]:
# Best fitted pipeline for each model type, filled in by the training loop.
best_models = {}
# Shared 5-fold stratified splitter; the fixed seed keeps the folds repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Try to load the model saved by an earlier run of this notebook.
# NOTE(review): previous_model is not used by the training loop below; it is
# kept only so that any later cell referring to it keeps working.
# SECURITY: pickle.load can execute arbitrary code while deserialising. Only
# load a best_model.pkl that this notebook itself produced.
try:
    with open("best_model.pkl", "rb") as f:
        previous_model = pickle.load(f)
    print("Loaded previous best model successfully.")
except FileNotFoundError:
    print("No previous model found. Training from scratch.")
    previous_model = None  # Set to None if no previous model is available
except (pickle.UnpicklingError, EOFError, AttributeError, ImportError) as exc:
    # A truncated/corrupt file, or one pickled under an incompatible library
    # version, must not stop the notebook from retraining.
    print(f"Could not load previous model ({exc!r}). Training from scratch.")
    previous_model = None

# Tune every pipeline over its own grid and keep the refitted best estimator.
for model_name, pipeline in pipelines.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[model_name],
        cv=skf,
        scoring='accuracy',
        n_jobs=-1
    )
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_

    print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy for {model_name}: {grid_search.best_score_:.4f}")

No previous model found. Training from scratch.
Training RandomForest...
Best Parameters for RandomForest: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
Best Cross-Validation Accuracy for RandomForest: 0.2830
Training LogisticRegression...
Best Parameters for LogisticRegression: {'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'liblinear'}
Best Cross-Validation Accuracy for LogisticRegression: 0.3304
Training DecisionTree...
Best Parameters for DecisionTree: {'classifier__criterion': 'gini', 'classifier__max_depth': 5, 'classifier__min_samples_split': 2}
Best Cross-Validation Accuracy for DecisionTree: 0.3240


# Step 7: Evaluate Models
## Test the best models on the test set and compare their performance.


In [None]:
# Score every tuned pipeline on the held-out test split and print its
# accuracy, confusion matrix and per-class report.
for model_name, model in best_models.items():
    print(f"\n=== Evaluating {model_name} ===")

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f"Test Accuracy: {accuracy:.4f}")
    print("Confusion Matrix:\n", conf_matrix)
    print("Classification Report:\n", report)



=== Evaluating RandomForest ===
Test Accuracy: 0.2699
Confusion Matrix:
 [[6242 8868 8650]
 [8518 6583 8659]
 [8567 8782 6411]]
Classification Report:
               precision    recall  f1-score   support

           0       0.27      0.26      0.27     23760
           1       0.27      0.28      0.27     23760
           2       0.27      0.27      0.27     23760

    accuracy                           0.27     71280
   macro avg       0.27      0.27      0.27     71280
weighted avg       0.27      0.27      0.27     71280


=== Evaluating LogisticRegression ===
Test Accuracy: 0.3291
Confusion Matrix:
 [[8125 8907 6728]
 [8402 8649 6709]
 [8324 8749 6687]]
Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.34      0.33     23760
           1       0.33      0.36      0.35     23760
           2       0.33      0.28      0.30     23760

    accuracy                           0.33     71280
   macro avg       0.33      0.33  

# Step 8: Save the Best Model
## Save the best-performing model for later use.


In [None]:
# Choose the model to persist by cross-validated accuracy on the TRAINING data.
# Picking the winner by test accuracy would tune the choice to the test split
# and make the reported test scores optimistic, so the test data is not
# consulted here.
# cross_val_score fits clones of each tuned pipeline on the folds, so the
# estimators stored in best_models are left untouched. This costs one extra
# 5-fold fit per model.
cv_accuracy = {
    name: cross_val_score(
        model, X_train, y_train, cv=skf, scoring='accuracy', n_jobs=-1
    ).mean()
    for name, model in best_models.items()
}

# On a tie, max() keeps the first model in best_models' insertion order.
best_model_name = max(cv_accuracy, key=cv_accuracy.get)
best_model = best_models[best_model_name]
print(f"Selected {best_model_name} (mean CV accuracy: {cv_accuracy[best_model_name]:.4f})")

# Persist the selected pipeline to the current working directory.
with open("best_model.pkl", "wb") as f:
    pickle.dump(best_model, f)
print("Best model saved as 'best_model.pkl'.")


Best model saved as 'best_model.pkl'.
