In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Apply a global seaborn theme for every plot drawn later in the notebook:
# white-grid background, the "muted" palette, and matplotlib shorthand colour
# codes ("b", "g", "r", ...) remapped to that palette.
sns.set(style="whitegrid", palette="muted", color_codes=True)

# Load the data set from a CSV path relative to the notebook's working directory.
# NOTE(review): the preview output shows an "Unnamed: 0" column, which suggests the
# CSV was written together with its index. Passing index_col=0 (and possibly
# parse_dates=["date"]) would avoid carrying that column around - confirm before
# changing, since it alters the columns of `df`.
df = pd.read_csv("owid_covid_data_us_subset.csv")

# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,date,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,total_deaths_per_million,icu_patients,hosp_patients,weekly_hosp_admissions,daily_case_change_rate,daily_death_change_rate,hospitalization_rate,icu_rate,case_fatality_rate,7day_avg_new_cases,7day_avg_new_deaths,hospitalization_need,icu_requirement
0,2020-07-21,3853351.0,60012.0,145801.0,932.0,11390.679,430.994,11458.0,42195.0,30552.0,,,1.095021,0.297352,3.783746,,,High,High
1,2020-07-22,3911870.0,58519.0,146668.0,867.0,11563.663,433.557,12487.0,46107.0,31352.0,1.518652,0.594646,1.178643,0.319208,3.749307,,,High,High
2,2020-07-23,3975206.0,63336.0,147861.0,1193.0,11750.887,437.084,13916.0,47834.0,31679.0,1.619072,0.813402,1.203309,0.35007,3.719581,,,High,High
3,2020-07-24,4047622.0,72416.0,149052.0,1191.0,11964.952,440.604,13627.0,46748.0,31870.0,1.821692,0.805486,1.15495,0.336667,3.682458,,,High,High
4,2020-07-25,4120764.0,73142.0,150306.0,1254.0,12181.163,444.311,14402.0,51831.0,32804.0,1.807036,0.841317,1.257801,0.349498,3.647527,,,High,High


In [None]:
#Function to find precision, recall, and F1 score
def pre_rec_f1(y_pred, y):
    """Compute per-class and aggregate precision, recall, and F1 scores.

    Note the argument order: predictions first, ground truth second.

    The set of classes is the sorted union of the labels found in ``y`` and
    ``y_pred``. Labels therefore do not need to be the integers ``0..K-1``
    (string labels such as ``"High"``/``"Low"`` work), and a class that occurs
    only in the predictions still contributes its false positives, which keeps
    micro-F1 equal to accuracy for single-label problems.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels, one per sample. Flattened to 1-D before use.
    y : array-like
        True labels, one per sample. Flattened to 1-D before use; must contain
        the same number of elements as ``y_pred``.

    Returns
    -------
    tuple
        ``(macrof1, microf1, tps, fps, fns, pre, rec, f1s)`` where ``macrof1``
        is the unweighted mean of the per-class F1 scores, ``microf1`` is the
        F1 score computed from the counts summed over all classes, and the
        remaining six items are lists (one entry per class, in sorted label
        order) of true positives, false positives, false negatives, precision,
        recall, and F1.

    Raises
    ------
    ValueError
        If ``y_pred`` and ``y`` do not contain the same number of samples.
    """
    # Machine epsilon keeps every division defined when a denominator is zero
    # (e.g. a class that is never predicted, or empty input).
    eps = np.finfo(float).eps

    # Flatten so that an (n, 1) column vector cannot broadcast against an (n,)
    # vector into an (n, n) mask.
    y_pred = np.asarray(y_pred).ravel()
    y = np.asarray(y).ravel()
    if y_pred.shape != y.shape:
        raise ValueError(
            "y_pred and y must contain the same number of samples, got "
            f"{y_pred.shape[0]} and {y.shape[0]}"
        )

    # Iterate over the labels that actually occur instead of assuming that the
    # classes are exactly 0..K-1.
    classes = np.unique(np.concatenate([y, y_pred]))

    tp_sum, fp_sum, fn_sum, f1_sum = 0, 0, 0, 0
    tps, fps, fns, pre, rec, f1s = [], [], [], [], [], []
    for cls in classes:
        pred_is_cls = y_pred == cls
        true_is_cls = y == cls
        tp = np.logical_and(pred_is_cls, true_is_cls).sum()
        fp = np.logical_and(pred_is_cls, ~true_is_cls).sum()
        fn = np.logical_and(~pred_is_cls, true_is_cls).sum()
        tp_sum += tp
        fp_sum += fp
        fn_sum += fn
        f1 = (2 * tp) / (2 * tp + fp + fn + eps)
        f1_sum += f1
        pre.append(tp / (tp + fp + eps))
        rec.append(tp / (tp + fn + eps))
        tps.append(tp)
        fps.append(fp)
        fns.append(fn)
        f1s.append(f1)

    # Macro-F1 averages the per-class scores; micro-F1 pools the raw counts.
    macrof1 = f1_sum / (len(classes) + eps)
    microf1 = (2 * tp_sum) / (2 * tp_sum + fp_sum + fn_sum + eps)
    return macrof1, microf1, tps, fps, fns, pre, rec, f1s


### Task 1: Data Cleaning and Exploratory Analysis.
- Cleaning missing values, removing duplicates, and standardizing formats on the original data set.
- Exploratory analysis to identify trends, patterns, and anomalies.
- Visualization using Matplotlib and Seaborn.
- Line charts for time-series trends in total_cases, total_deaths, and hospitalization_rate.
- Histograms and boxplots for ICU rates and hospitalization distributions.
- Use StandardScaler to normalize the features before applying a classifier.

In [None]:
#Code for task 1

Hello


### Task 2: Correlation and Statistical Analysis (Data subset)
- Compute Pearson correlation coefficients between numerical variables, focusing on icu_requirement and other derived features.
- Create a heatmap to visualize the full correlation matrix, highlighting features with strong relationships (e.g., |r| > 0.5).
- Scatter plots with regression lines for correlated features.
- Use Chi-Square tests to examine associations between categorical variables
- Tables summarizing ANOVA/Kruskal-Wallis results, with p-values and effect sizes for key variables.


In [None]:
#Code for task 2

### Task 3: Predictive Modeling: Decision Tree (Data Subset)
- Perform 5-fold cross-validation for each of the 5 max depths, compute accuracy, precision, recall, macro-F1, and micro-F1, and find which max depth works best for predicting the icu requirement.
- Testing max_depth with 3, 5, 10, 15, 20.
- Use features like icu_rate, case_fatality_rate, hospitalization_rate, etc., to predict how many ICU patients will be needed on a given day.
- Visualize the overall best Decision Tree for ICU Patient Prediction and analyze the trade-off between different tree depth choices (Gini/Entropy).
- Performance metric comparisons.


In [None]:
#Code for task 3

### Task 4: Predictive Modeling: KNN (Data Subset)
- Perform 5-fold cross-validation for each candidate number of neighbors and compute accuracy, precision, recall, macro-F1, and micro-F1 on predicting the icu requirement, and determine which number of neighbors works best.
- Using different numbers of neighbors (e.g. 2, 5, 10, 50)
- Analyze how the different numbers of neighbors affect the performance of the model.
- Analyze the trade-offs between different choices for the number of neighbors.
- Performance metric comparisons. 

In [None]:
#Code for task 4

### Task 5: Predictive Modeling: SVM (Data Subset)
- Perform 5-fold cross-validation for each of the 4 kernel functions and compute accuracy, precision, recall, macro-F1, and micro-F1 on predicting the icu requirement and research which kernel function works the best.
- Using different kernels: Linear, Polynomial, RBF, Sigmoid.
- Compare kernel function performances and explain their impact.
- Analyze trade-offs between different kernel choices.
- Performance metric comparisons.

In [None]:
#Code for task 5

### Task 6: Model Comparison Methodology (Data Subset)
- Compare best-performing models from Decision Tree, KNN, and SVM.
- Training Time: Record and analyze the computational complexity and efficiency of each model.
- Performance Metrics: Compute and compare accuracy, precision, recall, macro-F1, and micro-F1 across models.
- Analyze the outcome of the comparison.

In [None]:
#Code for task 6

### Task 7: Regional Pattern Analysis and Comparison (On the continent attribute of the Full data set)
- Investigate patterns and trends across different regions or states in the original full dataset.
- Segment the data by geographical regions (continent).
- Use clustering techniques (e.g., K-Means or hierarchical clustering) to group regions with similar COVID-19 characteristics (e.g., total_cases, total_deaths, hosp_patients, etc.).

In [None]:
#Code for task 7


### Task 8: Advanced Feature Derivation and Preprocessing (Full dataset)
- Generate time-based features such as infection growth rates, recovery rates, and rolling averages.
- Apply quantile-based discretization for continuous variables (e.g., hospitalization rates).
- Use Principal Component Analysis (PCA) to reduce dimensionality and derive composite features (e.g., healthcare capacity).

In [None]:
#Code for task 8

### Task 9: Time Series Forecasting Using Deep Learning
- Forecast key metrics (e.g., ICU requirements, total cases) using state-of-the-art time series models.
- Implement Long Short-Term Memory (LSTM) networks to capture temporal dependencies.
- Evaluate models using metrics such as Mean Absolute Error (MAE) and Root Mean Squared Error (RMSE).

In [None]:
#Code for task 9