In [20]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Problem statement: How many people are at higher risk from COVID-19 based on their location (by NYC borough)?

In [21]:
# Location of the daily COVID-19 counts CSV (cases, hospitalizations, deaths).
COVID_CSV_PATH = '/content/COVID-19_Daily_Counts_of_Cases__Hospitalizations__and_Deaths.csv'

# Read the raw CSV into a DataFrame; later cells derive everything from this frame.
covid_data = pd.read_csv(COVID_CSV_PATH)

In [22]:
# Print a structural summary of the frame: row count, column names,
# non-null counts and dtypes.
covid_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1915 entries, 0 to 1914
Data columns (total 55 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   date_of_interest                 1915 non-null   object
 1   CASE_COUNT                       1915 non-null   int64 
 2   PROBABLE_CASE_COUNT              1915 non-null   int64 
 3   HOSPITALIZED_COUNT               1915 non-null   int64 
 4   DEATH_COUNT                      1915 non-null   int64 
 5   CASE_COUNT_7DAY_AVG              1915 non-null   int64 
 6   ALL_CASE_COUNT_7DAY_AVG          1915 non-null   int64 
 7   HOSP_COUNT_7DAY_AVG              1915 non-null   int64 
 8   DEATH_COUNT_7DAY_AVG             1915 non-null   int64 
 9   BX_CASE_COUNT                    1915 non-null   int64 
 10  BX_PROBABLE_CASE_COUNT           1915 non-null   int64 
 11  BX_HOSPITALIZED_COUNT            1915 non-null   int64 
 12  BX_DEATH_COUNT                   1

In [23]:
# Summarize and preview the already-loaded data (nothing is loaded here).
# A notebook cell only renders its final expression, so the summary
# statistics are shown explicitly with display(); a bare describe() call
# in the middle of the cell would be computed and then thrown away.
display(covid_data.describe())

# First five rows, rendered as the cell's output.
covid_data.head()

Unnamed: 0,date_of_interest,CASE_COUNT,PROBABLE_CASE_COUNT,HOSPITALIZED_COUNT,DEATH_COUNT,CASE_COUNT_7DAY_AVG,ALL_CASE_COUNT_7DAY_AVG,HOSP_COUNT_7DAY_AVG,DEATH_COUNT_7DAY_AVG,BX_CASE_COUNT,...,SI_CASE_COUNT,SI_PROBABLE_CASE_COUNT,SI_HOSPITALIZED_COUNT,SI_DEATH_COUNT,SI_PROBABLE_CASE_COUNT_7DAY_AVG,SI_CASE_COUNT_7DAY_AVG,SI_ALL_CASE_COUNT_7DAY_AVG,SI_HOSPITALIZED_COUNT_7DAY_AVG,SI_DEATH_COUNT_7DAY_AVG,INCOMPLETE
0,02/29/2020,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,03/01/2020,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,03/02/2020,0,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,03/03/2020,1,0,8,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,03/04/2020,5,0,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Source columns for each borough, ordered as (cases, hospitalizations, deaths).
boroughs = {
    "Bronx": ["BX_CASE_COUNT", "BX_HOSPITALIZED_COUNT", "BX_DEATH_COUNT"],
    "Brooklyn": ["BK_CASE_COUNT", "BK_HOSPITALIZED_COUNT", "BK_DEATH_COUNT"],
    "Manhattan": ["MN_CASE_COUNT", "MN_HOSPITALIZED_COUNT", "MN_DEATH_COUNT"],
    "Queens": ["QN_CASE_COUNT", "QN_HOSPITALIZED_COUNT", "QN_DEATH_COUNT"],
    "Staten Island": ["SI_CASE_COUNT", "SI_HOSPITALIZED_COUNT", "SI_DEATH_COUNT"],
}

# One slice per borough: select the date plus that borough's three columns,
# give them the shared generic names, and tag the rows with the borough label.
long_format_data = [
    covid_data[["date_of_interest", *cols]]
    .set_axis(["date", "cases", "hospitalizations", "deaths"], axis=1)
    .assign(borough=borough)
    for borough, cols in boroughs.items()
]

# Stack the slices into a single long-format frame with a fresh index,
# converting the date strings to datetimes along the way.
df_long = pd.concat(long_format_data, ignore_index=True).assign(
    date=lambda frame: pd.to_datetime(frame["date"])
)


In [25]:
# Calendar features derived from the timestamp.
# dayofweek runs from 0 (Monday) to 6 (Sunday).
df_long = df_long.assign(
    month=df_long["date"].dt.month,
    weekday=df_long["date"].dt.dayofweek,
)

# Swap the text 'borough' column for indicator columns. drop_first=True
# leaves out the first borough level, which then acts as the implicit baseline.
df_long = pd.get_dummies(data=df_long, columns=["borough"], drop_first=True)


In [26]:


# Target is the death count; predictors are every other column except the
# raw date (the model is not meant to predict by exact date).
y = df_long["deaths"]
X = df_long.drop(columns=["date", "deaths"])

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [27]:
# Fit an ordinary least-squares regression on the training rows
# (fit() returns the estimator itself, so construction and fitting chain).
model = LinearRegression().fit(X_train, y_train)

# Predicted death counts for the held-out rows.
y_pred = model.predict(X_test)

In [28]:

# Score the held-out predictions.
# - MSE: average squared gap between actual and predicted deaths (lower is better).
# - R^2: share of the variance in deaths that the model explains (closer to 1 is better).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics, one per line.
for label, value in (("Mean Squared Error:", mse), ("R^2 Score", r2)):
    print(label, value)


Mean Squared Error: 88.30950826672422
R^2 Score 0.6825670657814715
