In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# I left my question here to give you some reference:

Author: Wesley Oke

# Question:
## Can Machine Learning predict how neighbourhoods will be impacted by COVID?

Approach:
1. Calculate the total number of COVID cases for each neighbourhood.
2. Identify or create appropriate neighbourhood features that might impact the number of cases (density, wealth, age data, etc).
3. Apply linear regression to determine how the various neighbourhood features (risk factors) impact the number of COVID cases in the neighbourhood.
4. Apply neural networks and random forests to determine how well they can predict the number of cases in each neighbourhood.

## Load Toronto's COVID data

In [None]:
# you can go to https://github.com/oke-w/covid to see the data sets that are used below

In [None]:
# Read the COVID case records; 'Yes'/'No' cells are parsed as True/False.
# The neighbourhood column is renamed to 'Neighbourhood'.
cov19 = pd.read_csv(
    'https://raw.githubusercontent.com/oke-w/covid/main/COVID19%20cases.csv',
    true_values=['Yes'],
    false_values=['No'],
).rename(columns={'Neighbourhood Name': 'Neighbourhood'})

Note: three Neighbourhood names in the COVID case data are spelled differently from the Neighbourhood data set, so they are edited below to match it. The corrected names are:
- "Danforth East York"
- "Weston-Pelham Park"
- "Briar Hill-Belgravia"

In [None]:
# Rewrite the three neighbourhood spellings in one pass; every other value
# (including missing ones) is left untouched.
cov19['Neighbourhood'] = cov19['Neighbourhood'].replace({
    "Weston-Pellam Park": "Weston-Pelham Park",
    "Briar Hill - Belgravia": "Briar Hill-Belgravia",
    "Danforth-East York": "Danforth East York",
})

In [None]:
# Summarise the COVID case table: column names, dtypes and non-null counts.
cov19.info()

In [None]:
# Count COVID cases per neighbourhood (a Series indexed by Neighbourhood).
# Cases with no identified Neighbourhood are dropped before counting.
cov_count = (
    cov19
    .dropna(subset=['Neighbourhood'])
    .groupby('Neighbourhood')['_id']
    .count()
)

In [None]:
# Build a DataFrame with one row per Neighbourhood holding the COVID case count
# and the hospitalised count.
# Select 'Ever Hospitalized' BEFORE aggregating: calling sum() on the whole
# grouped frame would also try to sum every other column of cov19 (text, dates,
# ...), which is wasted work and can raise or concatenate strings on recent
# pandas versions.
# NOTE(review): summing gives a count of hospitalised cases only if the column
# was parsed as boolean on load - confirm with cov19.info().
hosp_count = cov19.groupby('Neighbourhood')['Ever Hospitalized'].sum()
cov_counts = pd.concat([cov_count, hosp_count], axis=1)
cov_counts.columns = ['cov_counts', 'hosp_counts']

In [None]:
# Display the per-neighbourhood case and hospitalisation counts.
cov_counts

In [None]:
# Check the row count, dtypes and non-null counts of the combined counts table.
cov_counts.info()

## Load Toronto 2016 Neighbourhood data

In [None]:
# Read the 2016 neighbourhood profile table: header row at index 2, first column
# used as the index, ',' treated as a thousands separator.
# Two columns are then given shorter names.
git = 'https://raw.githubusercontent.com/oke-w/covid/main/neighbourhood-profiles-2016-cleaned-short.csv'
nbhd = pd.read_csv(git, header=[2], index_col=[0], thousands=',').rename(
    columns={
        'Neighbourhood Name': 'Neighbourhood',
        'Population, 2016': 'Population',
    }
)

In [None]:
# Summarise the neighbourhood profile table: column names, dtypes and non-null counts.
nbhd.info()

In [None]:
# Preview the first rows of the neighbourhood profile table.
nbhd.head()

## Merge COVID & Neighbourhood data sets

In [None]:
# Combine the COVID counts (indexed by neighbourhood name) with the neighbourhood
# profiles, matching the counts' index against the profiles' 'Neighbourhood' column.
nbhd_cov = cov_counts.merge(
    nbhd,
    left_index=True,
    right_on='Neighbourhood',
)

In [None]:
# Columns from position 7 on ('Children (0-14 years)' onward) hold population counts;
# express each of them as a share of the neighbourhood's total population.
def _per_capita(counts):
    """Divide one column of counts by the neighbourhood 'Population' column."""
    return counts / nbhd_cov['Population']

x1 = nbhd_cov.iloc[:, 7:].apply(_per_capita)
# the first 7 columns are kept unchanged and the normalised columns are appended after them
nbhd_cov_n = pd.concat([nbhd_cov.iloc[:, :7], x1], axis=1)
nbhd_cov_n  # should have 140 rows (one for each neighbourhood in Toronto)

In [None]:
# Check the row count, dtypes and non-null counts of the normalised table.
nbhd_cov_n.info()

## Simple Linear Regression: Population

In [None]:
# Simple linear regression with a single feature: the neighbourhood population.
# Double-bracket selection keeps the feature two-dimensional (n_rows x 1), which
# is the shape the model expects.
X_OLS1 = nbhd_cov[['Population']].values
# target: COVID case count per neighbourhood
y = nbhd_cov['cov_counts']

In [None]:
# Fit a population-only trend line for neighbourhood COVID counts and report its
# RMSE on the same data it was fitted to.
nbhdpop_cov_LR = LinearRegression()
nbhdpop_cov_LR.fit(X_OLS1, y)

y_pred = nbhdpop_cov_LR.predict(X_OLS1)

nbhdpop_cov_LR_mse = mean_squared_error(y, y_pred)
nbhdpop_cov_LR_rmse = np.sqrt(nbhdpop_cov_LR_mse)
print(nbhdpop_cov_LR_rmse)

In [None]:
# check the statistical validity of the model
# statsmodels' OLS does not add an intercept on its own, so a constant column is
# added explicitly. Without it this would fit a regression through the origin,
# which is not the model that LinearRegression (intercept fitted by default)
# produced from the same X_OLS1 / y.
OLS_simple = sm.OLS(y, sm.add_constant(X_OLS1)).fit()
OLS_simple.summary()

-----
This result (P=0) indicates that Population is a statistically significant predictor of the number of COVID cases, as expected.

In [None]:
# Scatter each neighbourhood's COVID count against its population.
fig, ax = plt.subplots(figsize=(16, 8))
nbhd_cov.plot.scatter(
    x="Population",
    y="cov_counts",
    ax=ax,
    title='Count of COVID Cases vs. Population for each Toronto Neighbourhood',
)
# Overlay the fitted linear-regression trend line in red on the same axes.
ax.plot(X_OLS1, nbhdpop_cov_LR.predict(X_OLS1), 'r-');