In [None]:
import boto3

In [None]:
# Open an S3 resource handle (reused by later cells) and print the name of
# every bucket it can list.
s3 = boto3.resource("s3")

bucket_names = [bucket.name for bucket in s3.buckets.all()]
for bucket_name in bucket_names:
    print(bucket_name)

In [None]:
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.

import os # read dotenv values

# Bucket name and raw CSV object name, both read from the environment.
BUCKET_NAME = os.environ.get("BUCKET_NAME")
FILE_NAME = os.environ.get("CSV_NAME")
print(BUCKET_NAME)
print(FILE_NAME)

# os.environ.get() returns None for an unset variable; stop here with a clear
# message instead of building "s3://None/None" in a later cell.
missing_settings = [
    setting
    for setting, value in (("BUCKET_NAME", BUCKET_NAME), ("CSV_NAME", FILE_NAME))
    if not value
]
if missing_settings:
    raise RuntimeError(
        f"missing environment variable(s): {', '.join(missing_settings)} "
        "(define them in .env or the process environment)"
    )

# read the content of data bucket (uses the `s3` resource created in an earlier cell)
bucket = s3.Bucket(BUCKET_NAME)


# list all files in this bucket
for obj in bucket.objects.all():
    print(obj.key)


In [None]:
# read csv from s3
import csv
from urllib.parse import urlparse
from io import BytesIO
import pandas as pd

def from_s3(s3_uri:str) -> pd.DataFrame:
    """Download a CSV object from S3 and parse it into a DataFrame.

    Args:
        s3_uri: Object location in the form ``s3://<bucket>/<key>``.

    Returns:
        The object's content parsed by ``pd.read_csv`` with default options.

    Raises:
        ValueError: If ``s3_uri`` does not use the ``s3`` scheme or lacks a
            bucket or a key.
    """
    parsed_s3 = urlparse(s3_uri)
    bucket_name = parsed_s3.netloc
    # The URI path starts with "/", which is not part of the object key.
    key = parsed_s3.path.lstrip("/")
    if parsed_s3.scheme != "s3" or not bucket_name or not key:
        raise ValueError(f"expected an URI like s3://<bucket>/<key>, got {s3_uri!r}")

    client = boto3.client("s3")
    obj = client.get_object(Bucket=bucket_name, Key=key)
    # The body is read fully into memory so pandas receives a seekable buffer.
    csv_in_bytes = BytesIO(obj["Body"].read())
    return pd.read_csv(csv_in_bytes)


# Build the s3:// URI of the raw CSV from the configured bucket and object
# name, then load it into the working DataFrame `df`.
data_location = f"s3://{BUCKET_NAME}/{FILE_NAME}"
df = from_s3 (data_location)



In [None]:
# initial data exploration to test if s3 is working:
# display the first 100 rows of the frame loaded from the bucket
df.head(100)


In [None]:
# column names, dtypes and non-null counts of the loaded frame
df.info(show_counts=True)

In [None]:
# Let pandas display as many columns as the frame has, so describe() is not
# truncated, then show the frame's shape and summary statistics.
pd.set_option("display.max_columns", df.shape[1])
print(df.shape)
df.describe()

In [None]:
# Columns removed from the working frame before the analysis.
cols_to_drop = [
    "City or Regency",
    "Time Zone",
    "Country",
    "Continent",
    "Province",
    "Location ISO Code",
    "Total Regencies",
    "Island",
    "Special Status",
    "Longitude",
    "Latitude",
    "Location Level",
    "Area (km2)",
]
df = df.drop(columns=cols_to_drop)
print(df.shape)

In [None]:
# Drop the rows whose Location is "Indonesia", keeping every other location.
indo_rows_to_drop = df[df["Location"].eq("Indonesia")]
index_to_delete = indo_rows_to_drop.index
df = df.drop(index=index_to_delete)
df.head(20)

In [None]:
# clean total rural village for jakarta:
# set "Total Rural Villages" to 0 on every "DKI Jakarta" row with a single
# boolean-mask .loc assignment (presumably the value is missing for the
# capital in the raw data -- TODO confirm against the source CSV)

df.loc[df["Location"] == "DKI Jakarta", "Total Rural Villages"] = 0

df.head(20)


In [None]:
# Convert the percentage strings (e.g. "3.5%") of the two rate columns into
# float fractions. The cell expects string columns, so it can only run once
# per freshly loaded frame.
col_with_percent = ["Case Fatality Rate", "Case Recovered Rate"]
print(df[col_with_percent].head())
for percent_col in col_with_percent:
    without_sign = df[percent_col].str.rstrip('%')
    df[percent_col] = without_sign.astype(float) / 100.0
df.head()


In [None]:
# clean na data
# number of missing values per column
df.isna().sum()


In [None]:
# Missing "Total Urban Villages" / "Total Cities" counts become 0; every other
# column is left untouched.
df = df.fillna({"Total Urban Villages": 0, "Total Cities": 0})

# Remaining missing values per column, to compare with the counts shown before.
df.isna().sum()


In [None]:
# Interpolate missing numeric values (notably the growth-factor columns)
# linearly, filling forward only. The interpolation is restricted to numeric
# columns: interpolating object columns is deprecated in recent pandas.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df[numeric_cols].interpolate(method='linear', limit_direction='forward')

# Forward interpolation cannot fill a gap at the very first row, so that row's
# growth factors are set to 0. The first row is addressed through df.index[0]
# rather than the literal label 0: rows were dropped earlier, and assigning to
# a missing label would silently append a new row.
first_row = df.index[0]
df.loc[first_row, "Growth Factor of New Cases"] = 0
df.loc[first_row, "Growth Factor of New Deaths"] = 0

# round the growth factors to 3 decimals
growth_cols = ["Growth Factor of New Cases", "Growth Factor of New Deaths"]
df[growth_cols] = df[growth_cols].round(3)
df.head(10)

In [None]:
# Parse the "Date" column into pandas datetime values.
df = df.assign(Date=pd.to_datetime(df["Date"]))
df.head()

In [None]:
# upload to bucket
# save the cleaned csv locally
# file name (without extension) comes from CSV_CLEANED_NAME, defaulting to "result"
result_file_name: str = os.environ.get("CSV_CLEANED_NAME", "result")
# the file is written into the current working directory
path=f"{os.getcwd()}/{result_file_name}.csv"
print(path)
# to_csv is called with its defaults, so the DataFrame index is written as the first column
df.to_csv(path)

# upload csv to s3 bucket 
def to_s3(s3_uri:str, object_name:str = None, local_path:str = None) -> None:
    """Upload a local file to S3.

    Args:
        s3_uri: Destination in the form ``s3://<bucket>/<key>``.
        object_name: Key to store the file under; defaults to the key part of
            ``s3_uri``.
        local_path: Path of the local file to upload; defaults to the key part
            of ``s3_uri``, i.e. a file of that name relative to the current
            working directory.

    Raises:
        ValueError: If ``s3_uri`` does not use the ``s3`` scheme or lacks a
            bucket or a key.
    """
    parsed_s3 = urlparse(s3_uri)
    bucket_name = parsed_s3.netloc
    # The URI path starts with "/", which is not part of the object key.
    file_name = parsed_s3.path.lstrip("/")
    if parsed_s3.scheme != "s3" or not bucket_name or not file_name:
        raise ValueError(f"expected an URI like s3://<bucket>/<key>, got {s3_uri!r}")

    if object_name is None:
        object_name = file_name
    if local_path is None:
        local_path = file_name

    client = boto3.client("s3")
    # upload_file has no return value; report the destination instead.
    client.upload_file(local_path, Bucket=bucket_name, Key=object_name)
    print(f"uploaded {local_path} to s3://{bucket_name}/{object_name}")


# upload the cleaned csv to the s3 bucket; by default to_s3 uses the key part
# of the URI ("<result_file_name>.csv") as the local file name as well
upload_location = f"s3://{BUCKET_NAME}/{result_file_name}.csv"
to_s3 (upload_location)


In [None]:
# initial graphing of data
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
# sum "Total Cases" and "Total Deaths" over all rows that share a date
n_by_date = df.groupby('Date')[["Total Cases", "Total Deaths"]].sum()
n_by_date.head(10)



In [None]:
import numpy as np

# Twin-axis line chart of the per-date sums: deaths on the left y-axis (red),
# cases on the right y-axis (blue). Expects n_by_date to hold the
# "Total Cases" / "Total Deaths" sums computed in the previous cell.
fig, ax = plt.subplots(figsize=(12, 6))

x = n_by_date["Total Cases"]
y = n_by_date["Total Deaths"]

# Upper limit of the deaths axis: the largest case count divided by x_scaler.
x_scaler=10
x_max_scaled = x.max() /x_scaler

# Deaths on the primary axis, tick labels in the line's colour
ax.set_ylim(0,x_max_scaled)
ax.plot(y, color='red', label = "Total Death")
ax.tick_params(axis='y', labelcolor='red')

# Second y-axis sharing the same x-axis
ax2 = ax.twinx()

# Cases on the secondary axis, plain (non-offset, non-scientific) tick labels
ax2.plot(x, color='blue', label= "Total Cases")
ax2.tick_params(axis='y', labelcolor='blue')
ax2.ticklabel_format(useOffset=False, axis="y", style="plain")

ax2.set_ylabel('Total Cases')
# The deaths label belongs on the primary y-axis; it used to be set as the
# x-label of the twin axis, leaving the red axis unlabeled.
ax.set_ylabel("Total Death")
ax.set_xlabel('Date (by month)')

fig.legend(loc = 'upper left')
plt.title("Total Death and Total Cases by month")
plt.show()

In [None]:
# new death and cases
# sum "New Cases" and "New Deaths" over all rows that share a date
# (this rebinds n_by_date, replacing the "Total ..." sums of the earlier cell)
n_by_date = df.groupby('Date')[["New Cases", "New Deaths"]].sum()
n_by_date.head(50)  # not the last expression of the cell, so nothing is displayed


fig, ax = plt.subplots(figsize=(12, 6))

x_label, y_label = ["New Cases", "New Deaths"]

x = n_by_date[x_label]
y = n_by_date[y_label]

# upper limit of the deaths axis: the largest case count divided by x_scaler
x_scaler=10
x_max_scaled = x.max() /x_scaler

# Plot deaths on the primary y-axis and colour its tick labels like the line
ax.set_ylim(0,x_max_scaled)
ax.plot(y, color='red', label = y_label)
ax.tick_params(axis='y', labelcolor='red')

# Generate a new Axes instance, on the twin-X axes (same position)
ax2 = ax.twinx()

# Plot cases on the secondary y-axis with plain (non-offset) tick labels
ax2.plot(x, color='blue', label= x_label)
ax2.tick_params(axis='y', labelcolor='blue')
ax2.ticklabel_format(useOffset=False, axis="y", style="plain")

ax2.set_ylabel(x_label)
ax.set_ylabel(y_label)
ax.set_xlabel('Date (by month)')

fig.legend(loc = 'upper left')
plt.title(f"{x_label} and {y_label} by month")
plt.show()


In [None]:
# group data by date and region: one row per (Date, Location) pair

n_by_date = df.groupby(['Date', 'Location'])[["New Cases", "New Deaths"]].sum()
print(type(n_by_date))
print(n_by_date.columns)

# all unique locations
# all_locations = df["Location"].unique()
# print(all_locations)
# show the (Date, Location) row(s) holding the largest "New Cases" value
print(n_by_date.loc[n_by_date['New Cases']==n_by_date['New Cases'].max()])

# unstack() moves the Location index level into columns, giving one line per location
fig, ax = plt.subplots(figsize=(20,15))
n_by_date["New Cases"].unstack().plot(ax=ax)

plt.show()

In [None]:
# new death by locations
# NOTE: relies on n_by_date being grouped by (Date, Location), as done in the previous cell

fig, ax = plt.subplots(figsize=(20,15))
# unstack() moves the Location index level into columns, giving one line per location
n_by_date["New Deaths"].unstack().plot(ax=ax)
# show the (Date, Location) row(s) holding the largest "New Deaths" value
print(n_by_date.loc[n_by_date['New Deaths']==n_by_date['New Deaths'].max()])

plt.show()

In [None]:
# Horizontal bar chart of the summed "New Cases" and "New Deaths" per location.

n_by_loc = df.groupby(['Location'])[["New Cases", "New Deaths"]].sum()


fig, ax = plt.subplots(figsize=(25, 20))

x_label, y_label = ["New Cases", "New Deaths"]

x = n_by_loc[x_label]
y = n_by_loc[y_label]

# Death bars are drawn x_scaler times longer than their value so they stay
# visible next to the case bars; the top axis and the bar labels undo it.
x_scaler=10
x_max_scaled = x.max() / x_scaler


# two bars per location: deaths at `ind`, cases right above at `ind + width`
ind = np.arange(len(n_by_loc))
width = 0.4

death_bars = ax.barh(ind, y*x_scaler, width, color='red', label = y_label)
case_bars = ax.barh(ind+width, x, width, color='blue', label = x_label)
ax.tick_params(axis='x', labelcolor='blue')
ax.set(yticks=ind + width, yticklabels=n_by_loc.index, ylim=[2*width - 1, len(n_by_loc)])
ax.legend()


ax.set_xlabel(f"{x_label} by Location")


# Freeze the case axis, then give the top (deaths) axis exactly the same range
# divided by x_scaler, so a death bar's end lines up with its value on top.
case_axis_min, case_axis_max = ax.get_xlim()
ax.set_xlim(case_axis_min, case_axis_max)

ax2 = ax.twiny()
ax2.set_xlim(case_axis_min / x_scaler, case_axis_max / x_scaler)
# one tick every 10000 deaths
x_death_range = np.arange(0, x_max_scaled, 10000)
ax2.set_xticks(x_death_range)
ax2.tick_params(axis='x', labelcolor='red')
ax2.set_xlabel(f"{y_label} by Location")


# Value label at the end of every bar. Only the death bars were stretched by
# x_scaler, so only their length is divided back; case bars show their length.
for bars, bar_scale in ((death_bars, x_scaler), (case_bars, 1)):
    for bar in bars:
        bar_length = bar.get_width()
        ax.text(bar_length,
                bar.get_y() + bar.get_height() / 2,
                str(round(bar_length / bar_scale)),
                fontsize=8,
                color='grey')

plt.title(f"{x_label} and {y_label} by Location")

plt.show()

In [None]:
# total "New Cases" / "New Deaths" per location
n_by_loc = df.groupby(['Location'])[["New Cases", "New Deaths"]].sum()
location_pop = df[['Location', 'Population Density']]
# keep the first "Population Density" row found for each location
location_pop=location_pop.drop_duplicates(subset=['Location'])

# combine dataframes
result_df = n_by_loc.merge(location_pop, how = 'inner', on = ['Location'])
result_df.head(40)  # not the last expression of the cell, so nothing is displayed

# correlation coefficients between the columns;
# np.corrcoef returns the full 2x2 correlation matrix, which is what gets printed
import numpy as np
death = result_df['New Deaths']
case = result_df['New Cases']
density = result_df['Population Density']
r_death_loc = np.corrcoef(density, death)
print(f'the coefficient between density and death is {r_death_loc}')
r_death_case = np.corrcoef(case, death)
print(f'the coefficient between case and death is {r_death_case}')
r_case_loc = np.corrcoef(density, case)
print(f'the coefficient between density and case is {r_case_loc}')


In [None]:
# total "New Cases" / "New Deaths" per location
n_by_loc = df.groupby(['Location'])[["New Cases", "New Deaths"]].sum()

# One row per location with its administrative-unit counts. The explicit
# copy() makes urban_df an independent frame, so the column assignments below
# cannot trigger pandas' SettingWithCopyWarning.
urban_df = (
    df[['Location', 'Total Cities', 'Total Districts', 'Total Urban Villages', 'Total Rural Villages']]
    .drop_duplicates(subset=['Location'])
    .copy()
)

# rural_ratio: rural villages as a share of all counted administrative units;
# urban_ratio is its complement.
total_units = (
    urban_df['Total Rural Villages']
    + urban_df['Total Districts']
    + urban_df['Total Urban Villages']
    + urban_df['Total Cities']
)
urban_df['rural_ratio'] = urban_df['Total Rural Villages'] / total_units
urban_df['urban_ratio'] = 1- urban_df['rural_ratio']

# combine dataframes
result_df = n_by_loc.merge(urban_df, how = 'inner', on = ['Location'])

# correlation coefficients between the columns;
# np.corrcoef returns the full 2x2 correlation matrix, which is what gets printed
import numpy as np
death = result_df['New Deaths']
case = result_df['New Cases']
urban_rate = result_df['urban_ratio']
rural_rate = result_df['rural_ratio']

r_death_urban = np.corrcoef(urban_rate, death)
print(f'the coefficient between urban_rate and death is {r_death_urban}')

r_death_rural = np.corrcoef(rural_rate, death)
print(f'the coefficient between rural_rate and death is {r_death_rural}')

# NOTE(review): despite their names, the next two variables hold the
# urban-vs-case and rural-vs-case matrices (the printed labels are right).
r_death_case = np.corrcoef(urban_rate, case)
print(f'the coefficient between urban_rate and case is {r_death_case}')
r_case_loc = np.corrcoef(rural_rate, case)
print(f'the coefficient between rural and case is {r_case_loc}')


In [None]:
# Attach each location's population density to result_df.
# keep the first "Population Density" row found for each location
location_pop = df[['Location', 'Population Density']].drop_duplicates(subset=['Location'])

# Any "Population Density" column already on result_df is dropped first, so
# running this cell again does not produce "_x"/"_y" duplicates of the column.
result_df = (
    result_df
    .drop(columns=['Population Density'], errors='ignore')
    .merge(location_pop, how = 'inner', on = ['Location'])
)
result_df.head(40)

In [None]:
# gaussian_kde is imported from the public scipy.stats namespace; the
# scipy.stats.kde module it used to come from is deprecated.
from scipy.stats import gaussian_kde
from numpy import linspace

# Number of points at which the density estimate is evaluated. A fixed grid
# keeps the curve's resolution independent of the magnitude of the data.
KDE_GRID_POINTS = 1000

death = result_df['New Deaths']
median_death = death.median()
print(median_death)
# estimate the probability density function (PDF)
kde = gaussian_kde(death)
# evenly spaced evaluation points between the smallest and largest value
dist_space = linspace(min(death), max(death), KDE_GRID_POINTS)
# plot the results
plt.plot(dist_space, kde(dist_space))
plt.ticklabel_format(useOffset=False, axis="y", style="plain")
plt.xlabel("New Deaths")
plt.ylabel("Estimated density")



In [None]:
# display the cleaned frame
df

In [None]:
# One-hot encode "Location": one indicator column per location replaces the
# original column, and the listed derived columns are removed as well.
# The cell rebinds `df`, so it needs the "Location" column and cannot be run
# twice on the same frame.
one_hot_location = pd.get_dummies(df['Location'])
col_to_drop = [
    'Total Deaths per 100rb',
    'Growth Factor of New Cases',
    'Growth Factor of New Deaths',
    'Total Deaths per Million',
    'Total Active Cases',
    'Population',
    'New Cases per Million',
    'Total Cases per Million',
    'New Deaths per Million',
]
df = (
    df
    .drop(columns='Location')
    .join(one_hot_location)
    .drop(columns=col_to_drop)
)


In [None]:
# first rows of the frame after the one-hot encoding
df.head()

In [None]:
# Scatter plot of each location's rural ratio against its summed new deaths,
# every point annotated with the location name.
fig, ax = plt.subplots(figsize=(25, 20))

x_label, y_label = ["rural_ratio", "New Deaths"]

x = result_df[x_label]
y = result_df[y_label]

ax.scatter(x, y, s=80)

# Pair names and coordinates row by row; this does not depend on result_df's
# index labels, unlike looking values up with a running counter.
for txt, x_pos, y_pos in zip(result_df['Location'], x, y):
    ax.annotate(txt, (x_pos, y_pos))

ax.set_xlabel(x_label)
ax.set_ylabel(y_label)
ax.set_title("rural area and total death")
plt.show()

In [None]:
# Scatter plot of each location's urban ratio against its summed new deaths.
fig, ax = plt.subplots(figsize=(25, 20))

x_label, y_label = ["urban_ratio", "New Deaths"]
x, y = result_df[x_label], result_df[y_label]

ax.scatter(x, y)
ax.set_title("urban area and total covid death")
plt.show()

In [None]:
# Share of locations whose rural_ratio exceeds 0.7.
is_mostly_rural = result_df["rural_ratio"].gt(0.7)
rural_percentage = result_df[is_mostly_rural]
total_rural_locations = rural_percentage.shape[0]

print(f"in indonisa {total_rural_locations / len(result_df) :.2%} percent are in rural area, as the city to village ratio is > 0.7")

In [None]:
# Copy of result_df without the capital's row. reset_index() renumbers the
# rows from 0 and keeps the old index as an "index" column.
is_not_capital = result_df["Location"] != 'DKI Jakarta'
result_df_new = result_df.loc[is_not_capital].reset_index()
result_df_new.head(20)

In [None]:
# Urban ratio against population density (capital excluded), with reference
# lines at the median of each axis.
fig, ax = plt.subplots(figsize=(25, 20))

x_label, y_label = ["urban_ratio", "Population Density"]
x, y = result_df_new[x_label], result_df_new[y_label]

ax.scatter(x, y)
# result_df_new carries a 0..n-1 index, so each label doubles as the row number
for i, txt in result_df_new['Location'].items():
    ax.annotate(txt, (x.loc[i], y.loc[i]))

# horizontal / vertical line at the median of y / x
median_pop_density = result_df_new["Population Density"].median()
median_urban_rate = result_df_new["urban_ratio"].median()
ax.axhline(y=median_pop_density)
ax.axvline(x=median_urban_rate)

ax.set_title("urban area and population density")
plt.show()

In [None]:
# Urban ratio against population density (capital excluded), with reference
# lines at the MEAN of each axis.
fig, ax = plt.subplots(figsize=(25, 20))

x_label, y_label = ["urban_ratio", "Population Density"]

x = result_df_new[x_label]
y = result_df_new[y_label]

plt.scatter (x, y)
for i, txt in enumerate(result_df_new['Location']):
    ax.annotate(txt, (x[i], y[i]))

# The means get their own names; they used to be stored in median_pop_density /
# median_urban_rate, silently replacing the medians computed for the previous plot.
mean_pop_density = result_df_new["Population Density"].mean()
mean_urban_rate = result_df_new["urban_ratio"].mean()
plt.axhline(y=mean_pop_density)
plt.axvline(x=mean_urban_rate)

plt.title ("urban area and population density")
plt.show()

In [None]:
# Linear regression of "New Cases" on population density and urban ratio,
# fitted on one half of the locations and evaluated on the other half
# (result_df_new excludes the capital).
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mean_pop_density = result_df_new["Population Density"].mean()
mean_urban_rate = result_df_new["urban_ratio"].mean()

# the split thresholds below are means, so they are printed as means
print(f"mean pop density: {mean_pop_density} and mean urban rate is : {mean_urban_rate}")

# split the locations at the mean urban ratio ("left" = below the mean)
mask_urban = result_df_new['urban_ratio'] < mean_urban_rate
left_percentile_urban =result_df_new[mask_urban]
right_percentile_urban = result_df_new[~mask_urban]

# split each side again at the mean population density ("top" = below the mean)
mask_pop_left = left_percentile_urban['Population Density'] < mean_pop_density
mask_pop_right = right_percentile_urban['Population Density'] < mean_pop_density
left_percentile_urban_pop_top = left_percentile_urban[mask_pop_left]
left_percentile_urban_pop_down = left_percentile_urban[~mask_pop_left]

# shuffle each group reproducibly and cut it into a train and a test half
left_case_down_train, left_case_down_test = np.array_split(left_percentile_urban_pop_down.sample(frac=1, random_state=42), 2)
left_case_top_train, left_case_top_test = np.array_split(left_percentile_urban_pop_top.sample(frac=1, random_state=42), 2)

# same with the groups on the right hand side
right_percentile_urban_pop_top = right_percentile_urban[mask_pop_right]
right_percentile_urban_pop_down = right_percentile_urban[~mask_pop_right]
right_case_down_train, right_case_down_test = np.array_split(right_percentile_urban_pop_down.sample(frac=1, random_state=42), 2)
right_case_top_train, right_case_top_test = np.array_split(right_percentile_urban_pop_top.sample(frac=1, random_state=42), 2)


# train data left side
train_left = pd.concat([left_case_down_train,left_case_top_train])
print(len(train_left))

# train data right side
train_right = pd.concat([right_case_down_train,right_case_top_train])
print(len(train_right))

# test data left side
test_left = pd.concat([left_case_down_test,left_case_top_test])
print(len(test_left))

# test data right side
test_right = pd.concat([right_case_down_test,right_case_top_test])
print(len(test_right))

# NOTE(review): only the left side feeds the model; train_right / test_right
# are built and counted but not used afterwards -- confirm this is intended.
train_case_x, train_case_y = train_left[["Population Density", "urban_ratio"]], train_left[["New Cases"]]
test_case_x, test_case_y = test_left[["Population Density", "urban_ratio"]], test_left[["New Cases"]]

# Fit on the TRAIN half (the model used to be fitted on the test half and
# evaluated on the train half).
model = LinearRegression()
model.fit(train_case_x, train_case_y)

# in-sample R^2 plus the fitted parameters
r_sq = model.score(train_case_x, train_case_y)
print(f"coefficient of determination: {r_sq}")
print(f"intercept: {model.intercept_}")
print(f"slope: {model.coef_}") # one coefficient per feature: population density, urban ratio

# predictions for the held-out TEST half, printed next to the true values
y_pred = model.predict(test_case_x)
print(f"predicted response:\n{y_pred}")
print(test_case_y)

# Held-out metrics. The R^2 value is stored in `r2` so the imported r2_score
# function is not overwritten by its own result.
r2 = r2_score(test_case_y, y_pred)
mse = mean_squared_error(test_case_y, y_pred)
print (f"our accuracy score is {r2}")
print (f"our MSE is {mse}")
print(f"rmse is {np.sqrt(mse)}")

mae = mean_absolute_error(test_case_y, y_pred)
print(f"mae is {mae}")

In [None]:
# Linear regression of "New Deaths" on population density and urban ratio,
# fitted on one half of the locations and evaluated on the other half
# (result_df_new excludes the capital).
from sklearn.linear_model import LinearRegression
# mean_absolute_error is imported here too: the cell used it without importing it
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mean_pop_density = result_df_new["Population Density"].mean()
mean_urban_rate = result_df_new["urban_ratio"].mean()

# the split thresholds below are means, so they are printed as means
print(f"mean pop density: {mean_pop_density} and mean urban rate is : {mean_urban_rate}")

# split the locations at the mean urban ratio ("left" = below the mean)
mask_urban = result_df_new['urban_ratio'] < mean_urban_rate
left_percentile_urban =result_df_new[mask_urban]
right_percentile_urban = result_df_new[~mask_urban]

# split each side again at the mean population density ("top" = below the mean)
mask_pop_left = left_percentile_urban['Population Density'] < mean_pop_density
mask_pop_right = right_percentile_urban['Population Density'] < mean_pop_density
left_percentile_urban_pop_top = left_percentile_urban[mask_pop_left]
left_percentile_urban_pop_down = left_percentile_urban[~mask_pop_left]

# shuffle each group reproducibly and cut it into a train and a test half
left_case_down_train, left_case_down_test = np.array_split(left_percentile_urban_pop_down.sample(frac=1, random_state=42), 2)
left_case_top_train, left_case_top_test = np.array_split(left_percentile_urban_pop_top.sample(frac=1, random_state=42), 2)

# same with the groups on the right hand side
right_percentile_urban_pop_top = right_percentile_urban[mask_pop_right]
right_percentile_urban_pop_down = right_percentile_urban[~mask_pop_right]
right_case_down_train, right_case_down_test = np.array_split(right_percentile_urban_pop_down.sample(frac=1, random_state=42), 2)
right_case_top_train, right_case_top_test = np.array_split(right_percentile_urban_pop_top.sample(frac=1, random_state=42), 2)


# train data left side
train_left = pd.concat([left_case_down_train,left_case_top_train])
print(len(train_left))

# train data right side
train_right = pd.concat([right_case_down_train,right_case_top_train])
print(len(train_right))

# test data left side
test_left = pd.concat([left_case_down_test,left_case_top_test])
print(len(test_left))

# test data right side
test_right = pd.concat([right_case_down_test,right_case_top_test])
print(len(test_right))

# NOTE(review): only the left side feeds the model; train_right / test_right
# are built and counted but not used afterwards -- confirm this is intended.
train_case_x, train_case_y = train_left[["Population Density", "urban_ratio"]], train_left[["New Deaths"]]
test_case_x, test_case_y = test_left[["Population Density", "urban_ratio"]], test_left[["New Deaths"]]

# Fit on the TRAIN half (the model used to be fitted on the test half and
# evaluated on the train half).
model = LinearRegression()
model.fit(train_case_x, train_case_y)

# in-sample R^2 plus the fitted parameters
r_sq = model.score(train_case_x, train_case_y)
print(f"coefficient of determination: {r_sq}")
print(f"intercept: {model.intercept_}")
print(f"slope: {model.coef_}") # one coefficient per feature: population density, urban ratio

# predictions for the held-out TEST half, printed next to the true values
y_pred = model.predict(test_case_x)
print(f"predicted response:\n{y_pred}")
print(test_case_y)

# Held-out metrics. The R^2 value is stored in `r2` so the imported r2_score
# function is not overwritten by its own result.
r2 = r2_score(test_case_y, y_pred)
mse = mean_squared_error(test_case_y, y_pred, multioutput='uniform_average')

print (f"our accuracy score is {r2}")
print (f"our MSE is {mse}")
print(f"rmse is {np.sqrt(mse)}")

mae = mean_absolute_error(test_case_y, y_pred)
print(f"mae is {mae}")

In [None]:
# Linear regression of "New Deaths" on population density and urban ratio,
# fitted on one half of the locations and evaluated on the other half
# (result_df_new excludes the capital).
# NOTE(review): this cell performs the same "New Deaths" analysis as the cell
# right before it -- consider keeping only one of them.
from sklearn.linear_model import LinearRegression
# mean_absolute_error is imported here too: the cell used it without importing it
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mean_pop_density = result_df_new["Population Density"].mean()
mean_urban_rate = result_df_new["urban_ratio"].mean()

# the split thresholds below are means, so they are printed as means
print(f"mean pop density: {mean_pop_density} and mean urban rate is : {mean_urban_rate}")

# split the locations at the mean urban ratio ("left" = below the mean)
mask_urban = result_df_new['urban_ratio'] < mean_urban_rate
left_percentile_urban =result_df_new[mask_urban]
right_percentile_urban = result_df_new[~mask_urban]

# split each side again at the mean population density ("top" = below the mean)
mask_pop_left = left_percentile_urban['Population Density'] < mean_pop_density
mask_pop_right = right_percentile_urban['Population Density'] < mean_pop_density
left_percentile_urban_pop_top = left_percentile_urban[mask_pop_left]
left_percentile_urban_pop_down = left_percentile_urban[~mask_pop_left]

# shuffle each group reproducibly and cut it into a train and a test half
left_case_down_train, left_case_down_test = np.array_split(left_percentile_urban_pop_down.sample(frac=1, random_state=42), 2)
left_case_top_train, left_case_top_test = np.array_split(left_percentile_urban_pop_top.sample(frac=1, random_state=42), 2)

# same with the groups on the right hand side
right_percentile_urban_pop_top = right_percentile_urban[mask_pop_right]
right_percentile_urban_pop_down = right_percentile_urban[~mask_pop_right]
right_case_down_train, right_case_down_test = np.array_split(right_percentile_urban_pop_down.sample(frac=1, random_state=42), 2)
right_case_top_train, right_case_top_test = np.array_split(right_percentile_urban_pop_top.sample(frac=1, random_state=42), 2)


# train data left side
train_left = pd.concat([left_case_down_train,left_case_top_train])
print(len(train_left))

# train data right side
train_right = pd.concat([right_case_down_train,right_case_top_train])
print(len(train_right))

# test data left side
test_left = pd.concat([left_case_down_test,left_case_top_test])
print(len(test_left))

# test data right side
test_right = pd.concat([right_case_down_test,right_case_top_test])
print(len(test_right))

# NOTE(review): only the left side feeds the model; train_right / test_right
# are built and counted but not used afterwards -- confirm this is intended.
train_case_x, train_case_y = train_left[["Population Density", "urban_ratio"]], train_left[["New Deaths"]]
test_case_x, test_case_y = test_left[["Population Density", "urban_ratio"]], test_left[["New Deaths"]]

# Fit on the TRAIN half (the model used to be fitted on the test half and
# evaluated on the train half).
model = LinearRegression()
model.fit(train_case_x, train_case_y)

# in-sample R^2 plus the fitted parameters
r_sq = model.score(train_case_x, train_case_y)
print(f"coefficient of determination: {r_sq}")
print(f"intercept: {model.intercept_}")
print(f"slope: {model.coef_}") # one coefficient per feature: population density, urban ratio

# predictions for the held-out TEST half, printed next to the true values
y_pred = model.predict(test_case_x)
print(f"predicted response:\n{y_pred}")
print(test_case_y)

# Held-out metrics. The R^2 value is stored in `r2` so the imported r2_score
# function is not overwritten by its own result.
r2 = r2_score(test_case_y, y_pred)
mse = mean_squared_error(test_case_y, y_pred, multioutput='uniform_average')

print (f"our accuracy score is {r2}")
print (f"our MSE is {mse}")
print(f"rmse is {np.sqrt(mse)}")

mae = mean_absolute_error(test_case_y, y_pred)
print(f"mae is {mae}")

In [None]:
# Linear regression of the per-date "New Deaths" sum on the per-date
# "New Cases" sum, fitted on one random half of the dates and evaluated on
# the other half.
from sklearn.linear_model import LinearRegression
# mean_absolute_error is imported here too: the cell used it without importing it
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

n_by_date = df.groupby('Date')[["New Cases", "New Deaths"]].sum()

# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# the names now match what each part holds.
train_case_x, test_case_x, train_case_y, test_case_y = train_test_split(n_by_date[["New Cases"]], n_by_date[["New Deaths"]], test_size=0.5, random_state=42)

# train a linear regression model on the train half
model = LinearRegression()
model.fit(train_case_x, train_case_y)

# in-sample R^2 plus the fitted parameters
r_sq = model.score(train_case_x, train_case_y)
print("result for new cases and death everyday")
print(f"coefficient of determination: {r_sq}")
print(f"intercept: {model.intercept_}")
print(f"slope: {model.coef_}") # change in predicted new deaths per additional new case

# predictions for the held-out test half; the true values are printed
y_pred = model.predict(test_case_x)
print(test_case_y)

# Held-out metrics. The R^2 value is stored in `r2` so the imported r2_score
# function is not overwritten by its own result.
r2 = r2_score(test_case_y, y_pred)
mse = mean_squared_error(test_case_y, y_pred, multioutput='uniform_average')

print (f"our accuracy score is {r2}")
print (f"our MSE is {mse}")
print(f"rmse is {np.sqrt(mse)}")

mae = mean_absolute_error(test_case_y, y_pred)
print(f"mae is {mae}")