In [None]:
import boto3

In [None]:
# Shared boto3 S3 resource handle; later cells reuse `s3`.
s3 = boto3.resource("s3")

# Print the name of every bucket returned by the resource.
all_buckets = s3.buckets.all()
for bucket in all_buckets:
    print(bucket.name)

In [None]:
from dotenv import load_dotenv
import os

# Load the variables declared in the local .env file into the environment.
load_dotenv()

# Bucket name and CSV object name are both read from the environment.
BUCKET_NAME = os.getenv("BUCKET_NAME")
FILE_NAME = os.getenv("CSV_NAME")
print(BUCKET_NAME)
print(FILE_NAME)

# Handle on the data bucket, built from the `s3` resource created earlier.
bucket = s3.Bucket(BUCKET_NAME)

# Print the key of every object stored in this bucket.
bucket_objects = bucket.objects.all()
for obj in bucket_objects:
    print(obj.key)


In [None]:
# read csv from s3
import csv
from urllib.parse import urlparse
from io import BytesIO
import pandas as pd

def from_s3(s3_uri: str, client=None) -> pd.DataFrame:
    """Download a CSV object from S3 and parse it into a DataFrame.

    Args:
        s3_uri: Object location in the form ``s3://<bucket>/<key>``.
        client: Optional object exposing ``get_object(Bucket=..., Key=...)``
            that returns a mapping whose ``"Body"`` entry has a ``read()``
            method. When omitted, ``boto3.client("s3")`` is created.

    Returns:
        The DataFrame produced by ``pd.read_csv`` on the downloaded bytes.

    Raises:
        ValueError: If ``s3_uri`` does not have the ``s3://<bucket>/<key>``
            shape.
    """
    # Split by hand instead of urlparse(): urlparse moves everything after
    # "?" or "#" out of the path, which would truncate keys containing them.
    scheme, separator, remainder = s3_uri.partition("://")
    bucket_name, _, key = remainder.partition("/")
    if not separator or scheme.lower() != "s3" or not bucket_name or not key:
        raise ValueError(
            f"expected an URI of the form s3://<bucket>/<key>, got {s3_uri!r}"
        )

    if client is None:
        client = boto3.client("s3")

    obj = client.get_object(Bucket=bucket_name, Key=key)
    # Read the whole body into memory so pandas gets a seekable buffer.
    csv_in_bytes = BytesIO(obj["Body"].read())
    return pd.read_csv(csv_in_bytes)


# read data from s3 bucket
# Fail early with a clear message when the environment values are missing;
# otherwise the f-string below would embed the text "None" into the URI.
if not BUCKET_NAME or not FILE_NAME:
    raise ValueError(
        "BUCKET_NAME and CSV_NAME must be set in the environment (.env)"
    )
data_location = f"s3://{BUCKET_NAME}/{FILE_NAME}"
df = from_s3(data_location)



In [None]:
# initial data exploration to test if s3 is working:
# display the first 100 rows of the frame loaded from the bucket
df.head(100)


In [None]:
# column summary of the frame; show_counts=True requests the non-null counts
df.info(show_counts=True)

In [None]:
# Raise the display limit to the frame's column count so wide output is not
# truncated, then show the shape and the summary statistics.
pd.set_option("display.max_columns", df.shape[1])
print(df.shape)
df.describe()

In [None]:
# Columns removed before cleaning. Each label is listed exactly once
# ("Total Regencies" used to appear twice).
cols_to_drop = [
    "City or Regency",
    "Time Zone",
    "Country",
    "Continent",
    "Province",
    "Location ISO Code",
    "Total Regencies",
    "Island",
    "Special Status",
    "Longitude",
    "Latitude",
    "Location Level",
    "Area (km2)",
]
df = df.drop(columns=cols_to_drop)
print(df.shape)

In [None]:
# Drop the rows (not columns) whose Location is "Indonesia".
# NOTE(review): presumably these are country-level aggregate rows - confirm.
indo_rows_to_drop = df.loc[df["Location"] == "Indonesia"]
index_to_delete = indo_rows_to_drop.index
# Reassign instead of mutating in place so the cell has no hidden side effect
# on any other reference to the frame.
df = df.drop(index=index_to_delete)
df.head(20)

In [None]:
# Set "Total Rural Villages" to 0 on every row whose Location is "DKI Jakarta".
is_jakarta = df["Location"] == "DKI Jakarta"
df.loc[is_jakarta, "Total Rural Villages"] = 0

df.head(20)


In [None]:
# Turn the percentage strings into fractions: strip the trailing "%",
# cast to float and divide by 100.
col_with_percent = ["Case Fatality Rate", "Case Recovered Rate"]
print(df[col_with_percent].head())
rates_as_fraction = pd.DataFrame(
    {
        rate_col: df[rate_col].str.rstrip('%').astype(float) / 100.0
        for rate_col in col_with_percent
    }
)
df[col_with_percent] = rates_as_fraction
df.head()


In [None]:
# clean na data: start by counting the missing values in each column
df.isna().sum()


In [None]:
# Replace missing values in the two count columns with 0.
for count_col in ("Total Urban Villages", "Total Cities"):
    df[count_col] = df[count_col].fillna(0)

# Recount the missing values per column to compare with the previous cell.
df.isna().sum()


In [None]:
# Interpolate missing values linearly (forward direction only).
# Only numeric columns are passed to interpolate(): newer pandas versions
# warn or raise when it is given object-dtype (string) columns.
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df[numeric_cols].interpolate(
    method="linear", limit_direction="forward"
)

growth_cols = ["Growth Factor of New Cases", "Growth Factor of New Deaths"]
if not df.empty:
    # Set the growth factors of the first row to 0. The row is addressed by
    # its actual label: rows were dropped earlier, so label 0 may no longer
    # exist and `df.loc[0, ...] = 0` would append a new, mostly-NaN row.
    df.loc[df.index[0], growth_cols] = 0

# Round the growth factors to 3 decimals.
df[growth_cols] = df[growth_cols].round(3)
df.head(10)

In [None]:
# Parse the "Date" column into pandas datetime values.
df = df.assign(Date=pd.to_datetime(df["Date"]))
df.head()

In [None]:
# Step 1 of the upload: write the cleaned frame to a CSV file in the current
# working directory. The base file name comes from CSV_CLEANED_NAME and
# falls back to "result".
result_file_name: str = os.getenv("CSV_CLEANED_NAME", "result")
path = f"{os.getcwd()}/{result_file_name}.csv"
print(path)
df.to_csv(path)

# upload csv to s3 bucket
from typing import Optional


def to_s3(s3_uri: str, object_name: Optional[str] = None, client=None) -> None:
    """Upload a local file to the bucket named in ``s3_uri``.

    The part of the URI after the bucket name is used as the local file path
    handed to ``upload_file`` and, unless ``object_name`` is given, also as
    the key of the uploaded object.

    Args:
        s3_uri: Target in the form ``s3://<bucket>/<file name>``.
        object_name: Key to store the file under; defaults to the file name
            taken from ``s3_uri``.
        client: Optional object exposing
            ``upload_file(filename, Bucket=..., Key=...)``. When omitted,
            ``boto3.client("s3")`` is created.

    Raises:
        ValueError: If ``s3_uri`` does not have the
            ``s3://<bucket>/<file name>`` shape.
    """
    # Split by hand instead of urlparse(): urlparse moves everything after
    # "?" or "#" out of the path, which would truncate such file names.
    scheme, separator, remainder = s3_uri.partition("://")
    bucket_name, _, file_name = remainder.partition("/")
    if not separator or scheme.lower() != "s3" or not bucket_name or not file_name:
        raise ValueError(
            f"expected an URI of the form s3://<bucket>/<file name>, got {s3_uri!r}"
        )

    if object_name is None:
        object_name = file_name
    if client is None:
        client = boto3.client("s3")

    client.upload_file(file_name, Bucket=bucket_name, Key=object_name)
    # The result of upload_file was only ever printed, so report what was
    # uploaded instead.
    print(f"uploaded {file_name} to s3://{bucket_name}/{object_name}")


# upload the locally saved cleaned csv to the s3 bucket
upload_location = f"s3://{BUCKET_NAME}/{result_file_name}.csv"
to_s3 (upload_location)


In [None]:
# initial graphing of data
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
# Sum the two cumulative-count columns per date and preview the result.
totals_of_interest = ["Total Cases", "Total Deaths"]
n_by_date = df.groupby("Date")[totals_of_interest].agg("sum")
n_by_date.head(10)

