In [70]:
import pandas as pd
import awswrangler as wr
import boto3

# Set AWS session

In [71]:
# Configure boto3's default session to use this named AWS profile.
# NOTE(review): presumably the profile must already exist in the local AWS
# config/credentials files — confirm before running on a new machine.
boto3.setup_default_session(
    profile_name="Fellow-permissions-S3-Sagemaker-586794458956",
)

# Define AWS file paths

In [72]:
# S3 bucket that all paths in this notebook are built from.
bucket = "dssgx-munich-2024-bavarian-forest"

# Folder names inside the bucket: raw inputs are read from the first,
# preprocessed outputs are written to the second.
raw_data_folder, preprocessed_data_folder = "raw-data", "preprocessed_data"

# Load data from AWS S3

In [73]:
def load_csv_files_from_aws_s3(path: str, **kwargs) -> pd.DataFrame:
    """Read CSV data stored on AWS S3 into a DataFrame.

    This is a thin wrapper that delegates to ``wr.s3.read_csv``; both
    ``path`` and any extra keyword arguments are forwarded unchanged.

    Args:
        path (str): S3 location of the CSV file(s) to read.
        **kwargs: Extra keyword arguments handed to ``wr.s3.read_csv``.

    Returns:
        pd.DataFrame: Whatever ``wr.s3.read_csv`` returns for the given
            arguments (a DataFrame holding the CSV contents).
    """
    return wr.s3.read_csv(path=path, **kwargs)

# S3 prefix holding the raw hourly visitor-count CSV exports.
historic_visitor_counts_path = (
    f"s3://{bucket}/{raw_data_folder}/hourly-historic-visitor-counts-all-sensors/"
)

# `skiprows=2` is forwarded unchanged to the underlying CSV reader.
historic_visitor_counts = load_csv_files_from_aws_s3(
    path=historic_visitor_counts_path, skiprows=2
)

# Preview the first rows as the cell output.
historic_visitor_counts.head()

Unnamed: 0,Time,Bayerisch Eisenstein IN,Bayerisch Eisenstein OUT,Bayerisch Eisenstein Fußgänger IN,Bayerisch Eisenstein Fußgänger OUT,Bayerisch Eisenstein Fahrräder IN,Bayerisch Eisenstein Fahrräder OUT,Brechhäuslau IN,Brechhäuslau OUT,Brechhäuslau Fußgänger IN,...,Trinkwassertalsperre_MULTI Fahrräder OUT,Waldhausreibe IN,Waldhausreibe OUT,Waldhausreibe Channel 1 IN,Waldhausreibe Channel 2 OUT,Waldspielgelände_1 IN,Waldspielgelände_1 OUT,Wistlberg Fußgänger IN,Wistlberg Fußgänger OUT,Unnamed: 96
0,1. Jan. 2018 00:00,,,,,,,0.0,0.0,0.0,...,,407.0,205.0,,,,,,,
1,1. Jan. 2018 01:00,,,,,,,0.0,0.0,0.0,...,,0.0,0.0,,,,,,,
2,1. Jan. 2018 02:00,,,,,,,1.0,0.0,1.0,...,,0.0,0.0,,,,,,,
3,1. Jan. 2018 03:00,,,,,,,0.0,0.0,0.0,...,,0.0,0.0,,,,,,,
4,1. Jan. 2018 04:00,,,,,,,0.0,0.0,0.0,...,,0.0,0.0,,,,,,,


# Write data to AWS S3

In [74]:
def write_csv_file_to_aws_s3(df: pd.DataFrame, path: str, **kwargs) -> None:
    """Writes an individual CSV file to AWS S3.

    Thin wrapper around ``wr.s3.to_csv``: the DataFrame is passed
    positionally and ``path`` plus any extra keyword arguments are
    forwarded unchanged.

    Args:
        df (pd.DataFrame): The DataFrame to write.
        path (str): The destination path of the CSV file on AWS S3.
        **kwargs: Additional arguments to pass to the to_csv function.

    Returns:
        None: The function is called for its side effect only; the value
            returned by ``wr.s3.to_csv`` is discarded.
    """
    wr.s3.to_csv(df, path=path, **kwargs)

# Destination object inside the preprocessed-data folder of the bucket.
all_historic_visitor_counts_path = (
    f"s3://{bucket}/{preprocessed_data_folder}/all_historic_visitor_counts.csv"
)

write_csv_file_to_aws_s3(
    df=historic_visitor_counts,
    path=all_historic_visitor_counts_path,
)