In [63]:
import glob
import io
import itertools
import os

import boto3
import numpy as np
import pandas as pd
from dotenv import load_dotenv


In [64]:
# Load environment variables via load_dotenv(), then read the S3 settings
# used by the rest of the notebook. Each value is None when the variable
# is not set.
load_dotenv()

AWS_S3_BUCKET = os.environ.get("AWS_S3_BUCKET")
AWS_ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")


In [65]:
# Local directory under which downloaded S3 objects are cached.
ROOT_DIR = "../data/"


def load_file_s3(object_key: str, **read_csv_kwargs) -> pd.DataFrame:
    """Return the CSV stored under ``object_key`` as a DataFrame, caching it locally.

    The file is first looked up at ``ROOT_DIR + object_key``. When it is
    already there it is read from disk and S3 is not contacted, so no AWS
    settings are needed for cached files. Otherwise the object is fetched
    from the ``AWS_S3_BUCKET`` bucket, written to that local path (parent
    directories are created as needed) and then read.

    Args:
        object_key: Key of the object in the bucket; also the path of the
            local copy relative to ``ROOT_DIR``.
        **read_csv_kwargs: Optional keyword arguments forwarded unchanged to
            ``pandas.read_csv`` (e.g. ``dtype={"code_commune_insee": str}``
            to keep leading zeros in code columns).

    Returns:
        The parsed contents of the file.

    Raises:
        ValueError: If a download is required but the bucket name or the AWS
            credentials are missing from the environment, or if the
            ``get_object`` response status is not 200.
    """
    local_path = f"{ROOT_DIR}{object_key}"

    # Serve from the local cache first: no credentials or client required.
    if os.path.exists(local_path):
        print(f"File {local_path} already exists locally. Loading from local file.")
        return pd.read_csv(local_path, **read_csv_kwargs)

    if not AWS_S3_BUCKET or not AWS_ACCESS_KEY_ID or not AWS_SECRET_ACCESS_KEY:
        raise ValueError(
            "AWS credentials or bucket name not set in environment variables."
        )

    s3_client = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )

    print(f"Downloading {local_path} from S3 bucket {AWS_S3_BUCKET}.")
    response = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=object_key)
    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
    if status != 200:
        raise ValueError(f"Unsuccessful S3 get_object response. Status - {status}")
    print(f"Successful S3 get_object response. Status - {status}")

    # Create the target directory if needed; dirname is empty when the path
    # has no directory component, and makedirs("") would raise.
    parent_dir = os.path.dirname(local_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    print(f"Saving {local_path} locally.")
    # Write to a temporary sibling and rename afterwards, so an interrupted
    # download never leaves a truncated file that the cache check would trust.
    partial_path = f"{local_path}.part"
    body = response["Body"]
    try:
        with open(partial_path, "wb") as f:
            f.write(body.read())
    finally:
        body.close()
    os.replace(partial_path, local_path)
    print(f"File {local_path} downloaded and saved locally.")

    return pd.read_csv(local_path, **read_csv_kwargs)


In [66]:
# Fetch the merged commune-level risk scores (from the local cache when present).
climate_danger_key = "processed/all risk score merged/data_climate_danger_final.csv"
df = load_file_s3(object_key=climate_danger_key)


Downloading ../data/processed/all risk score merged/data_climate_danger_final.csv from S3 bucket oasis-prd-001.
Successful S3 get_object response. Status - 200
Saving ../data/processed/all risk score merged/data_climate_danger_final.csv locally.
File ../data/processed/all risk score merged/data_climate_danger_final.csv downloaded and saved locally.


In [67]:
# Preview the first rows of the freshly loaded frame.
df.head(n=5)


Unnamed: 0,code_commune_INSEE,nom_departement,code_region,department,year,drought_score,heat_score,rainfall_score,extrem_events_score,fire_score,ATMO_risk,avg_risk_score
0,1001,Ain,84,1,2014,1.25,1.25,1.775,0.95,0.0,0.0,1.045
1,1001,Ain,84,1,2015,1.756098,1.756098,2.0,0.829268,0.0,0.0,1.268293
2,1001,Ain,84,1,2016,1.390244,1.390244,1.658537,0.878049,0.0,0.0,1.063415
3,1001,Ain,84,1,2017,1.414634,1.414634,1.536585,0.829268,0.0,0.0,1.039024
4,1001,Ain,84,1,2018,1.55,1.55,1.525,1.025,0.0,0.0,1.13


In [68]:
# Normalise the commune-level frame: lower-case / French column names,
# fixed-width string codes, and no redundant departement/region columns.
# Written as one non-mutating chain so the cell can be re-run safely.
df = (
    df.rename(
        columns={
            "code_commune_INSEE": "code_commune_insee",
            "year": "annee",
            "department": "code_departement",
        }
    )
    # errors="ignore": on a re-run these columns are already gone, and the
    # previous in-place drop raised KeyError in that case.
    .drop(columns=["nom_departement", "code_region"], errors="ignore")
    .assign(
        # Left-pad the codes with zeros to 5 (commune) and 2 (departement) characters.
        code_commune_insee=lambda frame: frame["code_commune_insee"]
        .astype(str)
        .str.zfill(5),
        code_departement=lambda frame: frame["code_departement"]
        .astype(str)
        .str.zfill(2),
    )
)
df.head()


Unnamed: 0,code_commune_insee,code_departement,annee,drought_score,heat_score,rainfall_score,extrem_events_score,fire_score,ATMO_risk,avg_risk_score
0,1001,1,2014,1.25,1.25,1.775,0.95,0.0,0.0,1.045
1,1001,1,2015,1.756098,1.756098,2.0,0.829268,0.0,0.0,1.268293
2,1001,1,2016,1.390244,1.390244,1.658537,0.878049,0.0,0.0,1.063415
3,1001,1,2017,1.414634,1.414634,1.536585,0.829268,0.0,0.0,1.039024
4,1001,1,2018,1.55,1.55,1.525,1.025,0.0,0.0,1.13


In [69]:
# S3 key of the commune-level risk scores (gzip-compressed CSV).
RISK_SCORES_COMMUNES_KEY = "processed/risk-scores/risk-scores-final.csv.gz"

# Module-level client: also used by the later upload cell.
s3_client = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
)


def upload_gzipped_csv(frame: pd.DataFrame, bucket: str, object_key: str, client) -> dict:
    """Upload ``frame`` to S3 as a gzip-compressed CSV without touching the disk.

    Args:
        frame: DataFrame to serialise; the index is not written.
        bucket: Name of the destination bucket.
        object_key: Key under which the object is stored.
        client: boto3 S3 client used for the ``put_object`` call.

    Returns:
        The response returned by ``client.put_object``.
    """
    csv_buffer = io.BytesIO()
    # to_csv can write gzip-compressed bytes directly into the in-memory buffer.
    frame.to_csv(csv_buffer, index=False, compression="gzip")
    # Rewind so the upload reads the buffer from its first byte.
    csv_buffer.seek(0)
    return client.put_object(
        Bucket=bucket,
        Key=object_key,
        Body=csv_buffer,
        ContentType="application/gzip",
    )


upload_gzipped_csv(df, AWS_S3_BUCKET, RISK_SCORES_COMMUNES_KEY, s3_client)


{'ResponseMetadata': {'RequestId': '73HZ0JDWYM1MXG3E',
  'HostId': 'FcqjBi/HmsGizIA5hJYv6bKMNNwGpg8O7fVic+YrS1Vhe8J9+cOs8ajpV2Zy9nTKw5pEvFnR/Ko=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'FcqjBi/HmsGizIA5hJYv6bKMNNwGpg8O7fVic+YrS1Vhe8J9+cOs8ajpV2Zy9nTKw5pEvFnR/Ko=',
   'x-amz-request-id': '73HZ0JDWYM1MXG3E',
   'date': 'Wed, 30 Jul 2025 14:21:55 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"aa5a4e02573e91ed7a2624657e8d94f0"',
   'x-amz-checksum-crc32': '2sJ0RA==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 1},
 'ETag': '"aa5a4e02573e91ed7a2624657e8d94f0"',
 'ChecksumCRC32': '2sJ0RA==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256'}

In [70]:
# List the local cache directory of the merged risk-score files (sizes, timestamps).
!ls -lha ../data/processed/all\ risk\ score\ merged/


total 109M
drwxr-xr-x 2 nick nick 4.0K Jul 30 16:21 .
drwxr-xr-x 4 nick nick 4.0K Jul 30 16:21 ..
-rw-r--r-- 1 nick nick  70M Jul 28 16:51 data_climate_danger.csv
-rw-r--r-- 1 nick nick  40M Jul 30 16:21 data_climate_danger_final.csv


In [71]:
# Read back the gzipped commune-level file that was just uploaded.
# Note: the recorded output shows the code columns without their leading
# zeros after this CSV round-trip; they are padded again further down.
risk_scores_key = "processed/risk-scores/risk-scores-final.csv.gz"
df = load_file_s3(object_key=risk_scores_key)


Downloading ../data/processed/risk-scores/risk-scores-final.csv.gz from S3 bucket oasis-prd-001.
Successful S3 get_object response. Status - 200
Saving ../data/processed/risk-scores/risk-scores-final.csv.gz locally.
File ../data/processed/risk-scores/risk-scores-final.csv.gz downloaded and saved locally.


In [72]:
# Show the frame reloaded from the gzipped copy.
df


Unnamed: 0,code_commune_insee,code_departement,annee,drought_score,heat_score,rainfall_score,extrem_events_score,fire_score,ATMO_risk,avg_risk_score
0,1001,1,2014,1.250000,1.250000,1.775000,0.950000,0.0,0.0,1.045000
1,1001,1,2015,1.756098,1.756098,2.000000,0.829268,0.0,0.0,1.268293
2,1001,1,2016,1.390244,1.390244,1.658537,0.878049,0.0,0.0,1.063415
3,1001,1,2017,1.414634,1.414634,1.536585,0.829268,0.0,0.0,1.039024
4,1001,1,2018,1.550000,1.550000,1.525000,1.025000,0.0,0.0,1.130000
...,...,...,...,...,...,...,...,...,...,...
389769,95690,95,2020,2.000000,2.000000,1.000000,1.000000,0.0,0.0,1.200000
389770,95690,95,2021,1.000000,1.000000,2.000000,1.000000,0.0,0.0,1.000000
389771,95690,95,2022,2.000000,2.000000,1.000000,1.000000,0.0,0.0,1.200000
389772,95690,95,2023,1.000000,1.000000,1.000000,1.000000,0.0,0.0,0.800000


In [73]:
# List the local cache directory of the downloaded risk-score file.
!ls -lha ../data/processed/risk-scores/


total 1.3M
drwxr-xr-x 2 nick nick 4.0K Jul 30 16:21 .
drwxr-xr-x 5 nick nick 4.0K Jul 30 16:21 ..
-rw-r--r-- 1 nick nick 1.3M Jul 30 16:21 risk-scores-final.csv.gz


In [74]:
# Restore fixed-width string codes: zero-pad commune codes to 5 characters
# and departement codes to 2.
padding_widths = {"code_commune_insee": 5, "code_departement": 2}
for column_name, width in padding_widths.items():
    df[column_name] = df[column_name].astype(str).str.zfill(width)
df


Unnamed: 0,code_commune_insee,code_departement,annee,drought_score,heat_score,rainfall_score,extrem_events_score,fire_score,ATMO_risk,avg_risk_score
0,01001,01,2014,1.250000,1.250000,1.775000,0.950000,0.0,0.0,1.045000
1,01001,01,2015,1.756098,1.756098,2.000000,0.829268,0.0,0.0,1.268293
2,01001,01,2016,1.390244,1.390244,1.658537,0.878049,0.0,0.0,1.063415
3,01001,01,2017,1.414634,1.414634,1.536585,0.829268,0.0,0.0,1.039024
4,01001,01,2018,1.550000,1.550000,1.525000,1.025000,0.0,0.0,1.130000
...,...,...,...,...,...,...,...,...,...,...
389769,95690,95,2020,2.000000,2.000000,1.000000,1.000000,0.0,0.0,1.200000
389770,95690,95,2021,1.000000,1.000000,2.000000,1.000000,0.0,0.0,1.000000
389771,95690,95,2022,2.000000,2.000000,1.000000,1.000000,0.0,0.0,1.200000
389772,95690,95,2023,1.000000,1.000000,1.000000,1.000000,0.0,0.0,0.800000


In [75]:
# Rows of departement "06" with a strictly positive fire score.
# The recorded run of this cell returned no rows.
df.loc[df["code_departement"].eq("06") & df["fire_score"].gt(0)]


Unnamed: 0,code_commune_insee,code_departement,annee,drought_score,heat_score,rainfall_score,extrem_events_score,fire_score,ATMO_risk,avg_risk_score


In [76]:
# All rows, any departement, with a strictly positive fire score.
df.loc[df["fire_score"].gt(0)]


Unnamed: 0,code_commune_insee,code_departement,annee,drought_score,heat_score,rainfall_score,extrem_events_score,fire_score,ATMO_risk,avg_risk_score
39844,11001,11,2016,1.595238,1.595238,1.357143,0.928571,1.0,0.0,1.295238
39894,11005,11,2022,2.060606,2.060606,1.393939,0.878788,1.0,0.0,1.478788
39901,11006,11,2018,1.717949,1.717949,1.794872,1.051282,1.0,0.0,1.456410
39913,11007,11,2019,1.743590,1.743590,1.307692,0.974359,3.0,0.0,1.753846
39923,11008,11,2018,1.717949,1.717949,1.794872,1.051282,1.0,0.0,1.456410
...,...,...,...,...,...,...,...,...,...,...
385614,91461,91,2023,1.500000,1.500000,1.000000,0.833333,1.0,0.0,1.166667
385647,91469,91,2023,1.500000,1.500000,1.000000,0.833333,1.0,0.0,1.166667
386021,91581,91,2023,1.500000,1.500000,1.000000,0.833333,1.0,3.2,1.166667
386032,91587,91,2023,1.500000,1.500000,1.000000,0.833333,1.0,3.2,1.166667


In [77]:
# Aggregate to departement level: unweighted mean of every score column over
# the rows sharing the same (code_departement, annee) pair.
score_columns = [
    "drought_score",
    "heat_score",
    "rainfall_score",
    "extrem_events_score",
    "fire_score",
    "ATMO_risk",
    "avg_risk_score",
]
grouped_by_departement_year = df.groupby(["code_departement", "annee"])
dataset_departements = grouped_by_departement_year[score_columns].mean().reset_index()
dataset_departements


Unnamed: 0,code_departement,annee,drought_score,heat_score,rainfall_score,extrem_events_score,fire_score,ATMO_risk,avg_risk_score
0,01,2014,1.250000,1.250000,1.775000,0.950000,0.000000,0.000000,1.045000
1,01,2015,1.756098,1.756098,2.000000,0.829268,0.000000,0.000000,1.268293
2,01,2016,1.390244,1.390244,1.658537,0.878049,0.000000,0.000000,1.063415
3,01,2017,1.414634,1.414634,1.536585,0.829268,0.000000,0.000000,1.039024
4,01,2018,1.550000,1.550000,1.525000,1.025000,0.000000,0.000000,1.130000
...,...,...,...,...,...,...,...,...,...
1029,95,2020,1.777778,1.777778,1.222222,0.777778,0.000000,0.000000,1.111111
1030,95,2021,1.000000,1.000000,1.285714,0.857143,0.000000,0.882162,0.828571
1031,95,2022,2.000000,2.000000,1.000000,0.833333,0.005405,0.882162,1.167748
1032,95,2023,1.166667,1.166667,1.000000,0.833333,0.000000,0.882162,0.833333


In [78]:
# Destination key of the departement-level risk scores.
# NOTE(review): the variable name mentions housing prices although it holds a
# risk-score key; it looks copy-pasted. Rename once it is confirmed that no
# later cell relies on it.
dataset_housing_departement_prices_full_key = (
    "processed/risk-scores/risk-scores-departements-final.csv.gz"
)

# Serialise the aggregated frame as a gzip-compressed CSV held in memory.
csv_buffer = io.BytesIO()
dataset_departements.to_csv(csv_buffer, index=False, compression="gzip")
# Rewind the buffer so the upload reads it from the first byte.
csv_buffer.seek(0)

# Send the buffer itself as the object body, declared as gzip data.
put_object_args = {
    "Bucket": AWS_S3_BUCKET,
    "Key": dataset_housing_departement_prices_full_key,
    "Body": csv_buffer,
    "ContentType": "application/gzip",
}
s3_client.put_object(**put_object_args)


{'ResponseMetadata': {'RequestId': 'SMF2Z10Q5XYW95JE',
  'HostId': 'mZncNTEOhcK/PNcPEEJJztuF5O106jAs3seA3BY0c4tH7n0CnGx9OJginN9fFyIX1tPQQ8P4wYw=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'mZncNTEOhcK/PNcPEEJJztuF5O106jAs3seA3BY0c4tH7n0CnGx9OJginN9fFyIX1tPQQ8P4wYw=',
   'x-amz-request-id': 'SMF2Z10Q5XYW95JE',
   'date': 'Wed, 30 Jul 2025 14:21:58 GMT',
   'x-amz-server-side-encryption': 'AES256',
   'etag': '"5f68ddddaa88ba04b74e48fb8a4e4e61"',
   'x-amz-checksum-crc32': 'AhAXHA==',
   'x-amz-checksum-type': 'FULL_OBJECT',
   'content-length': '0',
   'server': 'AmazonS3'},
  'RetryAttempts': 0},
 'ETag': '"5f68ddddaa88ba04b74e48fb8a4e4e61"',
 'ChecksumCRC32': 'AhAXHA==',
 'ChecksumType': 'FULL_OBJECT',
 'ServerSideEncryption': 'AES256'}