In [25]:
import pandas as pd
import os

import boto3
import pandas as pd
from dotenv import load_dotenv


In [26]:
# Populate the process environment (python-dotenv) before reading settings.
load_dotenv()

# Bucket name and credentials used by the S3 helpers; each is None when the
# corresponding environment variable is not set.
AWS_S3_BUCKET, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY = (
    os.environ.get(name)
    for name in ("AWS_S3_BUCKET", "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY")
)


In [27]:
def load_file_s3(object_key: str) -> pd.DataFrame:
    """Download a CSV object from the configured S3 bucket as a DataFrame.

    Every column is read with ``dtype=str``, so pandas keeps the values as
    text instead of inferring numeric types.

    Args:
        object_key: Key of the CSV object inside ``AWS_S3_BUCKET``.

    Returns:
        The parsed CSV content.

    Raises:
        ValueError: If the bucket name or either credential is missing from
            the environment, or if the response status code is not 200.
    """
    if not AWS_S3_BUCKET or not AWS_ACCESS_KEY_ID or not AWS_SECRET_ACCESS_KEY:
        raise ValueError(
            "AWS credentials or bucket name not set in environment variables."
        )

    s3_client = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )

    response = s3_client.get_object(Bucket=AWS_S3_BUCKET, Key=object_key)

    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
    body = response.get("Body")

    try:
        if status != 200:
            raise ValueError(
                f"Unsuccessful S3 get_object response. Status - {status}"
            )
        print(f"Successful S3 get_object response. Status - {status}")
        return pd.read_csv(body, dtype=str)
    finally:
        # The body is a streaming handle on the HTTP response; close it on
        # every path so the connection is released instead of lingering
        # until garbage collection.
        if body is not None:
            body.close()

def save_file_s3(df: pd.DataFrame, object_key: str, index: bool = True) -> None:
    """Upload a DataFrame to the configured S3 bucket as a CSV object.

    Args:
        df: Frame to serialise with ``DataFrame.to_csv``.
        object_key: Destination key inside ``AWS_S3_BUCKET``.
        index: Whether to write the row index as the first CSV column.
            Defaults to ``True`` so existing callers produce the same file;
            pass ``False`` to omit the index from the output.

    Raises:
        ValueError: If the bucket name or either credential is missing from
            the environment.
    """
    if not AWS_S3_BUCKET or not AWS_ACCESS_KEY_ID or not AWS_SECRET_ACCESS_KEY:
        raise ValueError(
            "AWS credentials or bucket name not set in environment variables."
        )

    s3_client = boto3.client(
        "s3",
        aws_access_key_id=AWS_ACCESS_KEY_ID,
        aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    )

    # Encode explicitly so the uploaded bytes are UTF-8 by construction rather
    # than relying on how the SDK converts a text body.
    csv_bytes = df.to_csv(index=index).encode("utf-8")
    s3_client.put_object(Bucket=AWS_S3_BUCKET, Key=object_key, Body=csv_bytes)
    print(f"File saved to S3: {object_key}")


In [28]:
# Fetch the raw communes / departement / region CSV from the S3 bucket.
df = load_file_s3(object_key="raw/insee/20230823-communes-departement-region.csv")


Successful S3 get_object response. Status - 200


In [29]:
# Preview the first five rows of the loaded table.
df.head(n=5)


Unnamed: 0,code_commune_INSEE,nom_commune_postal,code_postal,libelle_acheminement,ligne_5,latitude,longitude,code_commune,article,nom_commune,nom_commune_complet,code_departement,nom_departement,code_region,nom_region
0,1001,L ABERGEMENT CLEMENCIAT,1400,L ABERGEMENT CLEMENCIAT,,46.1534255214,4.92611354223,1,L',Abergement-Clémenciat,L'Abergement-Clémenciat,1,Ain,84,Auvergne-Rhône-Alpes
1,1002,L ABERGEMENT DE VAREY,1640,L ABERGEMENT DE VAREY,,46.0091878776,5.42801696363,2,L',Abergement-de-Varey,L'Abergement-de-Varey,1,Ain,84,Auvergne-Rhône-Alpes
2,1004,AMBERIEU EN BUGEY,1500,AMBERIEU EN BUGEY,,45.9608475114,5.3729257777,4,,Ambérieu-en-Bugey,Ambérieu-en-Bugey,1,Ain,84,Auvergne-Rhône-Alpes
3,1005,AMBERIEUX EN DOMBES,1330,AMBERIEUX EN DOMBES,,45.9961799872,4.91227250796,5,,Ambérieux-en-Dombes,Ambérieux-en-Dombes,1,Ain,84,Auvergne-Rhône-Alpes
4,1006,AMBLEON,1300,AMBLEON,,45.7494989044,5.59432017366,6,,Ambléon,Ambléon,1,Ain,84,Auvergne-Rhône-Alpes


In [30]:
# Rows whose code_region is one of these values are removed.
# NOTE(review): presumably the overseas-region codes — confirm with the data owner.
code_to_delete = ['1', '2', '3', '4', '6']

df = (
    df.assign(
        # Left-pad both codes with zeros to a fixed width of five characters.
        code_commune_INSEE=lambda frame: frame["code_commune_INSEE"].str.zfill(5),
        code_postal=lambda frame: frame["code_postal"].str.zfill(5),
    )
    .drop(
        columns=[
            "nom_commune_postal",
            "libelle_acheminement",
            "ligne_5",
            "code_commune",
            "article",
            "nom_commune",
            "code_departement",
        ]
    )
    .loc[lambda frame: ~frame["code_region"].isin(code_to_delete)]
    # Plain string comparison on the zero-padded code (lexicographic order).
    # NOTE(review): '95690' looks like the highest code meant to be kept — confirm.
    .loc[lambda frame: frame["code_commune_INSEE"] <= "95690"]
)

# Persist the reference table, then show the last 20 rows as a sanity check.
save_file_s3(df, "processed/referentiel/ref_espace_communes.csv")
print("File saved : ref_espace_communes.csv")
print(df.tail(20)[["code_commune_INSEE", "code_postal", "code_region"]])


File saved to S3: processed/referentiel/ref_espace_communes.csv
File saved : ref_espace_communes.csv
      code_commune_INSEE code_postal code_region
38705              95610       95450          11
38706              95611       95810          11
38707              95612       95500          11
38708              95625       95450          11
38709              95627       95810          11
38710              95628       95760          11
38711              95633       95500          11
38712              95637       95490          11
38713              95641       95470          11
38714              95651       95510          11
38715              95652       95270          11
38716              95656       95510          11
38717              95658       95450          11
38718              95660       95570          11
38719              95675       95380          11
38720              95676       95510          11
38721              95678       95840          11
38722            