# Creation denormalized table

In [3]:
import pandas as pd

# Load all datasets
df_basilic = pd.read_csv("../data/processed/basilic_insee_norm.csv")
df_dept = pd.read_csv("../data/processed/departments_for_sql.csv")
df_communes = pd.read_csv("../data/processed/communes_for_sql.csv")
df_area = pd.read_csv("../data/processed/web_scraping/departments_area_wiki.csv")

# Get department population
dept_pop = df_communes.groupby('dept_code', as_index=False).agg({
    'population': 'sum'
})

# Merge BASILIC with departments
bq_table = df_basilic.merge(df_dept[['dept_code', 'dept_name', 'region_code']], 
                             on='dept_code', how='left')

# Merge with population
bq_table = bq_table.merge(dept_pop[['dept_code', 'population']], 
                          on='dept_code', how='left')

# Merge with area
bq_table = bq_table.merge(df_area[['dept_code', 'area_km2']], 
                          on='dept_code', how='left')

# Add date column (for partitioning)
from datetime import datetime
bq_table['load_date'] = datetime.now().strftime('%Y-%m-%d')

# Select important columns
columns_to_keep = [
    'id', 'nom', 'type_equipement_ou_lieu', 'domaine', 'sous_domaine',
    'dept_code', 'dept_name', 'region_code', 'region',
    'latitude', 'longitude', 'code_postal',
    'population', 'area_km2', 'load_date'
]

bq_table = bq_table[columns_to_keep]

print(f"Denormalized table:")
print(f"   Rows: {len(bq_table):,}")
print(f"   Columns: {len(bq_table.columns)}")



  df_basilic = pd.read_csv("../data/processed/basilic_insee_norm.csv")


Denormalized table:
   Rows: 88,025
   Columns: 15


In [5]:
# Export for BigQuery
output_path = "../data/processed/bigquery/cultural_loom_analytics.csv"
bq_table.to_csv(output_path, index=False, encoding='utf-8')

print(f"\n EXPORTED: {output_path}")
print(f"   Size: {bq_table.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")


 EXPORTED: ../data/processed/bigquery/cultural_loom_analytics.csv
   Size: 46.7 MB


In [10]:
import pandas as pd
from datetime import datetime

# Load data
df_basilic = pd.read_csv("../data/processed/basilic_insee_.csv")
df_communes = pd.read_csv("../data/processed/communes_for_sql.csv")
df_area = pd.read_csv("../data/processed/web_scraping/departments_area_wiki.csv")

# Aggregate population
dept_pop = df_communes.groupby('dept_code', as_index=False).agg({'population': 'sum'})

# Merge
bq_table = df_basilic.merge(dept_pop, on='dept_code', how='left')
bq_table = bq_table.merge(df_area, on='dept_code', how='left')

# Add date
bq_table['load_date'] = '2026-02-11'

# SIMPLE COLUMNS ONLY
simple_table = bq_table[[
    'id', 'nom', 'type_equipement_ou_lieu', 'domaine',
    'dept_code', 'region', 'latitude', 'longitude',
    'population', 'area_km2', 'load_date'
]].copy()

# CLEAN - Replace problematic characters
simple_table['nom'] = simple_table['nom'].astype(str).str.replace('"', '').str.replace("'", '')
simple_table['type_equipement_ou_lieu'] = simple_table['type_equipement_ou_lieu'].astype(str)
simple_table['domaine'] = simple_table['domaine'].astype(str)
simple_table['region'] = simple_table['region'].astype(str)

# Fill NaN with empty string or 0
simple_table['nom'] = simple_table['nom'].fillna('')
simple_table['type_equipement_ou_lieu'] = simple_table['type_equipement_ou_lieu'].fillna('Unknown')
simple_table['domaine'] = simple_table['domaine'].fillna('Unknown')
simple_table['region'] = simple_table['region'].fillna('Unknown')
simple_table['latitude'] = simple_table['latitude'].fillna(0)
simple_table['longitude'] = simple_table['longitude'].fillna(0)
simple_table['population'] = simple_table['population'].fillna(0)
simple_table['area_km2'] = simple_table['area_km2'].fillna(0)

# Export
simple_table.to_csv("../data/processed/bq_clean.csv", index=False, encoding='utf-8')

print(f"CLEAN CSV: {len(simple_table):,} rows")

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/basilic_insee_.csv'