In [11]:
import os
import sys
from urllib.parse import quote

import pandas as pd
from psycopg2 import sql
from sqlalchemy import create_engine

In [2]:
# Make the project's '../source' directory importable. The membership check
# keeps the cell idempotent: re-running it no longer appends a duplicate
# entry to sys.path each time.
SOURCE_DIR = os.path.abspath('../source')
if SOURCE_DIR not in sys.path:
    sys.path.append(SOURCE_DIR)

from connection_db.db_utils import get_connection

# NOTE(review): this module-level connection is not used by the cells visible
# here (create_tables opens its own) and is never closed. Confirm whether a
# later cell relies on `conn`; if not, drop it or close it.
conn = get_connection()


Conexión exitosa


In [3]:
def create_tables():
    """Create the 'accidents' table if it does not already exist.

    Opens a connection with get_connection(), runs a single
    CREATE TABLE IF NOT EXISTS statement, commits it, and prints a
    confirmation message. The cursor and the connection are always closed,
    even when executing or committing the statement raises.

    Raises:
        Any exception raised by get_connection(), the cursor, or the commit
        is propagated to the caller after the connection has been closed.
    """
    query = sql.SQL("""
        CREATE TABLE IF NOT EXISTS accidents (
            id SERIAL PRIMARY KEY,
            country VARCHAR(100) NOT NULL,
            year INT NOT NULL,
            month VARCHAR(100) NOT NULL,
            day_of_week VARCHAR(100) NOT NULL,
            time_of_day VARCHAR(100) NOT NULL,
            urban_rural VARCHAR(50) NOT NULL,
            road_type VARCHAR(100) NOT NULL,
            weather_conditions VARCHAR(100) NOT NULL,
            visibility_level FLOAT,
            number_of_vehicles_involved INT,
            speed_limit INT,
            driver_age_group VARCHAR(50),
            driver_gender VARCHAR(50) NOT NULL,
            driver_alcohol_level FLOAT,
            driver_fatigue BOOLEAN,
            vehicle_condition VARCHAR(100) NOT NULL,
            pedestrians_involved INT,
            cyclists_involved INT,
            accident_severity VARCHAR(100) NOT NULL,
            number_of_injuries INT,
            number_of_fatalities INT,
            emergency_response_time FLOAT,
            traffic_volume FLOAT,
            road_condition VARCHAR(100) NOT NULL,
            accident_cause VARCHAR(100) NOT NULL,
            insurance_claims INT,
            medical_cost FLOAT,
            economic_loss FLOAT,
            region VARCHAR(100) NOT NULL,
            population_density FLOAT
        );
    """)

    conn = get_connection()
    try:
        # The cursor context manager closes the cursor on exit, including on error.
        with conn.cursor() as cur:
            cur.execute(query)
        conn.commit()
    finally:
        # Closing without a commit discards any uncommitted work, so a failed
        # statement leaves nothing half-applied and no connection is leaked.
        conn.close()

    print("Tabla 'accidents' creada correctamente.")

# Call the function: creates the 'accidents' table, or does nothing if it
# already exists (the statement uses CREATE TABLE IF NOT EXISTS).
create_tables()

Conexión exitosa
Tabla 'accidents' creada correctamente.


In [4]:
# Load the road-accident records from the CSV file into a DataFrame.
data = pd.read_csv('../data/road_accident_dataset.csv')

# Preview the first five rows (five is head()'s default).
data.head()


Unnamed: 0,Country,Year,Month,Day of Week,Time of Day,Urban/Rural,Road Type,Weather Conditions,Visibility Level,Number of Vehicles Involved,...,Number of Fatalities,Emergency Response Time,Traffic Volume,Road Condition,Accident Cause,Insurance Claims,Medical Cost,Economic Loss,Region,Population Density
0,USA,2002,October,Tuesday,Evening,Rural,Street,Windy,220.414651,1,...,2,58.62572,7412.75276,Wet,Weather,4,40499.856982,22072.878502,Europe,3866.273014
1,UK,2014,December,Saturday,Evening,Urban,Street,Windy,168.311358,3,...,1,58.04138,4458.62882,Snow-covered,Mechanical Failure,3,6486.600073,9534.399441,North America,2333.916224
2,USA,2012,July,Sunday,Afternoon,Urban,Highway,Snowy,341.286506,4,...,4,42.374452,9856.915064,Wet,Speeding,4,29164.412982,58009.145124,South America,4408.889129
3,UK,2017,May,Saturday,Evening,Urban,Main Road,Clear,489.384536,2,...,3,48.554014,4958.646267,Icy,Distracted Driving,3,25797.212566,20907.151302,Australia,2810.822423
4,Canada,2002,July,Tuesday,Afternoon,Rural,Highway,Rainy,348.34485,1,...,4,18.31825,3843.191463,Icy,Distracted Driving,8,15605.293921,13584.060759,South America,3883.645634


In [34]:
# Cast the driver-fatigue flag to bool so it matches the BOOLEAN column of the
# 'accidents' table. The CSV header is "Driver Fatigue"; the snake_case name
# only exists once the frame has been renamed, so look up whichever is
# present instead of assuming the renamed form (which raises KeyError on a
# fresh kernel).
# NOTE(review): astype(bool) turns every non-zero value, including NaN, into
# True; this assumes the column holds 0/1 flags with no missing values. TODO confirm.
fatigue_col = "Driver Fatigue" if "Driver Fatigue" in data.columns else "driver_fatigue"
data[fatigue_col] = data[fatigue_col].astype(bool)

print(data[fatigue_col].head())


0    False
1     True
2    False
3     True
4     True
Name: driver_fatigue, dtype: bool


In [35]:
def get_engine():
    """Create a SQLAlchemy engine for the PostgreSQL database named by PG_* env vars.

    Reads PG_USER, PG_PASSWORD, PG_HOST, PG_PORT and PG_DATABASE from the
    environment and builds a ``postgresql://`` URL from them. The user name
    and password are percent-encoded so characters such as ``@``, ``/`` or
    ``:`` cannot corrupt the URL. PG_PASSWORD and PG_PORT are optional and are
    left out of the URL when unset or empty.

    Returns:
        The engine object returned by ``create_engine``.

    Raises:
        RuntimeError: if PG_USER, PG_HOST or PG_DATABASE is unset or empty
            (previously these silently became the literal string "None").
    """
    user = os.getenv('PG_USER')
    password = os.getenv('PG_PASSWORD')
    host = os.getenv('PG_HOST')
    port = os.getenv('PG_PORT')
    database = os.getenv('PG_DATABASE')

    required = (('PG_USER', user), ('PG_HOST', host), ('PG_DATABASE', database))
    missing = [name for name, value in required if not value]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
        )

    # safe='' also encodes '/', which quote() leaves untouched by default.
    credentials = quote(user, safe='')
    if password:
        credentials += ":" + quote(password, safe='')

    location = f"{host}:{port}" if port else host

    return create_engine(f"postgresql://{credentials}@{location}/{database}")

In [None]:

def insert_data(df: pd.DataFrame):
    """Append a DataFrame to the PostgreSQL 'accidents' table.

    The CSV-style headers are renamed to the snake_case column names of the
    table, missing values are replaced with None, and the rows are appended
    with ``DataFrame.to_sql``. All work is done on a renamed copy, so the
    caller's DataFrame is not modified.

    Args:
        df: Frame whose columns use the original headers listed in
            ``column_mapping``. Columns not present in the mapping are
            passed through with their current names.

    Returns:
        None. The outcome is reported on stdout; any exception raised while
        preparing or inserting the data is caught and printed, not propagated.
    """
    # Source header -> database column name.
    column_mapping = {
        "Country": "country",
        "Year": "year",
        "Month": "month",
        "Day of Week": "day_of_week",
        "Time of Day": "time_of_day",
        "Urban/Rural": "urban_rural",
        "Road Type": "road_type",
        "Weather Conditions": "weather_conditions",
        "Visibility Level": "visibility_level",
        "Number of Vehicles Involved": "number_of_vehicles_involved",
        "Speed Limit": "speed_limit",
        "Driver Age Group": "driver_age_group",
        "Driver Gender": "driver_gender",
        "Driver Alcohol Level": "driver_alcohol_level",
        "Driver Fatigue": "driver_fatigue",
        "Vehicle Condition": "vehicle_condition",
        "Pedestrians Involved": "pedestrians_involved",
        "Cyclists Involved": "cyclists_involved",
        "Accident Severity": "accident_severity",
        "Number of Injuries": "number_of_injuries",
        "Number of Fatalities": "number_of_fatalities",
        "Emergency Response Time": "emergency_response_time",
        "Traffic Volume": "traffic_volume",
        "Road Condition": "road_condition",
        "Accident Cause": "accident_cause",
        "Insurance Claims": "insurance_claims",
        "Medical Cost": "medical_cost",
        "Economic Loss": "economic_loss",
        "Region": "region",
        "Population Density": "population_density"
    }

    engine = None
    try:
        # rename() without inplace returns a new frame, so the caller's
        # DataFrame keeps its original headers and this function can be
        # re-run without depending on earlier mutations.
        records = df.rename(columns=column_mapping)

        # Replace NaN values with None (for PostgreSQL compatibility)
        records = records.where(pd.notna(records), None)

        # Insert data
        engine = get_engine()
        records.to_sql(name="accidents", con=engine, if_exists='append', index=False)

        print(f"{len(records)} registros insertados.")

    except Exception as e:
        print(f"Error al insertar datos: {e}")

    finally:
        # A new engine (and connection pool) is created per call; release it
        # so repeated calls do not accumulate open connections.
        if engine is not None:
            engine.dispose()


# Load the CSV frame into the 'accidents' table. insert_data appends rows
# (if_exists='append'), so re-running this cell inserts the same rows again.
insert_data(data)


132000 registros insertados.
