In [None]:
import os
import sqlalchemy
from sqlalchemy import create_engine

# ----------------------------------------------------------------------------
# Database engine configuration
# ----------------------------------------------------------------------------
# Update the connection string below to match your local or remote Postgres instance.
from pydotenv import Environment
env = Environment('.env')

# Build the connection URL from structured parts instead of an f-string:
#   * the previous f-string nested single quotes inside a single-quoted
#     f-string, which is a SyntaxError on Python versions before 3.12;
#   * URL.create() escapes special characters (e.g. '@', ':' or '/') in the
#     user name / password, which raw string interpolation does not.
db_port = env.get('DB_PORT')
db_url = sqlalchemy.engine.URL.create(
    drivername='postgresql',
    username=env.get('DB_USER'),
    password=env.get('DB_PASS'),
    host=env.get('DB_HOST'),
    port=int(db_port) if db_port else None,  # URL.create expects an integer port
    database=env.get('DB_NAME'),
)

# Current local test engine (edit the .env values as needed):
engine = create_engine(db_url)
engine  # display engine info to confirm connection string format

In [3]:
from typing import List, Optional
from sqlalchemy import ForeignKey, String
from sqlalchemy.orm import DeclarativeBase, Mapped, mapped_column, relationship

class Base(DeclarativeBase):
    """Shared declarative base that the ORM models in this notebook subclass."""

In [4]:
class Cars45(Base):
    """ORM model mapped to the ``cars45`` table.

    ``car_id`` is the primary key. Every attribute is annotated with a plain
    ``Mapped[...]`` type (none is ``Optional``), which SQLAlchemy's annotated
    declarative mapping turns into NOT NULL columns.

    The attribute names match the column headers of the CSV loaded in this
    notebook, so a CSV row dict can be passed as ``Cars45(**row)`` once its
    numeric fields have been cast to ``int``.
    """

    __tablename__ = "cars45"

    # Identifier and free-text summary of the listing.
    car_id: Mapped[str] = mapped_column(primary_key=True)
    description: Mapped[str]
    amount: Mapped[int]  # presumably the asking price; units/currency not shown here, TODO confirm
    region: Mapped[str]
    make: Mapped[str]
    model: Mapped[str]
    year_of_man: Mapped[int]  # presumably year of manufacture
    color: Mapped[str]
    condition: Mapped[str]
    # Integer columns: the CSV stores some of these as float text (e.g. '272474.0'),
    # so values must be cast before insertion.
    mileage: Mapped[int]
    engine_size: Mapped[int]
    selling_cond: Mapped[str]
    bought_cond: Mapped[str]
    trim: Mapped[str]
    drive_train: Mapped[str]
    reg_city: Mapped[str]
    seat: Mapped[str]  # kept as text; the cleaned data holds values such as '5' or '0'
    num_cylinder: Mapped[int]
    horse_power: Mapped[int]
    body_build: Mapped[str]
    fuel_type: Mapped[str]
    transmission: Mapped[str]



In [6]:
import pandas as pd

# Load the source CSV; its first column is used as the DataFrame index.
df = pd.read_csv(
    "./sources/car45_data.csv",
    index_col=0,
)

In [7]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,car_id,description,amount,region,make,model,year_of_man,color,condition,mileage,...,bought_cond,trim,drive_train,reg_city,seat,num_cylinder,horse_power,body_build,fuel_type,transmission
0,5IQTDBTYmvK1tJwhdvGJfESJ,Lexus ES 350 FWD 2013 Red,12937500,"Lagos State, Ikeja",Lexus,ES,2013,Red,Foreign Used,272474.0,...,Imported,350 FWD,Front Wheel,,5.0,6.0,268.0,Sedan,Petrol,Automatic
1,zpZUGomoVXuKk9UFa8j8moC9,Land Rover Range Rover 2012 White,6750000,"Abuja (FCT), Garki 2",Land Rover,Range Rover,2012,White,Nigerian Used,102281.0,...,Registered,,,,,,,SUV,Petrol,Automatic
2,a6ShZXOX4KtY6IBGJIcF3Cxk,Toyota Sequoia 2018 Black,50625000,"Lagos State, Lekki",Toyota,Sequoia,2018,Black,Foreign Used,127390.0,...,Imported,,,,,,,SUV,Petrol,Automatic
3,CciPNDN6vhhQQI1FTQHAbfxi,Toyota Corolla 2007 Green,3600000,"Abuja (FCT), Lugbe District",Toyota,Corolla,2007,Green,Nigerian Used,139680.0,...,Registered,,,ABUJA,,,,,Petrol,Automatic
4,bvwd5LDMx6mIYpVa6Uhi2jqJ,Mercedes-Benz M Class 2005 Silver,3262500,"Lagos State, Isolo",Mercedes-Benz,M Class,2005,Silver,Nigerian Used,220615.0,...,Imported,,,ABUJA,,,,,Petrol,Automatic


In [70]:
# Object-dtype (text) columns get the placeholder 'unknown' for missing values.
# 'seat' is taken out of the list first, so it is not filled here.
string_columns = df.dtypes[df.dtypes == 'O'].index.tolist()
string_columns.remove('seat')
df[string_columns] = df[string_columns].fillna('unknown')

# Spot-check a few random rows of the filled text columns.
df[string_columns].sample(5)

Unnamed: 0,car_id,description,region,make,model,color,condition,selling_cond,bought_cond,trim,drive_train,reg_city,body_build,fuel_type,transmission
250,cLu7fU4qZvkR4YiBFkc7d3Mw,Lexus RX 350 2011 Black,"Lagos State, Lekki",Lexus,RX 350,Black,Foreign Used,Imported,Imported,unknown,unknown,unknown,SUV,Petrol,Automatic
3828,fwi9j28u4b3n4FgLyMwiPXX5,Mazda CX-9 2009 Black,"Lagos State, Amuwo-Odofin",Mazda,CX-9,Black,Foreign Used,Imported,Imported,unknown,unknown,unknown,SUV,Petrol,Automatic
3044,anVA3v42vCT63158NuNwShQU,Toyota Sienna 2014 Blue,"Ondo State, Akure",Toyota,Sienna,Blue,Foreign Used,Imported,Imported,LE 7-Passenger 4dr Minivan AWD (3.5L 6cyl 6A),All Wheel,unknown,Minivan,Petrol,Automatic
907,Cp3QOLqL8obO1eDcAhZ3wEBD,Audi A4 2006 Blue,"Lagos State, Ajah",Audi,A4,Blue,Nigerian Used,Registered,Registered,unknown,unknown,Lagos,unknown,Petrol,Automatic
3497,8Z4fcPTP321ZuYCfknUOQYd0,Toyota Camry 2009 Green,"Oyo State, Ibadan",Toyota,Camry,Green,Foreign Used,Imported,Imported,unknown,unknown,unknown,unknown,Petrol,Automatic


In [71]:
# Numeric columns = every column that is not object-dtype. Iterating over
# df.columns (instead of taking a set difference) keeps the frame's own column
# order, so the preview below is laid out identically on every run.
num_columns = [col for col in df.columns if df.dtypes[col] != 'O']

# Missing numeric values are replaced by 0, i.e. 0 doubles as the "missing" marker.
df[num_columns] = df[num_columns].fillna(0)

df[num_columns].sample(5)

Unnamed: 0,engine_size,year_of_man,mileage,num_cylinder,horse_power,amount
1001,2200.0,2005,176791.0,0.0,0.0,2520000
971,1800.0,2010,139161.0,0.0,0.0,4160000
3972,2200.0,1999,275680.0,0.0,0.0,1092000
1217,3500.0,2007,121586.0,0.0,0.0,4784000
3227,3500.0,2008,106331.0,6.0,0.0,4368000


In [72]:
# Fill missing 'seat' values with the text placeholder "0".
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on `df.seat`: the in-place form acts on an
# intermediate Series, raises a FutureWarning on recent pandas 2.x and leaves
# `df` unchanged once Copy-on-Write is active.
df["seat"] = df["seat"].fillna("0")
df["seat"].sample(5)

3264    0
709     0
152     0
692     0
988     5
Name: seat, dtype: object

In [47]:
# Summary statistics for the numeric columns.
# NOTE(review): the saved output below was produced out of order (execution
# count 47, i.e. before the fillna cells numbered 70-72), which is why its
# per-column counts differ (e.g. mileage 2829 vs amount 2894). Running the
# notebook top to bottom will describe the zero-filled data instead.
df.describe()

Unnamed: 0,amount,year_of_man,mileage,engine_size,num_cylinder,horse_power
count,2894.0,2894.0,2829.0,2846.0,369.0,338.0
mean,4986147.0,2007.782654,243347.5,3079.524596,5.411924,216.816568
std,6177222.0,4.565697,1416721.0,3753.377371,1.301533,69.989033
min,577500.0,1988.0,0.0,25.0,4.0,67.0
25%,2166000.0,2005.0,130059.0,2300.0,4.0,158.0
50%,3203125.0,2007.0,191398.0,3000.0,6.0,225.0
75%,5250000.0,2011.0,265988.0,3500.0,6.0,269.75
max,98700000.0,2023.0,74026750.0,158713.0,8.0,477.0


In [15]:
# Create the tables for every model registered on Base; create_all checks for
# existing tables first (see the pg_class lookup in the log) and skips them.
Base.metadata.create_all(engine)

2025-10-04 00:16:16,801 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2025-10-04 00:16:16,802 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-10-04 00:16:16,807 INFO sqlalchemy.engine.Engine select current_schema()
2025-10-04 00:16:16,808 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-10-04 00:16:16,814 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2025-10-04 00:16:16,816 INFO sqlalchemy.engine.Engine [raw sql] {}
2025-10-04 00:16:16,820 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-10-04 00:16:16,829 INFO sqlalchemy.engine.Engine SELECT pg_catalog.pg_class.relname 
FROM pg_catalog.pg_class JOIN pg_catalog.pg_namespace ON pg_catalog.pg_namespace.oid = pg_catalog.pg_class.relnamespace 
WHERE pg_catalog.pg_class.relname = %(table_name)s AND pg_catalog.pg_class.relkind = ANY (ARRAY[%(param_1)s, %(param_2)s, %(param_3)s, %(param_4)s, %(param_5)s]) AND pg_catalog.pg_table_is_visible(pg_catalog.pg_class.oid) AND pg_catalog.pg_namespace.nspname != %(nspname

In [73]:
# Persist the cleaned frame; index=False writes only the data columns.
df.to_csv(path_or_buf='./clean_cars45.csv', index=False)

In [77]:
from csv import DictReader
import csv

# Read the cleaned CSV back as a list of dicts: one dict per row, keyed by the
# header names, with every value still a string.
# encoding="utf-8" matches what DataFrame.to_csv writes by default; without it
# open() uses the platform's locale encoding and can mis-decode (or fail on)
# non-ASCII text, e.g. on Windows.
with open("./clean_cars45.csv", newline='', encoding="utf-8") as f:
    store = list(csv.DictReader(f))

In [78]:
# Preview only the first records; echoing the whole list prints every row of
# the CSV and bloats the saved notebook.
store[:2]

[{'car_id': '5IQTDBTYmvK1tJwhdvGJfESJ',
  'description': 'Lexus ES 350 FWD 2013 Red',
  'amount': '12937500',
  'region': 'Lagos State, Ikeja',
  'make': 'Lexus',
  'model': 'ES',
  'year_of_man': '2013',
  'color': 'Red',
  'condition': 'Foreign Used',
  'mileage': '272474.0',
  'engine_size': '3500.0',
  'selling_cond': 'Imported',
  'bought_cond': 'Imported',
  'trim': '350 FWD',
  'drive_train': 'Front Wheel',
  'reg_city': 'unknown',
  'seat': '5',
  'num_cylinder': '6.0',
  'horse_power': '268.0',
  'body_build': 'Sedan',
  'fuel_type': 'Petrol',
  'transmission': 'Automatic'},
 {'car_id': 'zpZUGomoVXuKk9UFa8j8moC9',
  'description': 'Land Rover Range Rover 2012 White',
  'amount': '6750000',
  'region': 'Abuja (FCT), Garki 2',
  'make': 'Land Rover',
  'model': 'Range Rover',
  'year_of_man': '2012',
  'color': 'White',
  'condition': 'Nigerian Used',
  'mileage': '102281.0',
  'engine_size': '5000.0',
  'selling_cond': 'Registered',
  'bought_cond': 'Registered',
  'trim': 'unk

In [84]:
from sqlalchemy.orm import Session

def clean_and_cast(record):
    """Cast the numeric fields of one CSV row to ``int``, in place.

    The fields ``amount``, ``year_of_man``, ``mileage``, ``engine_size``,
    ``num_cylinder`` and ``horse_power`` are converted; every other key is
    left untouched. A numeric key that is absent is added with value ``None``.

    Conversion rules for each numeric field:

    * ``""`` or ``None`` -> ``None``
    * ``"unknown"`` (any letter case) -> the literal ``"unknown"``
    * integer-like or float-like input (``"2013"``, ``"272474.0"``) -> ``int``,
      with any fractional part truncated
    * anything else that cannot be parsed, including NaN and infinity
      -> ``"unknown"``

    NOTE(review): ``Cars45`` annotates these columns as ``Mapped[int]``, so a
    ``None`` or ``"unknown"`` result will presumably be rejected on insert;
    confirm the cleaned CSV never contains such values.

    Args:
        record: Mutable mapping for one row (e.g. a ``csv.DictReader`` row).

    Returns:
        The same ``record`` object, after mutation.
    """
    numeric_fields = (
        "amount",
        "year_of_man",
        "mileage",
        "engine_size",
        "num_cylinder",
        "horse_power",
    )

    def to_int_or_unknown(value):
        # Empty / missing values become None.
        if value in ("", None):
            return None
        # Keep the 'unknown' marker (normalised to lower case).
        if str(value).lower() == "unknown":
            return "unknown"
        try:
            try:
                # Parse integers directly so large values are not rounded by a
                # detour through float (floats are exact only up to 2**53).
                return int(value)
            except ValueError:
                # Float-formatted text: "272474.0" -> 272474
                return int(float(value))
        except (ValueError, OverflowError):
            # ValueError: unparsable text or NaN; OverflowError: infinity.
            return "unknown"

    # Numeric fields (cast or preserve 'unknown')
    for field in numeric_fields:
        record[field] = to_int_or_unknown(record.get(field))

    # Other text fields are left untouched; they already carry 'unknown'.
    return record


# Insert every cleaned row in a single transaction.
# Session.add_all() is the regular unit-of-work API; Session.bulk_save_objects()
# is documented as a legacy feature in SQLAlchemy 2.0.
# NOTE: re-running this cell against a table that already holds these car_id
# values will fail on the primary key.
with Session(engine) as sess:
    sess.add_all(Cars45(**clean_and_cast(record)) for record in store)
    sess.commit()

2025-10-04 11:12:47,312 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2025-10-04 11:12:47,455 INFO sqlalchemy.engine.Engine INSERT INTO cars45 (car_id, description, amount, region, make, model, year_of_man, color, condition, mileage, engine_size, selling_cond, bought_cond, trim, drive_train, reg_city, seat, num_cylinder, horse_power, body_build, fuel_type, transmission) V ... 426484 characters truncated ... linder__999)s, %(horse_power__999)s, %(body_build__999)s, %(fuel_type__999)s, %(transmission__999)s)
2025-10-04 11:12:47,456 INFO sqlalchemy.engine.Engine [cached since 755.8s ago (insertmanyvalues) 1/3 (unordered)] {'selling_cond__0': 'Imported', 'fuel_type__0': 'Petrol', 'bought_cond__0': 'Imported', 'reg_city__0': 'unknown', 'engine_size__0': 3500, 'seat__0': '5', 'body_build__0': 'Sedan', 'mileage__0': 272474, 'model__0': 'ES', 'year_of_man__0': 2013, 'num_cylinder__0': 6, 'car_id__0': '5IQTDBTYmvK1tJwhdvGJfESJ', 'condition__0': 'Foreign Used', 'color__0': 'Red', 'description__