In [1]:
import logging
import os
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv

from etl.extract import extract
from etl.load import load
from etl.transform import transform

In [10]:
# Load variables from a .env file (python-dotenv) into the process
# environment so the lookups below can see them.
load_dotenv()

# Database connection settings handed to the etl helpers.
# Every value comes straight from the environment and is None when the
# variable is unset; the port is kept as the raw string.
DB_CONFIG = dict(
    host=os.environ.get("DB_HOST"),
    user=os.environ.get("DB_USER"),
    password=os.environ.get("DB_PASS"),
    dbname=os.environ.get("DB_NAME"),
    port=os.environ.get("DB_PORT"),
)

def setup_logging(log_file='chicago_crime.log', level=logging.DEBUG):
    """Send root-logger output to a log file.

    Args:
        log_file: Path of the file that log records are appended to.
            Defaults to 'chicago_crime.log', resolved against the current
            working directory.
        level: Minimum severity recorded by the root logger. Defaults to
            logging.DEBUG.
    """
    logging.basicConfig(
        filename=log_file,
        level=level,
        format='%(asctime)s - %(levelname)s - %(message)s',
        # force=True removes handlers already attached to the root logger,
        # so re-running this cell does not stack duplicate handlers.
        force=True,
    )

# Apply the logging configuration now, before any of the cells below run.
setup_logging()

## Local file with records from 2001 to present

In [None]:
# Directory that holds the local CSV files, relative to the notebook's
# working directory.
base_data_path = Path("./data")

# File names of the local CSVs. Going by the names, the first is the raw
# 2001-to-present export and the second a cleaned subset from 2024 onward.
raw_file_name = "Crimes_-_2001_to_Present_20250403.csv"
cleaned_subset_file_name = "cleaned_subset_2024_to_Present_20250403.csv"

In [None]:
# Choose which local CSV to ingest; point file_name at another file name
# to read a different file from base_data_path.
file_name = raw_file_name

# Read the selected CSV into a DataFrame.
df = pd.read_csv(base_data_path.joinpath(file_name))

In [None]:
# Hand the frame read from the local CSV to etl.load together with the
# connection settings; "crimes" is the third argument (presumably the
# target table name - confirm in etl.load).
# NOTE(review): this cell runs before the transform cell that follows it,
# whereas the API section transforms before loading - confirm the intended order.
load(df, DB_CONFIG, "crimes")

In [None]:
# Run the project's transform step on the frame and rebind df to the result.
# NOTE(review): df is overwritten, so re-running this cell passes already
# transformed data back into transform - confirm transform tolerates that.
df = transform(df)

## Update data from the API continuously
The following steps are repeated daily through Airflow.

### Extract

In [None]:
# Application token read from the environment; None when APP_TOKEN is unset.
APP_TOKEN = os.environ.get("APP_TOKEN")

# Fetch records through the project's extract step, passing it the
# database settings and the token.
records = extract(DB_CONFIG, APP_TOKEN)

In [4]:
# Build a DataFrame from the records returned by extract.
df = pd.DataFrame(records)

In [5]:
# Preview the first two rows of the extracted data.
df.head(2)

Unnamed: 0,id,case_number,date,block,iucr,primary_type,description,location_description,arrest,domestic,...,ward,community_area,fbi_code,year,updated_on,x_coordinate,y_coordinate,latitude,longitude,location
0,13808754,JJ221820,2023-06-01T00:00:00.000,050XX N WINTHROP AVE,1120,DECEPTIVE PRACTICE,FORGERY,APARTMENT,False,False,...,48,3,10,2023,2025-04-18T15:42:00.000,,,,,
1,13802531,JJ213998,2025-04-10T18:00:00.000,064XX N MILWAUKEE AVE,1310,CRIMINAL DAMAGE,TO PROPERTY,COMMERCIAL / BUSINESS OFFICE,False,False,...,45,10,14,2025,2025-04-18T15:42:00.000,1132296.0,1942420.0,41.9982415,-87.788687473,"{'latitude': '41.9982415', 'longitude': '-87.7..."


### Transform

In [6]:
# Run the project's transform step on the extracted frame and rebind df to the result.
# NOTE(review): df is overwritten, so re-running this cell passes already
# transformed data back into transform - confirm transform tolerates that.
df = transform(df)

In [13]:
# Summarise column names, non-null counts and dtypes of the current frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 635 entries, 0 to 634
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   id                    635 non-null    Int64         
 1   case_number           635 non-null    string        
 2   date                  635 non-null    datetime64[ns]
 3   block                 635 non-null    string        
 4   iucr                  635 non-null    string        
 5   primary_type          635 non-null    string        
 6   description           635 non-null    string        
 7   location_description  630 non-null    string        
 8   arrest                635 non-null    boolean       
 9   domestic              635 non-null    boolean       
 10  beat                  635 non-null    Int64         
 11  district              635 non-null    Int64         
 12  ward                  635 non-null    Int64         
 13  community_area      

### Load

In [7]:
# Hand the frame to etl.load together with the connection settings.
# NOTE(review): no third argument is passed here, while the local-file
# section passes "crimes" - confirm load's default target matches.
load(df, DB_CONFIG)