In [1]:
import pandas as pd
import time
import os

In [2]:
#The dataset is very big and pandas has trouble assigning dtypes to the columns automatically because of bad data quality. We assign dtype=str and cast manually.
from ETL.load import load

In [3]:
#Remove columns that have too many nulls; the threshold is a fraction, e.g. max_nulls = 0.5 means 50%
from ETL.remove_too_empty_cols import remove_too_empty_cols

In [4]:
#Check that IDs are unique and remove duplicated entries if any exist
from ETL.remove_duplicated_IDs import remove_duplicated_IDs

In [5]:
from ETL.dates_to_datetime import dates_to_datetime

In [6]:
from ETL.remove_invalid_zip_codes import remove_invalid_zip_codes

In [7]:
from ETL.remove_invalid_status import remove_invalid_status

In [8]:
from ETL.polish_strings import polish_strings

In [9]:
from tests.final_validation import final_validation

In [10]:
def main():
    """Run the cleaning pipeline end to end and save the result as CSV.

    Reads the ``NROWS`` environment variable (default ``"20000"``), converts it
    to ``int`` and hands it to ``load``. The loaded frame is then piped, in
    order, through ``remove_too_empty_cols`` (with ``0.5``),
    ``remove_duplicated_IDs``, ``dates_to_datetime``,
    ``remove_invalid_zip_codes``, ``remove_invalid_status``, ``polish_strings``
    and ``final_validation``.

    Afterwards a one-line summary (elapsed seconds, row count, column count)
    is printed and the frame is written to ``311_NYC_requests_clean.csv`` in
    the current working directory. The elapsed time covers loading and the
    pipeline steps only, not the CSV write.
    """
    started_at = time.time()

    # NROWS is forwarded to load(); per the original author's note it is used to
    # switch between the real dataset and a sample (for the GitHub CI deploy).
    row_limit = int(os.getenv("NROWS", "20000"))

    # Each stage receives the output of the previous one, in this exact order.
    stage = load(row_limit)
    stage = stage.pipe(remove_too_empty_cols, 0.5)
    stage = stage.pipe(remove_duplicated_IDs)
    stage = stage.pipe(dates_to_datetime)
    stage = stage.pipe(remove_invalid_zip_codes)
    stage = stage.pipe(remove_invalid_status)
    stage = stage.pipe(polish_strings)
    cleaned = stage.pipe(final_validation)

    elapsed = round(time.time() - started_at, 2)
    print(f'Your data is ready, processed in {elapsed} seconds, resulting in {cleaned.shape[0]} rows and {cleaned.shape[1]} columns')

    # NOTE(review): to_csv also writes the index by default — confirm that is intended.
    cleaned.to_csv('311_NYC_requests_clean.csv')
    
# Execute the pipeline when this cell runs; the printed log lines below come from this call.
main()

Loaded sample of the total dataset, with some of the total columns only
==>Removing too empty columns: removed {'Intersection Street 1', 'Vehicle Type', 'School or Citywide Complaint', 'Intersection Street 2', 'Ferry Direction', 'Taxi Company Borough', 'Landmark', 'Bridge Highway Name', 'Ferry Terminal Name', 'Bridge Highway Segment', 'Road Ramp', 'Garage Lot Name', 'Bridge Highway Direction', 'Taxi Pick Up Location'}
==>No duplicated IDs to remove
==>Casting dates as datetime format
==>Step check zip codes: removed incident zip codes [nan '00083']
==>No invalid status to remove
==>All string columns have been standarized
==> Correct schema validation with Pandera. Checked nullability, data types, categorical values, and unique IDs.
Your data is ready, processed in 0.51 seconds, resulting in 19859 rows and 12 columns


top-level pandera module will be **removed in a future version of pandera**.
If you're using pandera to validate pandas objects, we highly recommend updating
your import:

```
# old import
import pandera as pa

# new import
import pandera.pandas as pa
```

If you're using pandera to validate objects from other compatible libraries
like pyspark or polars, see the supported libraries section of the documentation
for more information on how to import pandera:

https://pandera.readthedocs.io/en/stable/supported_libraries.html


```
```

