# libs_daw Demonstration - Step-by-Step Usage

This notebook walks through the `libs_daw` library step by step, using its simplified `NYCDataPipeline` to process NYC 311 and median rent data.

## Library Import

In [None]:
# Third-party dependency first, then the project-local pipeline entry point.
import pandas as pd

from libs_daw import NYCDataPipeline

# Reaching this line means both imports above succeeded.
print("Simplified libs_daw library successfully imported!")

## Step-by-Step Pipeline Usage

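The pipeline runs in four main data processing steps: data loading, data cleaning, data transformation, and aggregation and integration. We start by creating the pipeline object: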

In [None]:
# Build the pipeline object with its default constructor arguments; every
# step cell further down calls a method on this instance.
pipeline = NYCDataPipeline()
print("Pipeline initialized!")

### Step 1: Data Loading

In [None]:
# load_data() takes no arguments here and its result is unpacked into four
# names, one per line, so each can be passed on individually later.
# NOTE(review): what each object contains is defined inside libs_daw - confirm
# there before relying on the variable names alone.
(
    df_nyc_311,
    df_median_rent,
    uhf_data,
    manual_map,
) = pipeline.load_data()

### Step 2: Data Cleaning

In [None]:
# Pass both loaded datasets through the cleaning step and bind the two results
# to new *_cleaned names.
(
    df_nyc_311_cleaned,
    df_median_rent_cleaned,
) = pipeline.clean_data(df_nyc_311, df_median_rent)

### Step 3: Data Transformation

In [None]:
# The transformation step receives the two cleaned datasets plus the two
# auxiliary objects returned by load_data(), and yields two results.
(
    df_nyc_311_transformed,
    df_median_rent_transformed,
) = pipeline.transform_data(
    df_nyc_311_cleaned,
    df_median_rent_cleaned,
    uhf_data,
    manual_map,
)

### Step 4: Aggregation and Integration

In [None]:
# Merge the two transformed datasets into the single final dataset used by the
# results cells below.
final_dataset = pipeline.aggregate_and_integrate(
    df_nyc_311_transformed,
    df_median_rent_transformed,
)

# Report the resulting shape as a quick sanity check.
print(f"\nFinal dataset created: {final_dataset.shape}")

## Results Overview

In [None]:
# Structure summary: banner, shape and column names, one item per output line.
print(
    "=== FINAL DATASET STRUCTURE ===",
    f"Size: {final_dataset.shape}",
    f"Columns: {list(final_dataset.columns)}",
    sep="\n",
)

# Rich (HTML) preview of the first ten rows.
print("\n=== DATA SAMPLE ===")
display(final_dataset.head(10))

In [None]:
# Headline counts for the final dataset, one per output line.
print(
    "=== BASIC STATISTICS ===",
    f"Total records: {len(final_dataset):,}",
    f"Unique neighborhoods: {final_dataset['neighborhood'].nunique()}",
    f"Unique complaint types: {final_dataset['complaint_type'].nunique()}",
    f"Year range: {final_dataset['year'].min()}-{final_dataset['year'].max()}",
    sep="\n",
)

# Per-column completeness: share of non-null cells as a percentage, rounded to
# one decimal place.
print("\n=== DATA COMPLETENESS ===")
completeness = (
    final_dataset.notna()
    .sum()
    .div(len(final_dataset))
    .mul(100)
    .round(1)
)
for column_name, percent_filled in completeness.items():
    print(f"{column_name}: {percent_filled}%")

In [None]:
# Bare last expression: the notebook renders the whole final dataset as this
# cell's output.
# NOTE(review): a head(10) sample is already displayed in the results overview
# cell; if the frame is large, consider a bounded view here instead.
final_dataset
