# NYC Data Wrangling Pipeline

This notebook performs comprehensive data wrangling on NYC 311 complaint data and median rent data to create a unified dataset for analysis.

## Pipeline Structure:
1. **Data Import** - Load required libraries and datasets
2. **Data Cleaning** - Handle missing values, duplicates, and data quality issues
3. **Data Transformation** - Create new features, map records to neighborhoods, and aggregate by month
4. **Data Integration** - Merge datasets and create final output

## 1. Data Import

### Import Required Libraries

In [None]:
# Import necessary libraries for data manipulation and analysis
import pandas as pd
import json

### Load Datasets

In [None]:
# Read both raw inputs; the 311 extract is indexed by its `unique_key` column
nyc_311_csv = 'data/nyc_311_2024_2025_sample.csv'
median_rent_csv = 'data/medianAskingRent_All.csv'

df_nyc_311 = pd.read_csv(nyc_311_csv, index_col="unique_key")
df_median_rent = pd.read_csv(median_rent_csv)

# Print (rows, columns) for each frame as a quick sanity check on the load
nyc_311_shape = df_nyc_311.shape
median_rent_shape = df_median_rent.shape
print(f"NYC 311 data shape: {nyc_311_shape}")
print(f"Median rent data shape: {median_rent_shape}")

### Load Mapping Files

In [None]:
# Load the two JSON lookup tables used later to attach a neighborhood to each record.
# The encoding is stated explicitly: without it, open() falls back to the platform's
# locale encoding, which is not UTF-8 on every system and would mis-decode any
# non-ASCII characters in the names.

# ZIP code to neighborhood source data
with open('nyc_uhf_zipcodes.json', 'r', encoding='utf-8') as f:
    uhf_data = json.load(f)

# Manual mapping from rent-data area names to neighborhoods
with open('manual_map.json', 'r', encoding='utf-8') as f:
    manual_map = json.load(f)

### Initial Data Exploration

In [None]:
# Show the first rows of each raw dataset under its own heading
dataset_previews = (
    ("=== NYC 311 Dataset Sample ===", df_nyc_311),
    ("\n=== Median Rent Dataset Sample ===", df_median_rent),
)
for heading, frame in dataset_previews:
    print(heading)
    print(frame.head())

## 2. Data Cleaning

### NYC 311 Data - Column Selection and Initial Cleaning

In [None]:
# Columns carried forward from the raw 311 extract; all other columns are dropped.
# The order here is the column order of the resulting frame.
list_of_relevant_columns = [
    'created_date',
    'closed_date',
    'complaint_type',
    'descriptor',
    'status',
    'resolution_description',
    'resolution_action_updated_date',
    'borough',
    'community_board',
    'incident_zip',
    'incident_address',
    'street_name',
    'city',
    'latitude',
    'longitude',
]

# Work on an independent copy so later column assignments never touch the raw frame
df_nyc_311_selected = df_nyc_311.loc[:, list_of_relevant_columns].copy()
print(f"Selected {len(list_of_relevant_columns)} columns from NYC 311 data")

### Median Rent Data - Column Selection

In [None]:
# Keep the first three columns of the rent file (descriptive info) plus every
# column whose label starts with 2024 or 2025
date_columns = [col for col in df_median_rent.columns if col.startswith(('2024', '2025'))]
info_columns = df_median_rent.columns[:3].to_list()

df_median_rent_selected = df_median_rent[info_columns + date_columns].copy()
print(f"Selected {len(date_columns)} date columns plus 3 info columns from rent data")

### Missing Values Analysis

In [None]:
# Count nulls once per column, then derive both the count and the percentage view
null_counts_311 = df_nyc_311_selected.isna().sum()
row_count_311 = len(df_nyc_311_selected)

missing_values = null_counts_311.sort_values(ascending=False)
missing_percentage = (null_counts_311 / row_count_311 * 100).sort_values(ascending=False)

# One row per column, restricted to the columns that actually contain nulls
missing_data = pd.DataFrame({
    'Missing_Count': missing_values,
    'Missing_Percentage': missing_percentage
})
missing_data = missing_data.loc[missing_data['Missing_Count'] > 0]

print(f"Total number of rows in NYC 311 dataset: {row_count_311}")
print("\nMissing values analysis:")
missing_data.round(2)

In [None]:
# Same null summary as for the 311 data, applied to the selected rent columns
null_counts_rent = df_median_rent_selected.isna().sum()
row_count_rent = len(df_median_rent_selected)

missing_values_rent = null_counts_rent.sort_values(ascending=False)
missing_percentage_rent = (null_counts_rent / row_count_rent * 100).sort_values(ascending=False)

# One row per column, restricted to the columns that actually contain nulls
missing_data_rent = pd.DataFrame({
    'Missing_Count': missing_values_rent,
    'Missing_Percentage': missing_percentage_rent
})
missing_data_rent = missing_data_rent.loc[missing_data_rent['Missing_Count'] > 0]

print(f"Total number of rows in rent dataset: {row_count_rent}")
print("\nMissing values analysis for rent data:")
missing_data_rent.round(2)

### Duplicate Removal

In [None]:
# Report fully identical rows in each dataset before changing anything
rent_duplicate_count = df_median_rent_selected.duplicated().sum()
nyc_duplicate_count = df_nyc_311_selected.duplicated().sum()
print(f"Duplicate rows in rent data: {rent_duplicate_count}")
print(f"Duplicate rows in 311 data: {nyc_duplicate_count}")

# Only the 311 data is de-duplicated; the rent data is reported but left unchanged.
# NOTE(review): the comparison uses column values only. `unique_key` is the index and
# is not part of it, so separate requests whose selected fields all match are
# collapsed into one row -- confirm this is intended.
original_shape = df_nyc_311_selected.shape
df_nyc_311_selected = df_nyc_311_selected.drop_duplicates()
rows_removed = original_shape[0] - df_nyc_311_selected.shape[0]

print(f"Removed {rows_removed} duplicate rows from NYC 311 data")
print(f"New shape: {df_nyc_311_selected.shape}")

### Date Data Cleaning

In [None]:
# Parse the three timestamp columns; values that cannot be parsed become NaT
for date_col in ('created_date', 'closed_date', 'resolution_action_updated_date'):
    df_nyc_311_selected[date_col] = pd.to_datetime(df_nyc_311_selected[date_col], errors='coerce')

created = df_nyc_311_selected['created_date']
closed = df_nyc_311_selected['closed_date']

# Rows closed before they were created are inconsistent and get removed
invalid_dates = df_nyc_311_selected[created > closed]
print(f"Number of rows with created_date > closed_date (will be removed): {invalid_dates.shape[0]}")

# Keep rows with a consistent date pair, plus rows that have no closed_date.
# Comparisons against NaT are False, so a row with a missing created_date
# survives only when its closed_date is missing as well.
keep_row = (created <= closed) | closed.isna()
df_nyc_311_selected = df_nyc_311_selected[keep_row]

print(f"Final NYC 311 data shape after date cleaning: {df_nyc_311_selected.shape}")

### Geographic Data Cleaning

In [None]:
# Normalise city labels (trim surrounding whitespace, upper-case) so that
# spelling variants of the same place collapse into one value
city_normalised = df_nyc_311_selected['city'].str.strip().str.upper()

# Labels that get folded into a single 'OUTSIDE NYC' bucket.
# NOTE(review): Breezy Point is, as far as I know, a Queens neighborhood inside the
# city limits, and the other two names also exist on the Queens side of the
# border -- confirm this list before relying on the 'OUTSIDE NYC' label.
outside_nyc_locations = ['FLORAL PARK', 'NEW HYDE PARK', 'BREEZY POINT']
df_nyc_311_selected['city'] = city_normalised.replace(outside_nyc_locations, 'OUTSIDE NYC')

print("City names standardized")
unique_city_count = df_nyc_311_selected['city'].nunique()
print(f"Unique cities after cleaning: {unique_city_count}")

## 3. Data Transformation

### Feature Engineering - NYC 311 Data

In [None]:
# Time from creation to closure, in hours (NaN when either timestamp is missing)
elapsed = df_nyc_311_selected['closed_date'] - df_nyc_311_selected['created_date']
df_nyc_311_selected['resolution_time_hours'] = elapsed.dt.total_seconds() / 3600

# Calendar month and year of the creation timestamp, used later as grouping and merge keys
created_parts = df_nyc_311_selected['created_date'].dt
df_nyc_311_selected['month'] = created_parts.month
df_nyc_311_selected['year'] = created_parts.year

print("Created new features: resolution_time_hours, month, year")
print("Resolution time statistics:")
print(df_nyc_311_selected['resolution_time_hours'].describe())

### Geographic Mapping - ZIP to Neighborhood

In [None]:
# Build a flat ZIP code -> neighborhood lookup from the nested mapping file
# (borough -> list of entries, each holding a 'neighborhood' name and its 'zip_codes').
#
# Keys are normalised to five-character, zero-padded strings because that is the
# format the 311 ZIP codes are converted to before the lookup. Without this, ZIP
# codes stored in the JSON as numbers or without leading zeros would never match.
# If a ZIP code is listed under several neighborhoods, the last one listed wins.
zip_to_neighborhood = {
    str(zip_code).strip().zfill(5): neighborhood_info['neighborhood']
    for neighborhoods in uhf_data.values()
    for neighborhood_info in neighborhoods
    for zip_code in neighborhood_info['zip_codes']
}

print(f"Created mapping for {len(zip_to_neighborhood)} ZIP codes to neighborhoods")

In [None]:
# Map each complaint's ZIP code to a neighborhood.
#
# incident_zip is first coerced to a number: values that are missing or not numeric
# become NaN instead of raising, so one malformed ZIP cannot abort the whole cell.
zip_numeric = pd.to_numeric(df_nyc_311_selected['incident_zip'], errors='coerce')

# Render valid ZIPs as five-character, zero-padded strings (the lookup key format).
# The 0 placeholder only exists so the integer cast succeeds; .where() blanks those
# rows out again so a missing ZIP is never looked up as '00000'.
zip_as_text = (
    zip_numeric.fillna(0).astype(int).astype(str).str.zfill(5)
    .where(zip_numeric.notna())
)

# ZIPs that are absent from the lookup (or missing) end up as NaN
df_nyc_311_selected['neighborhood'] = zip_as_text.map(zip_to_neighborhood)

# Report mapping results
mapped_records = df_nyc_311_selected['neighborhood'].notna().sum()
total_records = len(df_nyc_311_selected)
# Guard the division so an empty frame reports 0% instead of dividing by zero
coverage_percentage = (mapped_records / total_records * 100) if total_records else 0.0

print(f"Neighborhood mapping results:")
print(f"Records with neighborhood: {mapped_records:,}")
print(f"Records without neighborhood: {total_records - mapped_records:,}")
print(f"Coverage percentage: {coverage_percentage:.2f}%")

### Geographic Mapping - Rent Data

In [None]:
# Attach a neighborhood to each rent row by looking up its lower-cased area name
# in the manual mapping; area names without an entry are left as NaN
area_names_lower = df_median_rent_selected['areaName'].str.lower()
df_median_rent_selected['neighborhood'] = area_names_lower.map(manual_map)

# Summarise how many rent rows received a neighborhood
mapped_rent_records = df_median_rent_selected['neighborhood'].notna().sum()
total_rent_records = len(df_median_rent_selected)
unmapped_rent_records = total_rent_records - mapped_rent_records

print("Rent data neighborhood mapping results:")
print(f"Records with neighborhood: {mapped_rent_records}")
print(f"Records without neighborhood: {unmapped_rent_records}")
print(f"Coverage percentage: {(mapped_rent_records / total_rent_records * 100):.2f}%")

### Data Aggregation - NYC 311 Complaints

In [None]:
# Aggregate complaints by neighborhood, complaint type, year, and month.
#
# complaint_count uses 'size' (rows per group) rather than 'count': 'count' only
# counts non-null resolution_time_hours, which would leave out every complaint that
# has no closed_date and therefore understate the complaint volume.
# median_resolution_time_hours is the median over the complaints that do have a
# resolution time (NaN when none in the group has one).
# Rows with a missing grouping key (e.g. no mapped neighborhood) are excluded by
# groupby's default dropna=True.
complaints_by_neighborhood = (
    df_nyc_311_selected
    .groupby(['neighborhood', 'complaint_type', 'year', 'month'])
    .agg(
        complaint_count=('resolution_time_hours', 'size'),
        median_resolution_time_hours=('resolution_time_hours', 'median'),
    )
    .reset_index()
)

# Sort by neighborhood, year, month, and then by descending complaint count
complaints_by_neighborhood = complaints_by_neighborhood.sort_values(
    by=['neighborhood', 'year', 'month', 'complaint_count'],
    ascending=[True, True, True, False]
)

print(f"Aggregated complaints data shape: {complaints_by_neighborhood.shape}")
print(f"Unique neighborhoods in complaints: {complaints_by_neighborhood['neighborhood'].nunique()}")

### Data Aggregation - Rent Data

In [None]:
# Collapse the rent table to one row per neighborhood: for every monthly column,
# take the median over all areas that map to that neighborhood
date_columns = [
    col for col in df_median_rent_selected.columns
    if col.startswith(('2024', '2025'))
]
rent_grouped = df_median_rent_selected.groupby('neighborhood')
median_rent_by_neighborhood = rent_grouped[date_columns].median()

print(f"Aggregated rent data shape: {median_rent_by_neighborhood.shape}")
print(f"Unique neighborhoods in rent data: {median_rent_by_neighborhood.index.nunique()}")

### Reshape Rent Data

In [None]:
# Wide -> long: every (neighborhood, monthly column) pair becomes one row, with the
# former column label stored in `date` and the cell value in `median_rent`
rent_wide = median_rent_by_neighborhood.reset_index()
rent_melted = pd.melt(
    rent_wide,
    id_vars='neighborhood',
    var_name='date',
    value_name='median_rent'
)

# Parse the former column labels as dates and split out the year/month merge keys
rent_melted['date'] = pd.to_datetime(rent_melted['date'])
rent_date_parts = rent_melted['date'].dt
rent_melted['year'] = rent_date_parts.year
rent_melted['month'] = rent_date_parts.month

print(f"Reshaped rent data shape: {rent_melted.shape}")
print("Sample of reshaped rent data:")
rent_melted.head()

## 4. Data Integration

### Merge Complaints and Rent Data

In [None]:
# Left-join the monthly rent onto the complaint aggregates so every complaint row is
# kept; rows without a matching (neighborhood, year, month) get NaN for median_rent
rent_lookup = rent_melted[['neighborhood', 'year', 'month', 'median_rent']]
df_merged_monthly = complaints_by_neighborhood.merge(
    rent_lookup,
    how='left',
    on=['neighborhood', 'year', 'month']
)

has_rent = df_merged_monthly['median_rent'].notna()
print(f"Final merged dataset shape: {df_merged_monthly.shape}")
print(f"Records with rent data: {has_rent.sum()}")
print(f"Records without rent data: {(~has_rent).sum()}")

# Display sample of merged data
print("\nSample of merged data:")
df_merged_monthly.head(10)

### Data Quality Check

In [None]:
# Headline figures for the merged dataset
merged_row_count = len(df_merged_monthly)
merged_years = df_merged_monthly['year']

print("=== Final Dataset Summary ===")
print(f"Total records: {merged_row_count:,}")
print(f"Date range: {merged_years.min()}-{merged_years.max()}")
print(f"Unique neighborhoods: {df_merged_monthly['neighborhood'].nunique()}")
print(f"Unique complaint types: {df_merged_monthly['complaint_type'].nunique()}")

# Share of non-null values per column, in percent
print("\n=== Data Completeness ===")
non_null_counts = df_merged_monthly.notna().sum()
completeness = (non_null_counts / merged_row_count * 100).round(2)
print(completeness)

# Total complaints per neighborhood, largest first
print("\n=== Top 10 Neighborhoods by Complaint Volume ===")
volume_by_neighborhood = df_merged_monthly.groupby('neighborhood')['complaint_count'].sum()
top_neighborhoods = volume_by_neighborhood.sort_values(ascending=False).head(10)
print(top_neighborhoods)

### Export Final Dataset

In [None]:
# Write the merged monthly table to CSV; the row index is omitted from the file
output_path = 'data/data_snapshot_for_gdv.csv'
df_merged_monthly.to_csv(output_path, index=False)

exported_shape = df_merged_monthly.shape
print(f"Final dataset exported to: {output_path}")
print(f"Dataset shape: {exported_shape}")
print("\nData wrangling pipeline completed successfully!")