In [None]:
import pandas as pd

In [None]:
# Read the two input CSVs. The 311 sample uses its "unique_key" column as
# the row index; the rent table keeps the default integer index.
df_nyc_311 = pd.read_csv(
    'data/nyc_311_2024_2025_sample.csv',
    index_col="unique_key",
)
df_median_rent = pd.read_csv(
    'data/medianAskingRent_All.csv',
)

In [None]:
# (rows, columns) of each raw frame, one per line: 311 data first, then rent data.
print(df_nyc_311.shape, df_median_rent.shape, sep="\n")

In [None]:
# Preview the first five rows of the rent table.
df_median_rent.head(n=5)

In [None]:
# Preview the first five rows of the 311 data (five is the default for head()).
df_nyc_311.head()

In [None]:
# Columns of the 311 data that are kept for the rest of the notebook.
list_of_relevant_columns = ['created_date', 'closed_date', 'complaint_type',
                            'descriptor', 'status', 'resolution_description',
                            'resolution_action_updated_date', 'borough',
                            'community_board', 'incident_zip',
                            'incident_address', 'street_name', 'city',
                            'latitude', 'longitude']

# Take an explicit copy (as is done for the rent frame) so that later column
# assignments act on an independent frame instead of a slice of df_nyc_311,
# which would otherwise be flagged with SettingWithCopyWarning.
df_nyc_311_selected = df_nyc_311[list_of_relevant_columns].copy()

In [None]:
# Keep the first three columns of the rent table plus every column whose
# name starts with "2024" or "2025".
date_columns = [
    name for name in df_median_rent.columns
    if name.startswith(('2024', '2025'))
]
leading_columns = df_median_rent.columns[:3].to_list()
df_median_rent_selected = df_median_rent[leading_columns + date_columns].copy()

In [None]:
# Preview the first five rows of the column-reduced 311 frame.
df_nyc_311_selected.head(n=5)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
df_nyc_311_selected.describe(
    include='all',
)

In [None]:
# Analysis of missing values with percentages.
# The null counts are computed and sorted once; the percentages are derived
# from that same Series, so both columns share one index order and the
# resulting table is guaranteed to stay sorted by missing count.
total_rows = len(df_nyc_311_selected)
missing_values = df_nyc_311_selected.isna().sum().sort_values(ascending=False)
missing_percentage = missing_values / total_rows * 100

# Creating a DataFrame for convenient display
missing_data = pd.DataFrame({
    'Missing_Count': missing_values,
    'Missing_Percentage': missing_percentage
})

# Only show columns with empty values
missing_data = missing_data[missing_data['Missing_Count'] > 0]

# Report the row count of the same frame the percentages are based on.
print(f"Total number of rows in the dataset: {total_rows}")
print("\nAnalysis of empty values:")
print(missing_data.round(2))

In [None]:
# Number of rows that are exact repeats of an earlier row, per frame.
rent_duplicate_count = df_median_rent_selected.duplicated().sum()
print("Duplicate rows in rent data:", rent_duplicate_count)

complaint_duplicate_count = df_nyc_311_selected.duplicated().sum()
print("Duplicate rows in 311 data:", complaint_duplicate_count)

In [None]:
# Drop rows that are identical across all columns (the first occurrence is
# kept), then display the resulting shape.
df_nyc_311_selected = df_nyc_311_selected.drop_duplicates(keep='first')
df_nyc_311_selected.shape

In [None]:
# Convert date columns to datetime format; values that cannot be parsed
# become NaT (errors='coerce').
# assign() builds a new frame, so no column is written into a frame that
# pandas may still treat as a slice of an earlier one.
datetime_columns = ['created_date', 'closed_date', 'resolution_action_updated_date']
df_nyc_311_selected = df_nyc_311_selected.assign(**{
    name: pd.to_datetime(df_nyc_311_selected[name], errors='coerce')
    for name in datetime_columns
})

# Rows whose creation timestamp is later than their closing timestamp.
created_after_closed = df_nyc_311_selected['created_date'] > df_nyc_311_selected['closed_date']
print(f"Number of rows with created_date > closed_date, that will be removed: {int(created_after_closed.sum())}")

# Keep rows closed on or after creation, plus rows without a closed_date.
# Comparisons against NaT are False, so a row that has a closed_date but no
# created_date is dropped here as well (and is not part of the count above).
keep_mask = (
    (df_nyc_311_selected['created_date'] <= df_nyc_311_selected['closed_date'])
    | df_nyc_311_selected['closed_date'].isna()
)
# .copy() makes the filtered result an independent frame for later assignments.
df_nyc_311_selected = df_nyc_311_selected[keep_mask].copy()

In [None]:
# Print the value distribution of every column that has fewer than 1000
# distinct values.
unique_counts = df_nyc_311_selected.nunique()
columns_to_check = [name for name, n_unique in unique_counts.items() if n_unique < 1000]

for col in columns_to_check:
    counts = df_nyc_311_selected[col].value_counts()
    # Header, the counts, then a "\n" string: same text as three separate prints.
    print(f"Value counts for column '{col}':", counts, "\n", sep="\n")

In [None]:
# Hand-picked replacement for the automatically derived column list.
columns_to_check = [
    'complaint_type',
    'descriptor',
    'resolution_description',
    'community_board',
    'city',
]

In [None]:
# Normalise 'city': strip surrounding whitespace and upper-case the text.
city_normalised = df_nyc_311_selected['city'].str.strip().str.upper()

# Exact matches of these names are collapsed into the single label 'OUTSIDE NYC'.
# NOTE(review): Breezy Point is commonly listed as a Queens neighborhood -
# confirm it really belongs in this list.
outside = ['FLORAL PARK', 'NEW HYDE PARK', 'BREEZY POINT']
df_nyc_311_selected['city'] = city_normalised.replace(outside, 'OUTSIDE NYC')

In [None]:
import json

# Load the local JSON file that maps zip codes to neighborhoods.
# The code below reads it as {borough: [{"neighborhood": ..., "zip_codes": [...]}, ...]}.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale.
with open('nyc_uhf_zipcodes.json', 'r', encoding='utf-8') as f:
    uhf_data = json.load(f)

# Dictionary for mapping zip codes to neighborhoods.
# Keys are normalised to zero-padded 5-character strings, the same form the
# 311 zip codes are converted to before the lookup, so the match works whether
# the JSON stores zip codes as strings or as numbers. If a zip code is listed
# more than once, the last occurrence wins.
zip_to_neighborhood = {
    str(zip_code).zfill(5): neighborhood_info['neighborhood']
    for neighborhoods in uhf_data.values()
    for neighborhood_info in neighborhoods
    for zip_code in neighborhood_info['zip_codes']
}

In [None]:
# Turn incident_zip into zero-padded 5-character strings in a temporary
# Series (rows without a zip stay missing) and look up the neighborhood.
raw_zip = df_nyc_311_selected['incident_zip']
zip_as_text = (
    raw_zip.fillna(0).astype(int).astype(str).str.zfill(5)
    .where(raw_zip.notna())
)
df_nyc_311_selected['neighborhood'] = zip_as_text.map(zip_to_neighborhood)

# Check the results
neighborhood_values = df_nyc_311_selected['neighborhood']
mapped_count = neighborhood_values.notna().sum()
unmapped_count = neighborhood_values.isna().sum()
coverage_percentage = mapped_count / len(df_nyc_311_selected) * 100

print("Mapping results:")
print(f"Number of records from neighborhood: {mapped_count}")
print(f"Number of records without neighborhood: {unmapped_count}")
print(f"Coverage percentage: {coverage_percentage:.2f}%")

In [None]:
# Count complaints per (neighborhood, complaint_type) pair, ordered by
# neighborhood and, within each neighborhood, by descending count.
complaints_by_neighborhood = (
    df_nyc_311_selected
    .groupby(['neighborhood', 'complaint_type'])
    .size()
    .reset_index(name='counts')
    .sort_values(by=['neighborhood', 'counts'], ascending=[True, False])
)

In [None]:
# Number of distinct neighborhood values; a missing value counts as one of them.
df_nyc_311_selected['neighborhood'].nunique(dropna=False)

In [None]:
# Number of distinct areaName values; a missing value counts as one of them.
df_median_rent_selected['areaName'].nunique(dropna=False)

In [None]:
# Load the mapping of areaName to neighborhood.
# The encoding is given explicitly so the read does not depend on the
# platform's default locale.
with open('manual_map.json', 'r', encoding='utf-8') as f:
    manual_map = json.load(f)

# areaName is lower-cased before the lookup; names without an entry in
# manual_map get a missing neighborhood.
df_median_rent_selected['neighborhood'] = df_median_rent_selected['areaName'].str.lower().map(manual_map)

In [None]:
# Rent statistics: first per row across the 2024/2025 columns, then
# averaged over all rows that share a neighborhood.
date_columns = [
    name for name in df_median_rent_selected.columns
    if name.startswith(('2024', '2025'))
]

# Row-wise mean, median and standard deviation over the selected columns.
rent_by_period = df_median_rent_selected[date_columns]
df_median_rent_selected['annual_avg_rent'] = rent_by_period.mean(axis=1)
df_median_rent_selected['annual_median_rent'] = rent_by_period.median(axis=1)
df_median_rent_selected['annual_std_rent'] = rent_by_period.std(axis=1)

# Mean of each row-level statistic per neighborhood.
rent_stat_columns = ['annual_avg_rent', 'annual_median_rent', 'annual_std_rent']
neighborhood_rent_stats = (
    df_median_rent_selected
    .groupby('neighborhood')[rent_stat_columns]
    .mean()
    .reset_index()
)

In [None]:
# Attach the rent statistics to every complaint-count row. The left join
# keeps rows whose neighborhood has no rent statistics.
df_merged = complaints_by_neighborhood.merge(
    neighborhood_rent_stats,
    on='neighborhood',
    how='left',
)
df_merged