# NYC Soundscape — EDA
**Dataset**: SONYC-UST v2.3 · [Zenodo 3966543](https://zenodo.org/records/3966543) · CC BY 4.0

Quick orientation on the annotations CSV before building the visualization pipeline.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import seaborn as sns
from pathlib import Path

# Dark theme applied to every figure created after this point. The settings
# are listed as (rc key, value) pairs and applied in a single update call.
plt.rcParams.update(dict([
    ('figure.facecolor', '#070810'),
    ('axes.facecolor', '#0d0f1a'),
    ('axes.edgecolor', '#1e2240'),
    ('axes.labelcolor', '#e8eaf6'),
    ('xtick.color', '#5a5f8a'),
    ('ytick.color', '#5a5f8a'),
    ('text.color', '#e8eaf6'),
    ('grid.color', '#1e2240'),
    ('grid.linewidth', 0.5),
]))

# Plot colour for each coarse sound class, keyed by the short class name.
# Insertion order matters: loops over SC draw the series in this order.
SC = dict([
    ('engine', '#ff6b6b'),
    ('machinery', '#ff9f43'),
    ('impact', '#ffd32a'),
    ('saw', '#26de81'),
    ('alert', '#fd79a8'),
    ('music', '#a29bfe'),
    ('voice', '#74b9ff'),
    ('dog', '#55efc4'),
])

# Coarse presence column in the annotations table -> short class name.
# The short names are the keys of SC, in the same order.
COARSE_MAP = dict([
    ('1_engine_presence', 'engine'),
    ('2_machinery-impact_presence', 'machinery'),
    ('3_non-machinery-impact_presence', 'impact'),
    ('4_powered-saw_presence', 'saw'),
    ('5_alert-signal_presence', 'alert'),
    ('6_music_presence', 'music'),
    ('7_human-voice_presence', 'voice'),
    ('8_dog_presence', 'dog'),
])

# Borough code (as a string) -> display name; codes not listed here stay unmapped.
BOROUGH_NAMES = dict(zip(('1', '3', '4'), ('Manhattan', 'Brooklyn', 'Queens')))

# Load the annotations table and add the two derived columns the later cells
# rely on: `borough_name` and `annotator_type`.
CSV = Path('data/metadata/annotations.csv')
df = pd.read_csv(CSV, low_memory=False)

# Match boroughs on their numeric code instead of on str(value). If pandas
# parses the column as float (for example because a value is missing),
# str() yields '1.0', which matches no key of BOROUGH_NAMES and would silently
# turn every borough name into NaN. Unparseable codes become NaN here too.
borough_codes = pd.to_numeric(df['borough'], errors='coerce')
df['borough_name'] = borough_codes.map(
    {int(code): name for code, name in BOROUGH_NAMES.items()}
)

# Annotator source derived from the id: 0 -> 'ground_truth', negative ->
# 'sonyc_team', anything else -> 'zooniverse'. Vectorised so no Python
# function is called per row.
annotator_ids = df['annotator_id']
df['annotator_type'] = np.select(
    [annotator_ids == 0, annotator_ids < 0],
    ['ground_truth', 'sonyc_team'],
    default='zooniverse',
)

print(f'Rows: {len(df):,}   Columns: {df.shape[1]}')
df.head(3)

## 1. Schema overview

In [None]:
# Split the presence columns into coarse and fine families by their numeric
# prefix, i.e. the text before the first underscore. Per the SONYC-UST column
# naming, fine-level columns have a '<coarse>-<fine>' prefix (e.g. '1-1') and
# coarse-level columns a bare number (e.g. '1'). The hyphen test must be
# limited to that prefix: several coarse class names are themselves hyphenated
# ('2_machinery-impact_presence'), so looking for '-' anywhere in the name
# would file them under the fine-level columns.
presence_cols = [c for c in df.columns if '_presence' in c and c[0].isdigit()]
fine_presence = [c for c in presence_cols if '-' in c.split('_', 1)[0]]
coarse_presence = [c for c in presence_cols if '-' not in c.split('_', 1)[0]]
proximity_cols = [c for c in df.columns if '_proximity' in c]

print(f'Coarse presence cols : {len(coarse_presence)}')
print(f'Fine presence cols   : {len(fine_presence)}')
print(f'Proximity cols       : {len(proximity_cols)}')
print()

# Value distribution per coarse class: one row per class, one column per
# distinct value found in the data; combinations that never occur count as 0.
val_counts = {
    c.replace('_presence', ''): df[c].value_counts().to_dict()
    for c in coarse_presence
}

vc_df = pd.DataFrame(val_counts).T.fillna(0).astype(int)
vc_df.columns = [str(c) for c in vc_df.columns]
print('Value counts (0=absent, 1=present, -1=unlabeled by this annotator):')
vc_df

## 2. Coverage

In [None]:
# Row counts for each categorical field, printed as heading / table / blank
# line. The borough table is ordered by borough code, the others by frequency.
breakdowns = (
    ('Split distribution:', 'split', False),
    ('Borough (1=Manhattan 3=Brooklyn 4=Queens):', 'borough', True),
    ('Annotator type:', 'annotator_type', False),
)
for heading, column, order_by_code in breakdowns:
    print(heading)
    counts = df[column].value_counts()
    if order_by_code:
        counts = counts.sort_index()
    print(counts.to_string())
    print()

# Scalar coverage figures: distinct sensors and clips, then the min/max of the
# year and hour columns.
print(f"Unique sensors : {df['sensor_id'].nunique()}")
print(f"Unique clips   : {df['audio_filename'].nunique()}")
print(f"Year range     : {df['year'].min()} - {df['year'].max()}")
print(f"Hour range     : {df['hour'].min()} - {df['hour'].max()}")

In [None]:
# Heatmap of annotation-row counts, one row per borough and one column per
# hour. Rows whose borough could not be named are left out; borough/hour
# combinations with no rows show as 0.
pivot = (
    df.dropna(subset=['borough_name'])
      .groupby(['borough_name', 'hour'])
      .size()
      .unstack(fill_value=0)
)

fig, ax = plt.subplots(figsize=(14, 3))
sns.heatmap(
    pivot,
    ax=ax,
    cmap='Blues',
    linewidths=0.3,
    cbar_kws={'label': 'annotation rows'},
)
ax.set_title('Annotation count — borough x hour of day', fontsize=13)
# Set after the heatmap call so these labels are the ones shown.
ax.set(xlabel='Hour of day', ylabel='')
fig.tight_layout()
plt.show()

## 3. Sound prevalence

In [None]:
# Coarse presence flags under their short names. The -1 sentinel becomes NaN
# so that the means below skip it instead of treating it as a value.
sounds = list(COARSE_MAP.values())
cp = (
    df[list(COARSE_MAP)]
    .replace(-1, np.nan)
    .rename(columns=COARSE_MAP)
    .copy()
)
# Grouping keys for the per-hour and per-borough cells further down.
cp['hour'] = df['hour']
cp['borough_name'] = df['borough_name']

# Mean of each flag over all rows, most prevalent class first.
overall = cp[sounds].mean().sort_values(ascending=False)

fig, ax = plt.subplots(figsize=(9, 4))
bar_colors = [SC[name] for name in overall.index]
bars = ax.barh(overall.index, overall.values, color=bar_colors, alpha=0.85)
ax.set_title('Overall sound class prevalence', fontsize=13)
ax.set_xlabel('Prevalence rate')
ax.xaxis.set_major_formatter(mticker.PercentFormatter(1.0))
ax.grid(axis='x')

# Percentage label just past the end of each bar, vertically centred on it.
for bar, val in zip(bars, overall.values):
    y_mid = bar.get_y() + bar.get_height()/2
    ax.text(val + 0.003, y_mid, f'{val:.1%}', va='center', fontsize=9)

fig.tight_layout()
plt.show()

In [None]:
# Mean presence of every coarse class per hour, over all rows regardless of
# borough; one line per class, coloured from SC.
hourly = cp.groupby('hour')[sounds].mean()

fig, ax = plt.subplots(figsize=(14, 5))
for sound in SC:
    ax.plot(
        hourly.index,
        hourly[sound],
        label=sound,
        color=SC[sound],
        linewidth=2,
        alpha=0.9,
    )

ax.set_title('Sound prevalence by hour (all boroughs)', fontsize=13)
ax.set_xlabel('Hour of day')
ax.set_ylabel('Prevalence rate')
ax.set_xticks(range(24))
ax.yaxis.set_major_formatter(mticker.PercentFormatter(1.0))
ax.legend(loc='upper left', ncol=4, fontsize=9, framealpha=0.2)
ax.grid()
fig.tight_layout()
plt.show()

In [None]:
# Small multiples: the hourly prevalence curves again, one panel per borough,
# all panels sharing the y axis.
boroughs = ['Manhattan', 'Brooklyn', 'Queens']
fig, axes = plt.subplots(1, 3, figsize=(16, 4), sharey=True)

for ax, bname in zip(axes, boroughs):
    in_borough = cp['borough_name'] == bname
    sub = cp[in_borough].groupby('hour')[sounds].mean()
    for sound in SC:
        ax.plot(
            sub.index,
            sub[sound],
            label=sound,
            color=SC[sound],
            linewidth=2,
            alpha=0.85,
        )
    ax.set_title(bname, fontsize=12)
    ax.set_xlabel('Hour')
    ax.set_xticks(range(0, 24, 3))
    ax.yaxis.set_major_formatter(mticker.PercentFormatter(1.0))
    ax.grid()

axes[0].set_ylabel('Prevalence rate')

# One figure-level legend below the panels, built from the first panel's
# handles (every panel plots the same classes in the same order).
handles, labels = axes[0].get_legend_handles_labels()
fig.legend(
    handles,
    labels,
    loc='lower center',
    bbox_to_anchor=(0.5, -0.08),
    ncol=8,
    fontsize=9,
    framealpha=0.2,
)
fig.suptitle('Sound prevalence by hour — per borough', fontsize=13, y=1.02)
fig.tight_layout()
plt.show()

## 4. Sensor map

In [None]:
# One row per sensor with its coordinates and borough name.
# Incomplete rows are removed BEFORE de-duplicating: the other way round, a
# sensor whose first row happens to lack a coordinate or borough name would
# vanish from the map even though later rows for it are complete.
sensors = (df[['sensor_id', 'latitude', 'longitude', 'borough_name']]
           .dropna()
           .drop_duplicates('sensor_id'))
borough_colors = {'Manhattan': '#74b9ff', 'Brooklyn': '#a29bfe', 'Queens': '#fd79a8'}

fig, ax = plt.subplots(figsize=(7, 8))
# One scatter call per borough so each gets its own colour and a legend entry
# showing its sensor count; boroughs without a colour fall back to white.
for bname, grp in sensors.groupby('borough_name'):
    ax.scatter(grp['longitude'], grp['latitude'],
               c=borough_colors.get(bname, 'white'), s=40, alpha=0.8,
               label=f'{bname} ({len(grp)})', zorder=3)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title(f'SONYC sensor locations ({len(sensors)} sensors)', fontsize=13)
ax.legend(framealpha=0.2)
ax.grid()
plt.tight_layout()
plt.show()

## 5. Fine-grained class breakdown

In [None]:
# Fine-level presence flags with the -1 sentinel blanked out, then the mean of
# each column ordered from least to most prevalent.
fine = df[fine_presence].replace(to_replace=-1, value=np.nan)
fine_prev = fine.mean().sort_values()

def coarse_color(col_name):
    """Return the SC colour of the coarse class that `col_name` belongs to.

    The coarse id is the leading number of the column name: the text before
    the first underscore, cut at the first hyphen (so both '1_...' and
    '1-2_...' give '1'). The id is matched against the numeric prefixes of
    the COARSE_MAP keys, so this function cannot drift out of step with that
    table. Returns the neutral grey '#888' when the id matches no coarse
    class or the class has no entry in SC.
    """
    coarse_id = col_name.split('_')[0].split('-')[0]
    for coarse_col, short_name in COARSE_MAP.items():
        if coarse_col.split('_')[0] == coarse_id:
            return SC.get(short_name, '#888')
    return '#888'

# Bar colour follows the parent coarse class; the tick label is the column
# name without the '_presence' marker and with underscores shown as spaces.
colors = list(map(coarse_color, fine_prev.index))
labels = [
    column.replace('_presence','').replace('_',' ')
    for column in fine_prev.index
]

fig, ax = plt.subplots(figsize=(9, 10))
ax.barh(labels, fine_prev.values, color=colors, alpha=0.85)
ax.set_title('Fine-grained class prevalence', fontsize=13)
ax.set_xlabel('Prevalence rate')
ax.xaxis.set_major_formatter(mticker.PercentFormatter(1.0))
ax.grid(axis='x')
fig.tight_layout()
plt.show()

## 6. Scratch pad

In [None]:
# Example: engine prevalence by annotator type
def _labelled_mean(flags):
    """Mean of a presence column after dropping its -1 entries."""
    return flags[flags != -1].mean()


eng = df.groupby('annotator_type')['1_engine_presence'].apply(_labelled_mean)
print('Engine prevalence by annotator type:')
print(eng)

In [None]:
# Your turn
