In [None]:
# Importing necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns 
import pandas as pd

In [None]:
# Load the data snapshot CSV (path is relative to the current working directory).
snapshot_path = 'data/data_snapshot_for_gdv.csv'
df_viz = pd.read_csv(snapshot_path)

# Preview the first rows; as the last expression it becomes the cell output.
df_viz.head()

# Which NYC borough is the most comfortable to live in?
Based on data from NYC 311 Service Requests and Median Asking Rent, I aim to explore which borough offers the best living conditions. The analysis considers both the average rent prices in each area and the number and type of complaints reported by residents.

## 1. Where is it most affordable to live?

In [None]:
# Median of `median_rent` per neighborhood, sorted from most to least expensive.
df_viz_rent = (
    df_viz.groupby('neighborhood', as_index=False)['median_rent']
          .median()
          .sort_values(by='median_rent', ascending=False)
)

fig, ax = plt.subplots(figsize=(16, 9))
sns.barplot(
    data=df_viz_rent,
    x="neighborhood",
    y="median_rent",
    # seaborn >= 0.13 deprecates `palette` without `hue`. Colouring by the
    # categorical (x) variable gives the same per-bar colours without the warning.
    hue="neighborhood",
    dodge=False,       # one full-width bar per category (needed on seaborn < 0.13)
    palette="Reds_r",  # darker = more expensive (bars are sorted descending)
    ax=ax,
)
# Older seaborn versions draw a redundant legend once `hue` is set; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.set_title("Median Asking Rent by Neighborhood", fontsize=14, pad=15)
ax.set_xlabel("Neighborhood")
ax.set_ylabel("Median Rent (USD)")
ax.tick_params(axis="x", labelrotation=90)  # neighborhood names are long; rotate to avoid overlap
fig.tight_layout()
plt.show()

## 2. Where do residents complain the most? The Top 10

In [None]:
# Total number of complaints per neighborhood, sorted from most to fewest.
df_viz_complaints = (
    df_viz.groupby('neighborhood', as_index=False)['complaint_count']
          .sum()
          .sort_values(by='complaint_count', ascending=False)
)

fig, ax = plt.subplots(figsize=(16, 9))
sns.barplot(
    data=df_viz_complaints.head(10),  # top 10 only
    y="neighborhood",
    x="complaint_count",
    # seaborn >= 0.13 deprecates `palette` without `hue`. Colouring by the
    # categorical (y) variable gives the same per-bar colours without the warning.
    hue="neighborhood",
    dodge=False,       # one full-width bar per category (needed on seaborn < 0.13)
    palette="Reds_r",  # darker = more complaints (bars are sorted descending)
    ax=ax,
)
# Older seaborn versions draw a redundant legend once `hue` is set; drop it if present.
if ax.get_legend() is not None:
    ax.get_legend().remove()

ax.set_title("Top 10 Neighborhoods by number of complaints", fontsize=14, pad=15)
ax.set_xlabel("Number of Complaints")
ax.set_ylabel("Neighborhood")
fig.tight_layout()
plt.show()

## 3. What are people complaining about the most?

In [None]:
TOP_N = 5  # number of complaint types to show (e.g. 5 or 10)

# 1) Aggregate so every (complaint type, neighborhood) pair occurs exactly once;
#    `DataFrame.pivot` below raises on duplicate index/column pairs.
df_counts = (
    df_viz.groupby(["complaint_type", "neighborhood"], as_index=False)["complaint_count"]
          .sum()
)

# 2) The TOP_N complaint types by total volume across all neighborhoods.
top_types = (
    df_counts.groupby("complaint_type", as_index=False)["complaint_count"]
             .sum()
             .sort_values("complaint_count", ascending=False)
             .head(TOP_N)["complaint_type"]
)

df_top = df_counts[df_counts["complaint_type"].isin(top_types)]

# 3) Pivot: rows = complaint type, columns = neighborhood.
#    Pairs that never occur become NaN after the pivot; treat them as 0 complaints.
pivot_abs = (
    df_top.pivot(index="complaint_type", columns="neighborhood", values="complaint_count")
          .fillna(0)
)

# 4) Order both axes by total volume, descending:
#    - rows (complaint types)
pivot_abs = pivot_abs.loc[pivot_abs.sum(axis=1).sort_values(ascending=False).index]
#    - columns (neighborhoods)
pivot_abs = pivot_abs.loc[:, pivot_abs.sum(axis=0).sort_values(ascending=False).index]

# 5) Row-wise normalisation: each cell is that neighborhood's share (in %) of the
#    complaint type's total, so every row sums to 100.
pivot_pct = pivot_abs.div(pivot_abs.sum(axis=1), axis=0) * 100

# 6) Heatmap of the percentage shares.
plt.figure(figsize=(16, 9))
sns.heatmap(
    pivot_pct,
    cmap="Reds",
    cbar_kws={"label": "% share per complaint type"},
    linewidths=0.5,
    linecolor="white"
)
plt.title("Top Complaint Types × Neighborhood (% share per type)")
plt.xlabel("Neighborhood")
plt.ylabel("Complaint Type")
plt.tight_layout()
plt.show()

## 4. How are rent and the number of complaints correlated?

## 5. Where on the map is rent the highest and are complaints the most severe?
- TODO: the complaints need to be categorized first.