In [22]:
# in wprdc there are 66485 entries, but somehow the ArrestData.csv only has 32001 entries, so i retrieved again using the API 
# to make sure that we have all the updated data
import requests
import pandas as pd

base_url = "https://data.wprdc.org/api/3/action/datastore_search"
resource_id = "e03a89dd-134a-4ee8-a2bd-62c40aeebc6f"
limit = 1000
offset = 0
all_records = []

while True:
    params = {
        "resource_id": resource_id,
        "limit": limit,
        "offset": offset
    }
    response = requests.get(base_url, params=params)
    data = response.json()
    records = data["result"]["records"]
    if not records:
        break
    all_records.extend(records)
    offset += limit

df = pd.DataFrame(all_records)


In [2]:
# group by year by neighborhood
import pandas as pd

df = pd.read_csv("ArrestData_updated.csv", parse_dates=["ARRESTTIME"])
df["YEAR"] = df["ARRESTTIME"].dt.year
summary = df.groupby(["INCIDENTNEIGHBORHOOD", "YEAR"]).size().reset_index(name="ARREST_COUNT")
summary = summary.sort_values(by=["INCIDENTNEIGHBORHOOD", "YEAR"])
summary.to_csv("cleaned_arrested.csv")
print(summary)

    INCIDENTNEIGHBORHOOD  YEAR  ARREST_COUNT
0       Allegheny Center  2016            82
1       Allegheny Center  2017           216
2       Allegheny Center  2018           161
3       Allegheny Center  2019           239
4       Allegheny Center  2020           155
..                   ...   ...           ...
784              Windgap  2019            15
785              Windgap  2020            17
786              Windgap  2021            13
787              Windgap  2022            20
788              Windgap  2023            13

[789 rows x 3 columns]


In [3]:
# i tried to calculate the correlation but got high p-values, showing that the result is not statistically significant, so i 
# switch to calculate the total counts, and assign acores for each neighborhoods.
from scipy.stats import spearmanr

results = []

for hood in summary["INCIDENTNEIGHBORHOOD"].unique():
    sub_df = summary[summary["INCIDENTNEIGHBORHOOD"] == hood]

    if len(sub_df) >= 4:
        rho, p = spearmanr(sub_df["YEAR"], sub_df["ARREST_COUNT"])
        results.append({
            "Neighborhood": hood,
            "Spearman_rho": round(rho, 3),
            "p_value": round(p, 4),
            "Years of data": len(sub_df)
        })

spearman_df = pd.DataFrame(results).sort_values(by="Spearman_rho", ascending=False)
spearman_df["check_pval"] = spearman_df["p_value"]<0.05
spearman_df.to_csv("see.csv")
print(spearman_df.head(10)) 




                 Neighborhood  Spearman_rho  p_value  Years of data  \
16  Central Business District         0.584   0.0765             10   
15                    Carrick         0.532   0.0920             11   
14       California-Kirkbride         0.502   0.1397             10   
7                   Beechview         0.491   0.1252             11   
70                  Ridgemont         0.439   0.2763              8   
18         Central North Side         0.406   0.2443             10   
28               East Liberty         0.401   0.2505             10   
75           South Side Flats         0.401   0.2505             10   
93                    Windgap         0.361   0.3393              9   
88                 Upper Hill         0.333   0.3466             10   

    check_pval  
16       False  
15       False  
14       False  
7        False  
70       False  
18       False  
28       False  
75       False  
93       False  
88       False  


In [4]:
# here's the score!
result = df.groupby("INCIDENTNEIGHBORHOOD").size().reset_index(name="ARREST_COUNT")
result["score"] = ((6648-result["ARREST_COUNT"])/6648)*100
result = result.sort_values(by="score", ascending=False)

print(result)


         INCIDENTNEIGHBORHOOD  ARREST_COUNT      score
57    Mt. Oliver Neighborhood             2  99.969916
91     Troy Hill-Herrs Island             6  99.909747
56            Mt. Oliver Boro            18  99.729242
19          Central Northside            23  99.654031
72              Regent Square            37  99.443442
..                        ...           ...        ...
43             Homewood South          2090  68.561974
26             East Allegheny          2217  66.651625
15                    Carrick          2352  64.620939
78           South Side Flats          3497  47.397714
16  Central Business District          4440  33.212996

[98 rows x 3 columns]


In [None]:
# somehow we got 97 neighborhoods...This is a REALLY messy dataset...
mine=pd.read_csv("/Users/mkay11/Catnip-final-project-1/Makayla - 311 data analysis/311_counts_by_neighborhood.csv")
array1 = mine["hood"].unique()

In [6]:
this = pd.read_csv("cleaned_arrested.csv")
array2 = this["INCIDENTNEIGHBORHOOD"].unique()


In [7]:
# Got super weird names...What in the world is "Friendship" doing here?
import numpy as np
diff2 = np.setdiff1d(array2, array1)
print("In array2 but not array1:", diff2)


In array2 but not array1: ['Central North Side' 'Chartiers City' 'Friendship'
 'Golden Triangle/Civic Arena' 'Mount Oliver' 'Mt. Oliver Boro'
 'Mt. Oliver Neighborhood' 'Outside City' 'Outside County' 'Outside State'
 'Troy Hill-Herrs Island']


In [8]:
diff1 = np.setdiff1d(array1, array2)
print("In array1 but not array2:", diff1)

In array1 but not array2: ['Mt. Oliver']


In [9]:
# merge all Mt.Oliver, drop other names
neighborhoods_to_drop = [
    "Central North Side", "Chartiers City", "Friendship",
    "Golden Triangle/Civic Arena", "Outside City", "Outside County",
    "Outside State", "Troy Hill-Herrs Island"
]
result = result[~result["INCIDENTNEIGHBORHOOD"].isin(neighborhoods_to_drop)].copy()
name_map = {
    "Mount Oliver": "Mt. Oliver",
    "Mt. Oliver Boro": "Mt. Oliver",
    "Mt. Oliver Neighborhood": "Mt. Oliver",
}
result["INCIDENTNEIGHBORHOOD"] = result["INCIDENTNEIGHBORHOOD"].replace(name_map)







In [None]:
# recalculating for Mt.Oliver
result = result.groupby("INCIDENTNEIGHBORHOOD", as_index=False).agg({
    "ARREST_COUNT": "sum"
})
result["score"] = ((6648 - result["ARREST_COUNT"]) / 6648) * 100

result = result.sort_values(by="score",ascending=False)
result.to_csv("result.csv")

print(result)

         INCIDENTNEIGHBORHOOD  ARREST_COUNT      score
18          Central Northside            23  99.654031
64                  Ridgemont            37  99.443442
63              Regent Square            37  99.443442
52              New Homestead            39  99.413357
79             Swisshelm Park            43  99.353189
..                        ...           ...        ...
39             Homewood South          2090  68.561974
24             East Allegheny          2217  66.651625
15                    Carrick          2352  64.620939
69           South Side Flats          3497  47.397714
16  Central Business District          4440  33.212996

[88 rows x 3 columns]
