In [84]:
import pandas as pd
from scipy.stats import zscore
import numpy as np

# Load the combined dataset that every later cell works on.
# NOTE(review): a later cell writes its result back to this same path with
# to_csv, so the file read here may already have been modified by a previous run.
df = pd.read_csv(filepath_or_buffer="combined_data.csv")

In [85]:
# List every column name, one per line, to get an overview of the schema.
columns = df.columns.to_list()
for column_name in columns:
    print(column_name)

year
incident_hour
city
population
offense_category_name
criminal_act_name
location_area


In [86]:
# Summary of the offense category column: count, unique, top and freq.
df.loc[:, 'offense_category_name'].describe()

count                      33302
unique                        12
top       Drug/Narcotic Offenses
freq                       12127
Name: offense_category_name, dtype: object

In [87]:
# Numeric columns used for z-score based outlier detection.
features_considered = ["year", "incident_hour", "population"]
# Standardise each column independently (axis=0); NaNs are left out of the
# mean/std computation because of nan_policy="omit".
numeric_features = df.loc[:, features_considered]
z_scores = zscore(numeric_features, axis=0, nan_policy="omit")

In [88]:
# Wrap the z-scores in a DataFrame labelled with the feature names.
z_scores_df = pd.DataFrame(data=z_scores, columns=features_considered)


In [89]:
# Preview the first five rows of standardised values.
z_scores_df.head(n=5)

Unnamed: 0,year,incident_hour,population
0,-0.984505,0.760692,-1.583412
1,-0.984505,-0.175194,-1.473851
2,-0.984505,-0.175194,-1.442791
3,-0.984505,-0.955099,-1.442791
4,-0.984505,1.072654,-1.442791


In [90]:
# Only the magnitude of a z-score matters for outlier detection.
abs_z_scores = np.absolute(z_scores)
# A row is kept when every feature lies within 3 standard deviations of its
# column mean. NOTE: a NaN z-score fails the `< 3` comparison, so a row with a
# missing feature value is excluded here as well.
within_three_sigma = (abs_z_scores < 3).all(axis=1)
# Rows without an offense category are excluded too.
has_offense_category = df['offense_category_name'].notna()
filtered_entries = within_three_sigma & has_offense_category


In [91]:
# Display the boolean keep-mask (True = the row is retained by the filter).
filtered_entries

0        True
1        True
2        True
3        True
4        True
         ... 
33297    True
33298    True
33299    True
33300    True
33301    True
Name: offense_category_name, Length: 33302, dtype: bool

In [92]:
# Rows rejected by the filter are the complement of the keep-mask.
outlier_mask = ~filtered_entries
print("Number of outliers:", outlier_mask.sum())
print(df.loc[outlier_mask])


Number of outliers: 0
Empty DataFrame
Columns: [year, incident_hour, city, population, offense_category_name, criminal_act_name, location_area]
Index: []


In [93]:
# Apply the keep-mask and report how many rows survive the filtering.
print('Number of rows before filtering outliers', len(df))
# .copy() gives the filtered frame its own data, so modifying it later cannot
# raise SettingWithCopyWarning or be mistaken for a change to `df`.
df_without_outliers = df[filtered_entries].copy()
print('Number of rows after filtering outliers', len(df_without_outliers))
df_without_outliers.head()

Number of rows before filtering outliers 33302
Number of rows after filtering outliers 33302


Unnamed: 0,year,incident_hour,city,population,offense_category_name,criminal_act_name,location_area
0,2022,18,Ledyard,15340,Animal Cruelty,Intentional Abuse and Torture,Drug Store/Doctor's Office/Hospital
1,2022,12,Berlin,20109,Animal Cruelty,Intentional Abuse and Torture,Residence/Home
2,2022,12,Bloomfield,21461,Animal Cruelty,Simple/Gross Neglect,Farm Facility
3,2022,7,Bloomfield,21461,Animal Cruelty,Simple/Gross Neglect,Residence/Home
4,2022,20,Bloomfield,21461,Animal Cruelty,Intentional Abuse and Torture,Residence/Home


In [94]:
# Count, per city, the incidents whose hour lies more than 2 standard
# deviations from the mean incident hour.
extreme_hour_mask = z_scores_df['incident_hour'].abs() > 2
df.loc[extreme_hour_mask, 'city'].value_counts()


city
New Britain    220
Hartford       172
Stamford       153
New Haven      150
Waterbury       85
              ... 
Stonington       1
Torrington       1
Glastonbury      1
Suffield         1
Wilton           1
Name: count, Length: 65, dtype: int64

In [98]:
# Remove the population column before exporting.
# Rebinding the name (instead of inplace=True) together with errors="ignore"
# keeps this cell re-runnable: a second execution is a no-op rather than a
# KeyError because the column is already gone.
df_without_outliers = df_without_outliers.drop(columns=["population"], errors="ignore")
# NOTE(review): this overwrites the same file that the first cell loads with
# pd.read_csv. After it runs, a fresh run of the notebook reads a file without
# `population`, which the z-score cell still expects. Consider writing to a
# separate output file once downstream consumers of this path are confirmed.
df_without_outliers.to_csv("combined_data.csv", index=False)




In [99]:
# Display the frame after the `population` column has been dropped.
df_without_outliers

Unnamed: 0,year,incident_hour,city,offense_category_name,criminal_act_name,location_area
0,2022,18,Ledyard,Animal Cruelty,Intentional Abuse and Torture,Drug Store/Doctor's Office/Hospital
1,2022,12,Berlin,Animal Cruelty,Intentional Abuse and Torture,Residence/Home
2,2022,12,Bloomfield,Animal Cruelty,Simple/Gross Neglect,Farm Facility
3,2022,7,Bloomfield,Animal Cruelty,Simple/Gross Neglect,Residence/Home
4,2022,20,Bloomfield,Animal Cruelty,Intentional Abuse and Torture,Residence/Home
...,...,...,...,...,...,...
33297,2023,17,Groton Town,Weapon Law Violations,Possessing/Concealing,Parking/Drop Lot/Garage
33298,2023,19,Groton Town,Weapon Law Violations,Possessing/Concealing,Residence/Home
33299,2023,12,Mashantucket Pequot Tribal,Weapon Law Violations,Possessing/Concealing,Restaurant
33300,2023,1,Mashantucket Pequot Tribal,Weapon Law Violations,Using/Consuming,Parking/Drop Lot/Garage
