In [162]:
import pandas as pd
from scipy.stats import zscore
import numpy as np

# Load the dataset from CSV into `df`, the frame every later cell works from.
# NOTE(review): a later cell writes its filtered result back to this same
# path ("combined_data.csv"), so a second run of the notebook starts from the
# already-processed file rather than the original data — confirm this is intended.
df = pd.read_csv("combined_data.csv")

In [163]:
# Print every column name of the loaded frame, one per line.
columns = df.columns.to_list()
for column_name in columns:
    print(column_name)

year
month
day
hour
date
dayofweek
city
population
offense_category_name
location_area


In [164]:
# Summarise the offense category column: non-null count, number of distinct
# values, the most frequent value and how often it occurs.
df['offense_category_name'].describe()

count                     380303
unique                        24
top       Larceny/Theft Offenses
freq                      124962
Name: offense_category_name, dtype: object

In [165]:
# Numeric columns on which outlier detection is based.
features_considered = ["year", "hour", "population"]

# Standardise each selected column (zscore works along axis 0 by default).
# nan_policy="omit" leaves NaN inputs out of the mean/std computation; the
# corresponding output entries remain NaN.
z_scores = zscore(df.loc[:, features_considered], nan_policy="omit")

In [166]:
# Wrap the raw z-score values in a DataFrame with one named column per feature.
# The frame explicitly reuses df's index so that boolean masks derived from
# z_scores_df (e.g. `df.loc[z_scores_df['hour'].abs() > 2]`) align with df by
# label, instead of relying on both frames happening to carry the same default
# RangeIndex. For a freshly read CSV the result is unchanged.
z_scores_df = pd.DataFrame(z_scores, index=df.index, columns=features_considered)


In [167]:
# Preview the first rows of the per-feature z-score table.
z_scores_df.head()

Unnamed: 0,year,hour,population
0,-0.037116,0.053919,-0.743424
1,-0.037116,1.232623,-0.524328
2,-0.037116,1.232623,-0.524328
3,-0.037116,1.232623,-0.524328
4,-0.037116,0.053919,0.662834


In [168]:
# Magnitude of every z-score, regardless of sign.
abs_z_scores = np.absolute(z_scores)

# A row is kept only if ALL considered features lie strictly within 3 standard
# deviations of their mean. A NaN z-score compares False against the
# threshold, so rows with a missing feature value are rejected as well.
all_features_within_3_sigma = (abs_z_scores < 3).all(axis=1)

# Rows must also carry an offense category.
has_offense_category = df["offense_category_name"].notna()

# Final keep-mask: True means the row survives the filter.
filtered_entries = all_features_within_3_sigma & has_offense_category


In [169]:
# Display the boolean keep-mask (True = row passes both the z-score test and
# the offense-category presence check).
filtered_entries

0         True
1         True
2         True
3         True
4         True
          ... 
380298    True
380299    True
380300    True
380301    True
380302    True
Name: offense_category_name, Length: 380303, dtype: bool

In [170]:
# Rows rejected by the keep-mask are the ones reported as outliers here.
outlier_mask = ~filtered_entries

# How many rows were rejected, followed by the rejected rows themselves.
print("Number of outliers:", outlier_mask.sum())
print(df.loc[outlier_mask])


Number of outliers: 13313
        year  month  day  hour        date  dayofweek  \
48      2022      9   29    14  2022-09-29          3   
58      2022     11    6     8  2022-11-06          6   
84      2022      6    3    14  2022-06-03          4   
87      2022      8    4     5  2022-08-04          3   
92      2022      9    9    21  2022-09-09          4   
...      ...    ...  ...   ...         ...        ...   
380137  2021     11   24    21  2021-11-24          2   
380140  2021     12   22     9  2021-12-22          2   
380247  2021      8   25    12  2021-08-25          2   
380253  2021     11   13     0  2021-11-13          5   
380261  2021     12   14    11  2021-12-14          1   

                            city  population   offense_category_name  \
48      Connecticut State Police      509306            Sex Offenses   
58      Connecticut State Police      509306                   Arson   
84      Connecticut State Police      509306   Weapon Law Violations   
8

In [171]:
# Apply the keep-mask and report the row count on either side of the filter.
rows_before = df.shape[0]
df_without_outliers = df.loc[filtered_entries]
rows_after = df_without_outliers.shape[0]

print('Number of rows before filtering outliers', rows_before)
print('Number of rows after filtering outliers', rows_after)

# Preview of the filtered frame.
df_without_outliers.head()

Number of rows before filtering outliers 380303
Number of rows after filtering outliers 366990


Unnamed: 0,year,month,day,hour,date,dayofweek,city,population,offense_category_name,location_area
0,2022,9,22,13,2022-09-22,3,Berlin,20109,Drug/Narcotic Offenses,Abandoned/Condemned Structure
1,2022,6,4,20,2022-06-04,5,Norwich,40096,Weapon Law Violations,Abandoned/Condemned Structure
2,2022,7,10,20,2022-07-10,6,Norwich,40096,Drug/Narcotic Offenses,Abandoned/Condemned Structure
3,2022,7,10,20,2022-07-10,6,Norwich,40096,Drug/Narcotic Offenses,Abandoned/Condemned Structure
4,2022,10,14,13,2022-10-14,4,Bridgeport,148395,Drug/Narcotic Offenses,Abandoned/Condemned Structure


In [172]:
# Rows whose `hour` z-score is more than 2 standard deviations from the mean
# (note: a looser threshold than the 3 used for the keep-mask).
hour_z_beyond_2 = z_scores_df["hour"].abs() > 2

# Count those rows per city, most frequent first.
df.loc[hour_z_beyond_2, "city"].value_counts()


city
Derby          1853
New Haven      1692
Hartford       1337
New Britain     718
Waterbury       681
               ... 
Ridgefield        3
Montville         2
Weston            1
Middlebury        1
Madison           1
Name: count, Length: 96, dtype: int64

In [173]:
# Compute a per-city crime rate (recorded incidents per 1,000 residents) and
# attach it to the filtered frame as `crime_rate_per_1000_people`.
#
# Counts and populations are taken from the unfiltered `df`, so rows that were
# rejected as outliers still contribute to their city's total.
# NOTE(review): the count covers every row in df regardless of `year`, so the
# rate is per dataset period, not per calendar year — confirm that is intended.

# 1. Count number of crimes per city (rows with a missing city are not counted)
crime_counts = (
    df['city']
    .value_counts()
    .rename_axis('city')
    .reset_index(name='crime_count')
)

# 2. Get population per city. This assumes one population value per city;
#    if a city has several, drop_duplicates silently keeps the first one seen.
populations = df[['city', 'population']].drop_duplicates(subset='city')

# 3. Merge counts and population (inner join on city)
city_stats = crime_counts.merge(populations, on='city')

# 4. Calculate crime rate per 1000 residents, rounded to a whole number.
#    The int cast raises if a population is missing or zero (NaN / inf rate).
city_stats['crime_rate_per_1000_people'] = (
    (city_stats['crime_count'] / city_stats['population']) * 1000
).round(0).astype(int)

# 5. Attach the rate to the filtered frame.
#    Any existing copy of the column is dropped first, so re-running this cell
#    (or starting from a CSV that already contains the column) replaces it
#    instead of producing `crime_rate_per_1000_people_x` / `_y` duplicates.
#    validate='many_to_one' makes the merge fail loudly rather than duplicate
#    rows if city_stats ever held more than one row per city.
df_without_outliers = (
    df_without_outliers
    .drop(columns='crime_rate_per_1000_people', errors='ignore')
    .merge(
        city_stats[['city', 'crime_rate_per_1000_people']],
        on='city',
        how='left',
        validate='many_to_one',
    )
)


In [174]:
# Persist the filtered frame, including the crime-rate column, to CSV without
# writing the index.
# NOTE(review): the target is the same path the notebook's first cell reads
# with pd.read_csv, so the input data is overwritten in place and a later
# Restart & Run All will filter the already-filtered file again. Consider
# writing to a separate output file if no downstream consumer depends on this
# path — confirm before changing.
df_without_outliers.to_csv("combined_data.csv", index=False)
# Emits a single blank line in the cell output.
print()




In [175]:
# Display the filtered frame after the crime-rate column has been merged in
# (pandas truncates the rendering to the first and last rows).
df_without_outliers

Unnamed: 0,year,month,day,hour,date,dayofweek,city,population,offense_category_name,location_area,crime_rate_per_1000_people
0,2022,9,22,13,2022-09-22,3,Berlin,20109,Drug/Narcotic Offenses,Abandoned/Condemned Structure,120
1,2022,6,4,20,2022-06-04,5,Norwich,40096,Weapon Law Violations,Abandoned/Condemned Structure,166
2,2022,7,10,20,2022-07-10,6,Norwich,40096,Drug/Narcotic Offenses,Abandoned/Condemned Structure,166
3,2022,7,10,20,2022-07-10,6,Norwich,40096,Drug/Narcotic Offenses,Abandoned/Condemned Structure,166
4,2022,10,14,13,2022-10-14,4,Bridgeport,148395,Drug/Narcotic Offenses,Abandoned/Condemned Structure,106
...,...,...,...,...,...,...,...,...,...,...,...
366985,2021,3,13,16,2021-03-13,5,Stratford,51683,Assault Offenses,Other/Unknown,99
366986,2021,9,5,17,2021-09-05,6,Stratford,51683,Assault Offenses,Other/Unknown,99
366987,2021,12,3,14,2021-12-03,4,Stratford,51683,Larceny/Theft Offenses,Other/Unknown,99
366988,2021,10,27,11,2021-10-27,2,Stratford,51683,Larceny/Theft Offenses,Other/Unknown,99


In [176]:
# List the column names of the filtered frame as a plain Python list.
df_without_outliers.columns.tolist()

['year',
 'month',
 'day',
 'hour',
 'date',
 'dayofweek',
 'city',
 'population',
 'offense_category_name',
 'location_area',
 'crime_rate_per_1000_people']

In [177]:
# Display the filtered frame.
# NOTE(review): same expression as an earlier display cell with no change to
# the frame in between — candidate for removal.
df_without_outliers

Unnamed: 0,year,month,day,hour,date,dayofweek,city,population,offense_category_name,location_area,crime_rate_per_1000_people
0,2022,9,22,13,2022-09-22,3,Berlin,20109,Drug/Narcotic Offenses,Abandoned/Condemned Structure,120
1,2022,6,4,20,2022-06-04,5,Norwich,40096,Weapon Law Violations,Abandoned/Condemned Structure,166
2,2022,7,10,20,2022-07-10,6,Norwich,40096,Drug/Narcotic Offenses,Abandoned/Condemned Structure,166
3,2022,7,10,20,2022-07-10,6,Norwich,40096,Drug/Narcotic Offenses,Abandoned/Condemned Structure,166
4,2022,10,14,13,2022-10-14,4,Bridgeport,148395,Drug/Narcotic Offenses,Abandoned/Condemned Structure,106
...,...,...,...,...,...,...,...,...,...,...,...
366985,2021,3,13,16,2021-03-13,5,Stratford,51683,Assault Offenses,Other/Unknown,99
366986,2021,9,5,17,2021-09-05,6,Stratford,51683,Assault Offenses,Other/Unknown,99
366987,2021,12,3,14,2021-12-03,4,Stratford,51683,Larceny/Theft Offenses,Other/Unknown,99
366988,2021,10,27,11,2021-10-27,2,Stratford,51683,Larceny/Theft Offenses,Other/Unknown,99
