In [5]:
import pandas as pd
from scipy.stats import zscore
import numpy as np

df = pd.read_csv("combined_data.csv")

In [6]:
columns = df.columns.tolist()
for i in columns:
    print(i)

year
date
hour
city
population
offense_category_name
criminal_act_name
location_area


In [7]:
df['offense_category_name'].describe()

count                      52712
unique                        12
top       Drug/Narcotic Offenses
freq                       21347
Name: offense_category_name, dtype: object

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52712 entries, 0 to 52711
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   year                   52712 non-null  int64 
 1   date                   52712 non-null  object
 2   hour                   52712 non-null  int64 
 3   city                   52712 non-null  object
 4   population             52712 non-null  int64 
 5   offense_category_name  52712 non-null  object
 6   criminal_act_name      52712 non-null  object
 7   location_area          52712 non-null  object
dtypes: int64(3), object(5)
memory usage: 3.2+ MB


In [None]:
features_considered = ['year','hour', 'population']
z_scores = zscore(df[features_considered], nan_policy='omit')

TypeError: unsupported operand type(s) for /: 'str' and 'int'

In [None]:
z_scores_df = pd.DataFrame(z_scores, columns=features_considered)


In [None]:
z_scores_df.head()

Unnamed: 0,year,incident_hour,population
0,0.001215,0.748509,-0.813629
1,0.001215,-0.340818,-0.813629
2,0.001215,-0.1852,-0.813629
3,0.001215,0.281655,-0.813629
4,0.001215,0.281655,-0.813629


In [None]:
# convert to absolute values
abs_z_scores = np.abs(z_scores)
# filter based on |z-score| less than 3
filtered_entries = (abs_z_scores < 3).all(axis=1) & df['offense_category_name'].notna()


In [None]:
filtered_entries

0         True
1         True
2         True
3         True
4         True
         ...  
52707    False
52708    False
52709    False
52710    False
52711    False
Name: offense_category_name, Length: 52712, dtype: bool

In [None]:
print("Number of outliers:", (~filtered_entries).sum())
print(df.loc[~filtered_entries])


Number of outliers: 3365
       year incident_date  incident_hour                      city  \
16864  2022    2022-01-03              8  Connecticut State Police   
16865  2022    2022-01-03             12  Connecticut State Police   
16866  2022    2022-01-04             11  Connecticut State Police   
16867  2022    2022-01-04             11  Connecticut State Police   
16868  2022    2022-01-04             19  Connecticut State Police   
...     ...           ...            ...                       ...   
52707  2021    2021-12-30             19  Connecticut State Police   
52708  2021    2021-12-30             19  Connecticut State Police   
52709  2021    2021-12-30             20  Connecticut State Police   
52710  2021    2021-12-30             20  Connecticut State Police   
52711  2021    2021-07-02              0  Connecticut State Police   

       population   offense_category_name  \
16864      509306  Drug/Narcotic Offenses   
16865      509306  Drug/Narcotic Offenses   

In [None]:
print('Number of rows before filtering outliers', len(df))
df_without_outliers = df[filtered_entries]
print('Number of rows after filtering outliers', len(df_without_outliers))
df_without_outliers.head()

Number of rows before filtering outliers 52712
Number of rows after filtering outliers 49347


Unnamed: 0,year,incident_date,incident_hour,city,population,offense_category_name,criminal_act_name,location_area
0,2022,2022-01-02,18,Ansonia,18750,Weapon Law Violations,Using/Consuming,Residence/Home
1,2022,2022-01-12,11,Ansonia,18750,Counterfeiting/Forgery,Possessing/Concealing,Service/Gas Station
2,2022,2022-01-12,12,Ansonia,18750,Drug/Narcotic Offenses,Possessing/Concealing,Highway/Road/Alley/Street/Sidewalk
3,2022,2022-01-16,15,Ansonia,18750,Weapon Law Violations,Possessing/Concealing,Government/Public Building
4,2022,2022-01-27,15,Ansonia,18750,Counterfeiting/Forgery,Cultivating/Manufacturing/Publishing,Bank/Savings and Loan


In [None]:
df.loc[z_scores_df['incident_hour'].abs() > 2]['city'].value_counts()


city
New Britain    343
Stamford       267
New Haven      260
Hartford       200
Derby          121
              ... 
Monroe           2
Wolcott          1
Cromwell         1
Avon             1
Wilton           1
Name: count, Length: 74, dtype: int64

In [None]:
df_without_outliers.drop(columns=["population"], inplace=True)
df_without_outliers.to_csv("combined_data.csv", index=False)
print()




A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_without_outliers.drop(columns=["population"], inplace=True)


In [None]:
df_without_outliers

Unnamed: 0,year,incident_date,incident_hour,city,offense_category_name,criminal_act_name,location_area
0,2022,2022-01-02,18,Ansonia,Weapon Law Violations,Using/Consuming,Residence/Home
1,2022,2022-01-12,11,Ansonia,Counterfeiting/Forgery,Possessing/Concealing,Service/Gas Station
2,2022,2022-01-12,12,Ansonia,Drug/Narcotic Offenses,Possessing/Concealing,Highway/Road/Alley/Street/Sidewalk
3,2022,2022-01-16,15,Ansonia,Weapon Law Violations,Possessing/Concealing,Government/Public Building
4,2022,2022-01-27,15,Ansonia,Counterfeiting/Forgery,Cultivating/Manufacturing/Publishing,Bank/Savings and Loan
...,...,...,...,...,...,...,...
51428,2021,2021-12-04,1,Groton Town,Drug/Narcotic Offenses,Using/Consuming,Highway/Road/Alley/Street/Sidewalk
51429,2021,2021-12-07,10,Groton Town,Weapon Law Violations,Possessing/Concealing,School/College
51430,2021,2021-12-12,19,Groton Town,Drug/Narcotic Offenses,Possessing/Concealing,Highway/Road/Alley/Street/Sidewalk
51431,2021,2021-12-12,19,Groton Town,Drug/Narcotic Offenses,Possessing/Concealing,Highway/Road/Alley/Street/Sidewalk
