In [31]:
import pandas as pd

# Load each NIBRS extract into its own DataFrame.
df_agencies = pd.read_csv("agencies.csv")
df_act_type = pd.read_csv("NIBRS_CRIMINAL_ACT_TYPE.csv")
df_act = pd.read_csv("NIBRS_CRIMINAL_ACT.csv")
df_incident = pd.read_csv("NIBRS_incident.csv")
df_location = pd.read_csv("NIBRS_LOCATION_TYPE.csv")
df_month = pd.read_csv("NIBRS_month.csv")
df_offense_type = pd.read_csv("NIBRS_OFFENSE_TYPE.csv")
df_offense = pd.read_csv("NIBRS_OFFENSE.csv")

# Build one wide table, starting from the incident table and joining the other
# tables one at a time. Every join is a full outer join, so rows without a
# match on either side are kept and get NaN in the columns they lack.
df = (
    df_incident
    .merge(df_agencies, how="outer", on="agency_id")
    .merge(df_month, how="outer", on=["agency_id", "nibrs_month_id", "did"])
    # Only this join uses custom suffixes for overlapping column names;
    # the others fall back to the pandas default ("_x", "_y").
    .merge(df_offense, how="outer", on="incident_id", suffixes=("_left", "_right"))
    .merge(df_offense_type, how="outer", on="offense_code")
    .merge(df_act, how="outer", on="offense_id")
    .merge(df_act_type, how="outer", on="criminal_act_id")
    .merge(df_location, how="outer", on="location_id")
)

# Save the raw merged table (without the index) so it can be inspected by hand.
df.to_csv("2023_merged_data.csv", index=False)

##### Removing Duplicates (If Any)

In [32]:
# Compare rows on every column; the first occurrence of a duplicate is kept
# (the drop_duplicates default).
column_names = list(df.columns)
rows_before = len(df)
print("Number of Rows before removing duplicates: ", rows_before)

df.drop_duplicates(subset=column_names, inplace=True)

rows_after = len(df)
print('Number of Rows after removing Duplicates:', rows_after)

Number of Rows before removing duplicates:  134912
Number of Rows after removing Duplicates: 134912


##### Removing Columns with Nulls or No Purpose

In [33]:
# Count missing values per column and print only the columns that have any.
null_counts = df.isna().sum()
columns_with_nulls = null_counts[null_counts != 0]
for col, count in columns_with_nulls.items():
    print(col, count)

data_year_x 40
agency_id 39
incident_id 40
nibrs_month_id 40
cargo_theft_flag 40
submission_date 40
incident_date 40
report_date_flag 40
incident_hour 40
cleared_except_id 40
cleared_except_date 134827
incident_status 40
data_home_x 134912
orig_format_x 40
did 40
yearly_agency_id 39
data_year_y 39
ori 39
legacy_ori 39
covered_by_legacy_ori 134912
direct_contributor_flag 39
dormant_flag 39
dormant_year 134912
reporting_type 39
ucr_agency_name 39
ncic_agency_name 758
pub_agency_name 39
pub_agency_unit 134582
agency_status 39
state_id 39
state_name 39
state_abbr 39
state_postal_abbr 39
division_code 39
division_name 39
region_code 39
region_name 39
region_desc 39
agency_type_name 39
population 39
submitting_agency_id 39
sai 39
submitting_agency_name 39
suburban_area_flag 39
population_group_id 39
population_group_code 39
population_group_desc 39
parent_pop_group_code 39
parent_pop_group_desc 39
mip_flag 39
pop_sort_order 39
summary_rape_def 134912
pe_reported_flag 39
male_officer 39
male_

In [34]:
# These columns had null entries in the missing-value report.
# This drop is strict on purpose (no errors="ignore"): a KeyError here means the
# merged frame no longer has the columns this notebook expects.
NULL_COLUMNS = [
    "cleared_except_date", "data_home_x", "covered_by_legacy_ori", "dormant_year",
    "pub_agency_unit", "report_date", "update_flag", "data_home_y", "month_pub_status",
    "num_premises_entered", "summary_rape_def", "method_entry_code",
]
df = df.drop(columns=NULL_COLUMNS)

# Columns judged to serve no purpose after manually inspecting the merged CSV.
# Each name appears once (the original list repeated 'nibrs_month_id_x').
# ncic_agency_name is removed here as well; pub_agency_name is kept and renamed
# to agency_name below.
UNNEEDED_COLUMNS = [
    "nibrs_month_id_x", "cargo_theft_flag", "submission_date", "report_date_flag", "incident_id",
    "cleared_except_id", "incident_status", "orig_format_x", "did", "data_year_y", "legacy_ori",
    "direct_contributor_flag", "dormant_flag", "reporting_type", "ucr_agency_name",
    "agency_status", "state_id", "state_abbr", "state_postal_abbr", "division_code", "division_name",
    "region_code", "region_name", "region_desc", "agency_type_name", "submitting_agency_id", "sai", "ori",
    "submitting_agency_name", "suburban_area_flag", "population_group_id", "population_group_code",
    "yearly_agency_id", "population_group_desc", "parent_pop_group_code", "parent_pop_group_desc",
    "mip_flag", "pop_sort_order", "pe_reported_flag", "officer_rate", "employee_rate", "nibrs_cert_date",
    "nibrs_start_date", "nibrs_leoka_start_date", "nibrs_ct_start_date", "nibrs_multi_bias_start_date",
    "nibrs_off_eth_start_date", "covered_flag", "county_name", "msa_name", "publishable_flag",
    "participated", "nibrs_participated", "data_year_left", "nibrs_month_id_y", "agency_id_y",
    "month_num", "inc_data_year", "reported_status", "orig_format_y", "ddocname", "data_year_right",
    "offense_id", "offense_code", "attempt_complete_flag", "location_id", "ct_flag", "hc_flag", "hc_code",
    "crime_against", "offense_group", "data_year", "criminal_act_id", "criminal_act_code",
    "criminal_act_desc", "ncic_agency_name", "location_code",
]

# The original call used errors='ignore', which hides names that match nothing
# (for example suffixed variants of merge keys that were never created).
# Missing names are still tolerated, but they are reported instead of being
# skipped silently.
not_found = [name for name in UNNEEDED_COLUMNS if name not in df.columns]
if not_found:
    print("Columns listed for removal but not present in df:", not_found)
df = df.drop(columns=[name for name in UNNEEDED_COLUMNS if name in df.columns])

# Friendlier names for the columns that are kept. rename() skips keys that are
# not present in the frame.
# NOTE(review): 'agency_id_x' appears to match nothing, because the frame shown
# later in the notebook still has a column called agency_id. Confirm whether
# agency_id was meant to be renamed to agency_location.
df = df.rename(columns={
    "data_year_x": "year",
    "agency_id_x": "agency_location",
    "pub_agency_name": "agency_name",
    "state_name": "state",
    "location_name": "location_area",
})


##### Check the Data Type of Each Variable and Convert the Date Column

In [35]:
# Column dtypes before the conversion.
df.info()

# Parse incident_date as YYYY-MM-DD. Because errors='coerce' is set, values that
# do not parse become NaT instead of raising.
incident_dates = pd.to_datetime(df['incident_date'], errors='coerce', format='%Y-%m-%d')
df['incident_date'] = incident_dates

# Blank separator, then the dtypes after the conversion.
print("\n")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134912 entries, 0 to 134911
Data columns (total 18 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   year                            134872 non-null  float64
 1   agency_id                       134873 non-null  float64
 2   nibrs_month_id                  134872 non-null  float64
 3   incident_date                   134872 non-null  object 
 4   incident_hour                   134872 non-null  float64
 5   agency_name                     134873 non-null  object 
 6   state                           134873 non-null  object 
 7   population                      134873 non-null  float64
 8   male_officer                    134873 non-null  float64
 9   male_civilian                   134873 non-null  float64
 10  male_officer+male_civilian      134873 non-null  float64
 11  female_officer                  134873 non-null  float64
 12  female_civilian 

In [None]:
# Remove the last helper columns (each name listed once; the original list
# repeated 'nibrs_month_id'), then discard every row that still has a missing
# value in any remaining column.
df = df.drop(columns=['nibrs_month_id', 'state', 'criminal_act_name']).dropna()

# The integer casts must run after dropna(): astype(int) raises
# IntCastingNaNError ("Cannot convert non-finite values (NA or inf) to integer")
# when a column still contains NaN.
# NOTE(review): agency_id and the officer/civilian counts are left as float, as
# in the original cell. Confirm whether they should be cast to int as well.
df = df.astype({'population': int, 'year': int, 'incident_hour': int})



IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
# Write the cleaned frame without the index. This is the same path the first
# cell wrote the raw merged frame to, so that file is overwritten.
cleaned_csv_path = "2023_merged_data.csv"
df.to_csv(cleaned_csv_path, index=False)

In [None]:
# Final check: number of missing values left in each column.
remaining_nulls = df.isna().sum()
remaining_nulls


year                              0
agency_id                         0
incident_date                     0
incident_hour                     0
agency_name                       0
population                        0
male_officer                      0
male_civilian                     0
male_officer+male_civilian        0
female_officer                    0
female_civilian                   0
female_officer+female_civilian    0
offense_name                      0
offense_category_name             0
location_area                     0
dtype: int64

In [None]:
# Display the cleaned frame as the cell output (the notebook shows a truncated
# preview of the first and last rows).
df

Unnamed: 0,year,agency_id,incident_date,incident_hour,agency_name,population,male_officer,male_civilian,male_officer+male_civilian,female_officer,female_civilian,female_officer+female_civilian,offense_name,offense_category_name,location_area
0,2023,2239.0,2023-07-04,19,Connecticut State Police,464092,792.0,223.0,1015.0,122.0,262.0,384.0,Drug/Narcotic Violations,Drug/Narcotic Offenses,Abandoned/Condemned Structure
1,2023,2239.0,2023-07-04,19,Connecticut State Police,464092,792.0,223.0,1015.0,122.0,262.0,384.0,Weapon Law Violations,Weapon Law Violations,Abandoned/Condemned Structure
2,2023,2166.0,2023-08-17,17,New Britain,74609,140.0,1.0,141.0,18.0,7.0,25.0,Drug/Narcotic Violations,Drug/Narcotic Offenses,Abandoned/Condemned Structure
3,2023,2166.0,2023-08-17,17,New Britain,74609,140.0,1.0,141.0,18.0,7.0,25.0,Drug Equipment Violations,Drug/Narcotic Offenses,Abandoned/Condemned Structure
4,2023,2195.0,2023-09-11,9,Stamford,136512,240.0,11.0,251.0,31.0,13.0,44.0,Counterfeiting/Forgery,Counterfeiting/Forgery,Abandoned/Condemned Structure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134867,2023,2128.0,2023-12-23,22,Bristol,61609,106.0,11.0,117.0,11.0,14.0,25.0,Motor Vehicle Theft,Motor Vehicle Theft,Other/Unknown
134868,2023,2239.0,2023-09-27,2,Connecticut State Police,464092,792.0,223.0,1015.0,122.0,262.0,384.0,Murder and Nonnegligent Manslaughter,Homicide Offenses,Other/Unknown
134869,2023,2239.0,2023-10-21,13,Connecticut State Police,464092,792.0,223.0,1015.0,122.0,262.0,384.0,Identity Theft,Fraud Offenses,Other/Unknown
134870,2023,2215.0,2023-10-19,10,Windsor,29469,41.0,3.0,44.0,8.0,7.0,15.0,All Other Larceny,Larceny/Theft Offenses,Other/Unknown
