In [1]:
import pandas as pd

# Load every NIBRS source table into its own DataFrame.
df_agencies = pd.read_csv("agencies.csv")
df_act_type = pd.read_csv("NIBRS_CRIMINAL_ACT_TYPE.csv")
df_act = pd.read_csv("NIBRS_CRIMINAL_ACT.csv")
df_incident = pd.read_csv("NIBRS_incident.csv")
df_location = pd.read_csv("NIBRS_LOCATION_TYPE.csv")
df_month = pd.read_csv("NIBRS_month.csv")
df_offense_type = pd.read_csv("NIBRS_OFFENSE_TYPE.csv")
df_offense = pd.read_csv("NIBRS_OFFENSE.csv")

# Build one wide table starting from the incident records. Every join is an
# inner join, so a row survives only if it has a match in each joined table.
# Overlapping column names get pandas' default _x/_y suffixes, except for the
# offense join, which uses _left/_right.
df = (
    df_incident
    .merge(df_agencies, how="inner", on="agency_id")
    .merge(df_month, how="inner", on="did")
    .merge(df_offense, how="inner", on="incident_id", suffixes=("_left", "_right"))
    .merge(df_offense_type, how="inner", on="offense_code")
    .merge(df_act, how="inner", on="offense_id")
    .merge(df_act_type, how="inner", on="criminal_act_id")
    .merge(df_location, how="inner", on="location_id")
)

# Export the raw merged table so its columns can be inspected by hand.
df.to_csv("2023_merged_data.csv", index=False)

##### Removing Duplicates (If Any)

In [2]:
# Compare rows across every column, so only fully identical rows count as
# duplicates; the first occurrence of each is kept.
column_names = list(df.columns)
print("Number of Rows before removing duplicates: ", df.shape[0])
df = df.drop_duplicates(subset=column_names, keep='first')
print('Number of Rows after removing Duplicates:', df.shape[0])

Number of Rows before removing duplicates:  17401
Number of Rows after removing Duplicates: 17401


##### Removing Columns with Nulls or No Purpose

In [3]:
# Count missing values per column, then list only the columns that have any.
test = df.isna().sum()
for col, count in test[test != 0].items():
    print(col, count)

cleared_except_date 17384
data_home_x 17401
covered_by_legacy_ori 17401
dormant_year 17401
ncic_agency_name 61
pub_agency_unit 17394
summary_rape_def 17401
report_date 16252
update_flag 17401
data_home_y 17401
month_pub_status 17401
num_premises_entered 17401
method_entry_code 17401


In [4]:
# Columns flagged by the null-count check as containing null entries.
null_columns = [
    'cleared_except_date', 'data_home_x', 'covered_by_legacy_ori', 'dormant_year',
    'pub_agency_unit', 'report_date', 'update_flag', 'data_home_y', 'month_pub_status',
    'num_premises_entered', 'summary_rape_def', 'method_entry_code',
]

# Columns removed after manually inspecting the exported CSV.
# Each label is listed exactly once.
unused_columns = [
    'nibrs_month_id_x', 'cargo_theft_flag', 'submission_date', 'report_date_flag', 'incident_id',
    'cleared_except_id', 'incident_status', 'orig_format_x', 'did', 'data_year_y', 'legacy_ori',
    'direct_contributor_flag', 'dormant_flag', 'reporting_type', 'ucr_agency_name',
    'agency_status', 'state_id', 'state_abbr', 'state_postal_abbr', 'division_code', 'division_name',
    'region_code', 'region_name', 'region_desc', 'agency_type_name', 'submitting_agency_id', 'sai', 'ori',
    'submitting_agency_name', 'suburban_area_flag', 'population_group_id', 'population_group_code',
    'yearly_agency_id', 'population_group_desc', 'parent_pop_group_code', 'parent_pop_group_desc',
    'mip_flag', 'pop_sort_order', 'pe_reported_flag', 'officer_rate', 'employee_rate',
    'nibrs_cert_date', 'nibrs_start_date', 'nibrs_leoka_start_date', 'nibrs_ct_start_date',
    'nibrs_multi_bias_start_date', 'nibrs_off_eth_start_date', 'covered_flag', 'county_name',
    'msa_name', 'publishable_flag', 'participated', 'nibrs_participated', 'data_year_left',
    'nibrs_month_id_y', 'agency_id_y', 'month_num', 'inc_data_year', 'reported_status',
    'orig_format_y', 'ddocname', 'data_year_right', 'offense_id', 'offense_code',
    'attempt_complete_flag', 'location_id', 'ct_flag', 'hc_flag', 'hc_code', 'crime_against',
    'offense_group', 'data_year', 'criminal_act_id', 'criminal_act_code', 'criminal_act_desc',
    'ncic_agency_name', 'location_code',
]

# Map the surviving merge-suffixed / source column names to shorter analysis names.
renamed_columns = {
    'data_year_x': 'year',
    'agency_id_x': 'agency_location',
    'pub_agency_name': 'agency_name',
    'state_name': 'state',
    'location_name': 'location_area',
}

# One non-mutating chain instead of several inplace calls. A missing label
# still raises KeyError, so a typo in either list is not silently ignored.
df = (
    df
    .drop(columns=null_columns)
    .drop(columns=unused_columns)
    .rename(columns=renamed_columns)
)

#ncic_agency_name


##### Check the Data Type of Each Column and Convert the Incident Date to Datetime

In [5]:
# Show the dtypes before conversion.
df.info()

# Parse incident_date as YYYY-MM-DD. errors='coerce' replaces any value that
# does not match the format with NaT instead of raising, so compare the null
# count before and after to surface values that were silently lost.
nulls_before = df['incident_date'].isna().sum()
df['incident_date'] = pd.to_datetime(df['incident_date'], format='%Y-%m-%d', errors='coerce')
coerced_count = df['incident_date'].isna().sum() - nulls_before
if coerced_count > 0:
    print(f"Warning: {coerced_count} incident_date value(s) did not match %Y-%m-%d and were set to NaT")

# Blank separator, then the dtypes after conversion for comparison.
print("\n")
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17401 entries, 0 to 17400
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   year                            17401 non-null  int64 
 1   agency_location                 17401 non-null  int64 
 2   incident_date                   17401 non-null  object
 3   incident_hour                   17401 non-null  int64 
 4   agency_name                     17401 non-null  object
 5   state                           17401 non-null  object
 6   population                      17401 non-null  int64 
 7   male_officer                    17401 non-null  int64 
 8   male_civilian                   17401 non-null  int64 
 9   male_officer+male_civilian      17401 non-null  int64 
 10  female_officer                  17401 non-null  int64 
 11  female_civilian                 17401 non-null  int64 
 12  female_officer+female_civilian  17401 non-null

In [6]:
# Write the current state of df to disk without the index column. This is the
# same path the raw merged table was exported to, so that file is overwritten.
df.to_csv(path_or_buf="2023_merged_data.csv", index=False)

In [7]:
# Bare last expression: the notebook renders df as a table for a final visual check.
df

Unnamed: 0,year,agency_location,incident_date,incident_hour,agency_name,state,population,male_officer,male_civilian,male_officer+male_civilian,female_officer,female_civilian,female_officer+female_civilian,offense_name,offense_category_name,criminal_act_name,location_area
0,2023,22570,2023-04-20,22,Department of Energy and Environmental Protection,Connecticut,0,46,5,51,12,5,17,Weapon Law Violations,Weapon Law Violations,Possessing/Concealing,Highway/Road/Alley/Street/Sidewalk
1,2023,22570,2023-06-16,15,Department of Energy and Environmental Protection,Connecticut,0,46,5,51,12,5,17,Weapon Law Violations,Weapon Law Violations,Possessing/Concealing,Field/Woods
2,2023,22570,2023-08-20,14,Department of Energy and Environmental Protection,Connecticut,0,46,5,51,12,5,17,Weapon Law Violations,Weapon Law Violations,Possessing/Concealing,Park/Playground
3,2023,22570,2023-08-31,18,Department of Energy and Environmental Protection,Connecticut,0,46,5,51,12,5,17,Stolen Property Offenses,Stolen Property Offenses,Using/Consuming,Department/Discount Store
4,2023,22570,2023-11-23,11,Department of Energy and Environmental Protection,Connecticut,0,46,5,51,12,5,17,Weapon Law Violations,Weapon Law Violations,Possessing/Concealing,Field/Woods
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17396,2023,2241,2023-07-18,0,Mohegan Tribal,Connecticut,0,30,5,35,2,5,7,Counterfeiting/Forgery,Counterfeiting/Forgery,Possessing/Concealing,Gambling Facility/Casino/Race Track
17397,2023,2241,2023-08-04,15,Mohegan Tribal,Connecticut,0,30,5,35,2,5,7,Counterfeiting/Forgery,Counterfeiting/Forgery,Possessing/Concealing,Gambling Facility/Casino/Race Track
17398,2023,2241,2023-08-09,22,Mohegan Tribal,Connecticut,0,30,5,35,2,5,7,Weapon Law Violations,Weapon Law Violations,Possessing/Concealing,Gambling Facility/Casino/Race Track
17399,2023,2241,2023-11-03,20,Mohegan Tribal,Connecticut,0,30,5,35,2,5,7,Drug/Narcotic Violations,Drug/Narcotic Offenses,Possessing/Concealing,Hotel/Motel/Etc.
