In [17]:
import pandas as pd

# Load every source CSV into its own DataFrame.
df_incident = pd.read_csv("NIBRS_incident.csv")
df_agencies = pd.read_csv("agencies.csv")
df_month = pd.read_csv("NIBRS_month.csv")
df_offense = pd.read_csv("NIBRS_OFFENSE.csv")
df_offense_type = pd.read_csv("NIBRS_OFFENSE_TYPE.csv")
df_act = pd.read_csv("NIBRS_CRIMINAL_ACT.csv")
df_act_type = pd.read_csv("NIBRS_CRIMINAL_ACT_TYPE.csv")
df_location = pd.read_csv("NIBRS_LOCATION_TYPE.csv")

# Start from the incident table and inner-join the other tables one at a time.
# Inner joins keep only rows whose key is present on both sides of each step.
df = (
    df_incident
    .merge(df_agencies, how="inner", on="agency_id")
    .merge(df_month, how="inner", on="did")
    # Overlapping column names from this join get _left/_right instead of
    # the default _x/_y suffixes.
    .merge(df_offense, how="inner", on="incident_id", suffixes=("_left", "_right"))
    .merge(df_offense_type, how="inner", on="offense_code")
    .merge(df_act, how="inner", on="offense_id")
    .merge(df_act_type, how="inner", on="criminal_act_id")
    .merge(df_location, how="inner", on="location_id")
)

# Save the raw merged table (row index omitted) before any cleaning is applied.
df.to_csv("2022_merged_data.csv", index=False)

##### Removing Duplicates (If Any)

In [18]:
# Compare rows on every column, so only fully identical rows count as duplicates.
column_names = list(df.columns)

rows_before = len(df)
print("Number of Rows before removing duplicates: ", rows_before)

# Keep the first occurrence of each duplicated row; df is modified in place.
df.drop_duplicates(subset=column_names, keep='first', inplace=True)

rows_after = len(df)
print('Number of Rows after removing Duplicates:', rows_after)

Number of Rows before removing duplicates:  17997
Number of Rows after removing Duplicates: 17997


##### Removing Columns with Nulls or No Purpose

In [19]:
# Number of missing values in each column (a Series indexed by column name).
test = df.isnull().sum()

# Print only the columns that contain at least one missing value.
for col, count in test.items():
    if count == 0:
        continue
    print(col, count)

cleared_except_date 17979
data_home_x 17997
covered_by_legacy_ori 17997
dormant_year 17997
ncic_agency_name 33
pub_agency_unit 17994
summary_rape_def 17997
male_officer 10
male_civilian 10
male_officer+male_civilian 10
female_officer 10
female_civilian 10
female_officer+female_civilian 10
report_date 16782
update_flag 17997
data_home_y 17997
month_pub_status 17997
num_premises_entered 17997
method_entry_code 17997


In [20]:
# Columns that the null-count report flagged as mostly or entirely missing.
null_heavy_columns = [
    'cleared_except_date', 'data_home_x', 'covered_by_legacy_ori', 'dormant_year',
    'pub_agency_unit', 'report_date', 'update_flag', 'data_home_y', 'month_pub_status',
    'num_premises_entered', 'summary_rape_def', 'method_entry_code',
]

# Columns judged to serve no purpose after manually inspecting the merged CSV.
# 'ncic_agency_name' (33 missing values in the null-count report) is removed here
# as well. Each label appears exactly once.
unused_columns = [
    'nibrs_month_id_x', 'cargo_theft_flag', 'submission_date', 'report_date_flag', 'incident_id',
    'cleared_except_id', 'incident_status', 'orig_format_x', 'did', 'data_year_y', 'legacy_ori',
    'direct_contributor_flag', 'dormant_flag', 'reporting_type', 'ucr_agency_name',
    'agency_status', 'state_id', 'state_abbr', 'state_postal_abbr', 'division_code', 'division_name',
    'region_code', 'region_name', 'region_desc', 'agency_type_name', 'submitting_agency_id', 'sai', 'ori',
    'submitting_agency_name', 'suburban_area_flag', 'population_group_id', 'population_group_code',
    'yearly_agency_id', 'population_group_desc', 'parent_pop_group_code', 'parent_pop_group_desc',
    'mip_flag', 'pop_sort_order', 'pe_reported_flag', 'officer_rate', 'employee_rate',
    'nibrs_cert_date', 'nibrs_start_date', 'nibrs_leoka_start_date', 'nibrs_ct_start_date',
    'nibrs_multi_bias_start_date', 'nibrs_off_eth_start_date', 'covered_flag', 'county_name',
    'msa_name', 'publishable_flag', 'participated', 'nibrs_participated', 'data_year_left',
    'nibrs_month_id_y', 'agency_id_y', 'month_num', 'inc_data_year', 'reported_status',
    'orig_format_y', 'ddocname', 'data_year_right', 'offense_id', 'offense_code',
    'attempt_complete_flag', 'location_id', 'ct_flag', 'hc_flag', 'hc_code', 'crime_against',
    'offense_group', 'data_year', 'criminal_act_id', 'criminal_act_code', 'criminal_act_desc',
    'ncic_agency_name', 'location_code',
]

# Shorter names for the columns that are kept.
column_renames = {
    'data_year_x': 'year',
    'agency_id_x': 'agency_location',
    'pub_agency_name': 'agency_name',
    'state_name': 'state',
    'location_name': 'location_area',
}

# Drop and rename in a single expression: df is rebound only after every step has
# succeeded, so a missing label raises KeyError without leaving df half-modified.
df = df.drop(columns=null_heavy_columns + unused_columns).rename(columns=column_renames)


In [21]:
# Recount missing values per column to see which columns still contain any.
test = df.isnull().sum()

# Skip fully populated columns; print the name and missing count of the rest.
for col, count in test.items():
    if count == 0:
        continue
    print(col, count)

male_officer 10
male_civilian 10
male_officer+male_civilian 10
female_officer 10
female_civilian 10
female_officer+female_civilian 10


In [22]:
# Officer/civilian count columns; the preceding null report showed 10 missing values in each.
staff_count_columns = [
    'male_officer',
    'male_civilian',
    'male_officer+male_civilian',
    'female_officer',
    'female_civilian',
    'female_officer+female_civilian',
]

# Remove, in place, every row that is missing a value in any of these columns.
df.dropna(subset=staff_count_columns, how="any", inplace=True)


##### Check the Data Type of Each Column and Convert `incident_date` to datetime

In [23]:
# Column dtypes before the conversion.
df.info()

# Parse incident_date as YYYY-MM-DD. errors='coerce' replaces any value that does
# not match the format with NaT instead of raising. Because the null checks ran
# before this step, count the NaT values the parse introduces and report them
# rather than letting them slip into the saved data unnoticed.
missing_before_parse = int(df['incident_date'].isna().sum())
df['incident_date'] = pd.to_datetime(df['incident_date'], format='%Y-%m-%d', errors='coerce')
coerced_to_nat = int(df['incident_date'].isna().sum()) - missing_before_parse
if coerced_to_nat:
    print("Warning: incident_date values that could not be parsed and were set to NaT:", coerced_to_nat)

print("\n")
# Column dtypes after the conversion.
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 17987 entries, 0 to 17996
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   year                            17987 non-null  int64  
 1   agency_location                 17987 non-null  int64  
 2   incident_date                   17987 non-null  object 
 3   incident_hour                   17987 non-null  int64  
 4   agency_name                     17987 non-null  object 
 5   state                           17987 non-null  object 
 6   population                      17987 non-null  int64  
 7   male_officer                    17987 non-null  float64
 8   male_civilian                   17987 non-null  float64
 9   male_officer+male_civilian      17987 non-null  float64
 10  female_officer                  17987 non-null  float64
 11  female_civilian                 17987 non-null  float64
 12  female_officer+female_civilian  17987

In [24]:
# Write the cleaned frame to disk without the row index. This is the same path the
# raw merged data was written to right after the merge, so that file is overwritten.
merged_csv_path = "2022_merged_data.csv"
df.to_csv(merged_csv_path, index=False)

In [27]:
# Show the cleaned DataFrame; the notebook renders a truncated preview (first and last rows).
df

Unnamed: 0,year,agency_location,incident_date,incident_hour,agency_name,state,population,male_officer,male_civilian,male_officer+male_civilian,female_officer,female_civilian,female_officer+female_civilian,offense_name,offense_category_name,criminal_act_name,location_area
0,2022,2120,2022-01-02,18,Ansonia,Connecticut,18750,29.0,1.0,30.0,5.0,9.0,14.0,Weapon Law Violations,Weapon Law Violations,Using/Consuming,Residence/Home
1,2022,2120,2022-01-12,11,Ansonia,Connecticut,18750,29.0,1.0,30.0,5.0,9.0,14.0,Counterfeiting/Forgery,Counterfeiting/Forgery,Possessing/Concealing,Service/Gas Station
2,2022,2120,2022-01-12,12,Ansonia,Connecticut,18750,29.0,1.0,30.0,5.0,9.0,14.0,Drug/Narcotic Violations,Drug/Narcotic Offenses,Possessing/Concealing,Highway/Road/Alley/Street/Sidewalk
3,2022,2120,2022-01-16,15,Ansonia,Connecticut,18750,29.0,1.0,30.0,5.0,9.0,14.0,Weapon Law Violations,Weapon Law Violations,Possessing/Concealing,Government/Public Building
4,2022,2120,2022-01-27,15,Ansonia,Connecticut,18750,29.0,1.0,30.0,5.0,9.0,14.0,Counterfeiting/Forgery,Counterfeiting/Forgery,Cultivating/Manufacturing/Publishing,Bank/Savings and Loan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17992,2022,2241,2022-07-28,3,Mohegan Tribal,Connecticut,0,27.0,4.0,31.0,2.0,4.0,6.0,Counterfeiting/Forgery,Counterfeiting/Forgery,Possessing/Concealing,Gambling Facility/Casino/Race Track
17993,2022,2241,2022-11-17,4,Mohegan Tribal,Connecticut,0,27.0,4.0,31.0,2.0,4.0,6.0,Drug/Narcotic Violations,Drug/Narcotic Offenses,Possessing/Concealing,Gambling Facility/Casino/Race Track
17994,2022,2241,2022-11-21,13,Mohegan Tribal,Connecticut,0,27.0,4.0,31.0,2.0,4.0,6.0,Counterfeiting/Forgery,Counterfeiting/Forgery,Possessing/Concealing,Other/Unknown
17995,2022,2241,2022-12-19,18,Mohegan Tribal,Connecticut,0,27.0,4.0,31.0,2.0,4.0,6.0,Drug/Narcotic Violations,Drug/Narcotic Offenses,Possessing/Concealing,Gambling Facility/Casino/Race Track
