In [59]:
import pandas as pd
import numpy as np

# **Data Cleaning**

### Shows the first five rows of the data

In [60]:
# Read the raw crime export from the working directory, then preview its leading rows.
df = pd.read_csv(filepath_or_buffer="Crime_Data_from_2020_to_Present.csv")
df.head(n=5)

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA,AREA NAME,Rpt Dist No,Part 1-2,Crm Cd,Crm Cd Desc,...,Status,Status Desc,Crm Cd 1,Crm Cd 2,Crm Cd 3,Crm Cd 4,LOCATION,Cross Street,LAT,LON
0,190326475,03/01/2020 12:00:00 AM,03/01/2020 12:00:00 AM,2130,7,Wilshire,784,1,510,VEHICLE - STOLEN,...,AA,Adult Arrest,510.0,998.0,,,1900 S LONGWOOD AV,,34.0375,-118.3506
1,200106753,02/09/2020 12:00:00 AM,02/08/2020 12:00:00 AM,1800,1,Central,182,1,330,BURGLARY FROM VEHICLE,...,IC,Invest Cont,330.0,998.0,,,1000 S FLOWER ST,,34.0444,-118.2628
2,200320258,11/11/2020 12:00:00 AM,11/04/2020 12:00:00 AM,1700,3,Southwest,356,1,480,BIKE - STOLEN,...,IC,Invest Cont,480.0,,,,1400 W 37TH ST,,34.021,-118.3002
3,200907217,05/10/2023 12:00:00 AM,03/10/2020 12:00:00 AM,2037,9,Van Nuys,964,1,343,SHOPLIFTING-GRAND THEFT ($950.01 & OVER),...,IC,Invest Cont,343.0,,,,14000 RIVERSIDE DR,,34.1576,-118.4387
4,220614831,08/18/2022 12:00:00 AM,08/17/2020 12:00:00 AM,1200,6,Hollywood,666,2,354,THEFT OF IDENTITY,...,IC,Invest Cont,354.0,,,,1900 TRANSIENT,,34.0944,-118.3277


#### Shows the list of column names and the number of rows and columns (982638 rows & 28 columns)

##### Column names and their meaning (for future reference)

- **DR_NO**: Unique identifier for each reported crime.
- **Date Rptd**: Date the crime was reported to law enforcement.
- **DATE OCC**: Date the crime occurred.
- **TIME OCC**: Time the crime occurred.
- **AREA**: Geographic area or precinct where the crime took place.
- **AREA NAME**: Descriptive name of the area.
- **Rpt Dist No**: Reporting district number.
- **Part 1-2**: Indicates whether the crime is a Part 1 (serious) or Part 2 (less serious) offense.
- **Crm Cd**: Crime code or classification number.
- **Crm Cd Desc**: Description of the crime code.
- **Mocodes**: Modus operandi (MO) codes, i.e. activities associated with the suspect in the commission of the crime.
- **Vict Age**: Age of the victim.
- **Vict Sex**: Sex of the victim.
- **Vict Descent**: Racial or ethnic background of the victim.
- **Premis Cd**: Premises code (e.g., residential, commercial).
- **Premis Desc**: Description of the premises.
- **Weapon Used Cd**: Code for the weapon used (if any).
- **Weapon Desc**: Description of the weapon.
- **Status**: Short code for the current status of the case (e.g., IC, AA).
- **Status Desc**: Description of the case status.
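- **Crm Cd 1, 2, 3, 4**: Crime codes recorded for the incident. Crm Cd 1 is the primary code (it matches Crm Cd in the sample rows above); Crm Cd 2, 3 and 4 are additional codes, if applicable.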
- **LOCATION**: General location of the crime.
- **Cross Street**: Intersection or nearby street.
- **LAT, LON**: Latitude and longitude coordinates of the crime location.

In [61]:
# Column labels first, then the (rows, columns) tuple on its own line.
print(df.columns, df.shape, sep="\n")

Index(['DR_NO', 'Date Rptd', 'DATE OCC', 'TIME OCC', 'AREA', 'AREA NAME',
       'Rpt Dist No', 'Part 1-2', 'Crm Cd', 'Crm Cd Desc', 'Mocodes',
       'Vict Age', 'Vict Sex', 'Vict Descent', 'Premis Cd', 'Premis Desc',
       'Weapon Used Cd', 'Weapon Desc', 'Status', 'Status Desc', 'Crm Cd 1',
       'Crm Cd 2', 'Crm Cd 3', 'Crm Cd 4', 'LOCATION', 'Cross Street', 'LAT',
       'LON'],
      dtype='object')
(982638, 28)


##### Shows the number of null, unique, and total (non-null) values for each column

In [62]:
def summary(df):
    """Display a per-column overview of ``df``.

    The overview is indexed by column name and has four columns:

    - ``Data Type``: the column's dtype
    - ``Null values``: result of ``df.isna().sum()``
    - ``Unique values``: result of ``df.nunique()``
    - ``Total values``: result of ``df.count()`` (non-null cells)

    The table is shown with ``display``; nothing is returned.
    """
    overview = df.dtypes.to_frame(name='Data Type').assign(**{
        # Passed as a Series, so it is matched to the overview by column label.
        'Null values': df.isna().sum(),
        # Passed as plain arrays, so they are attached positionally.
        'Unique values': df.nunique().to_numpy(),
        'Total values': df.count().to_numpy(),
    })
    display(overview)
# Show the per-column overview (dtype, null / unique / total counts) for the loaded frame.
summary(df)

Unnamed: 0,Data Type,Null values,Unique values,Total values
DR_NO,int64,0,982638,982638
Date Rptd,object,0,1735,982638
DATE OCC,object,0,1735,982638
TIME OCC,int64,0,1439,982638
AREA,int64,0,21,982638
AREA NAME,object,0,21,982638
Rpt Dist No,int64,0,1209,982638
Part 1-2,int64,0,2,982638
Crm Cd,int64,0,140,982638
Crm Cd Desc,object,0,140,982638


##### The *Mocodes* column contains values that are strings of space-separated codes. To refine this data, we turn each value into a list of strings.

In [63]:
print("Before: ", df.loc[1, 'Mocodes']) # Sample row from the 'Mocodes' column. On the first run it's a string of space-separated codes.


def _split_mocodes(value):
    """Turn one 'Mocodes' cell into a list of code strings.

    - list: returned unchanged. The cell was already converted, which happens
      when this notebook cell is executed more than once on the same frame.
    - null: an empty list.
    - anything else: ``value.split()``, i.e. the whitespace-separated codes.
    """
    # Check for list first: pd.isnull() on a list returns an array, which
    # cannot be used as a plain truth value.
    if isinstance(value, list):
        return value
    if pd.isnull(value):
        return []
    return value.split()


# Goes through each row in the 'Mocodes' column and transforms a string of space-separated codes
# into a list of string codes. If the row is empty, create an empty list instead.
# Re-running this cell is safe: rows that already hold a list are left as they are.
df['Mocodes'] = df['Mocodes'].apply(_split_mocodes)

print("After: ", df.loc[1, 'Mocodes']) # After refining, it's now a list of strings

Before:  1822 1402 0344
After:  ['1822', '1402', '0344']
