# -------------------------------------------------------------------------
# Project: Los Angeles Crime Analysis
# Author: Malak Khouja
# Description: Exploratory analysis of Los Angeles crime dataset
# -------------------------------------------------------------------------

# 1. Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Load dataset

In [None]:
# Load the crime records; TIME OCC is read as text instead of letting pandas
# infer a numeric type for it.
crimes = pd.read_csv(filepath_or_buffer="crimes.csv", dtype={"TIME OCC": str})

Unnamed: 0,DR_NO,Date Rptd,DATE OCC,TIME OCC,AREA NAME,Crm Cd Desc,Vict Age,Vict Sex,Vict Descent,Weapon Desc,Status Desc,LOCATION
0,220314085,2022-07-22,2020-05-12,1110,Southwest,THEFT OF IDENTITY,27,F,B,,Invest Cont,2500 S SYCAMORE AV
1,222013040,2022-08-06,2020-06-04,1620,Olympic,THEFT OF IDENTITY,60,M,H,,Invest Cont,3300 SAN MARINO ST
2,220614831,2022-08-18,2020-08-17,1200,Hollywood,THEFT OF IDENTITY,28,M,H,,Invest Cont,1900 TRANSIENT
3,231207725,2023-02-27,2020-01-27,635,77th Street,THEFT OF IDENTITY,37,M,H,,Invest Cont,6200 4TH AV
4,220213256,2022-07-14,2020-07-14,900,Rampart,THEFT OF IDENTITY,79,M,B,,Invest Cont,1200 W 7TH ST


# 3. Overview of data

In [None]:
# Preview the first five rows of the dataset.
crimes.head(n=5)

Dataset structure

In [250]:
# Print each column's name, non-null count and dtype, plus memory usage.
crimes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185715 entries, 0 to 185714
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   DR_NO         185715 non-null  int64 
 1   Date Rptd     185715 non-null  object
 2   DATE OCC      185715 non-null  object
 3   TIME OCC      185715 non-null  object
 4   AREA NAME     185715 non-null  object
 5   Crm Cd Desc   185715 non-null  object
 6   Vict Age      185715 non-null  int64 
 7   Vict Sex      185704 non-null  object
 8   Vict Descent  185705 non-null  object
 9   Weapon Desc   73502 non-null   object
 10  Status Desc   185715 non-null  object
 11  LOCATION      185715 non-null  object
dtypes: int64(2), object(10)
memory usage: 17.0+ MB


Summary statistics

In [None]:
# Descriptive statistics with describe()'s default settings, run on the
# freshly loaded data (before any type conversion below).
crimes.describe()

Check missing values

In [None]:
# Number of missing entries in each column (isnull is an alias of isna).
crimes.isnull().sum()

# 4. Data cleaning

Convert column types

In [None]:
# Parse both date columns to datetime in one pass, then store the
# occurrence time as an integer.
crimes[["Date Rptd", "DATE OCC"]] = crimes[["Date Rptd", "DATE OCC"]].apply(pd.to_datetime)
crimes["TIME OCC"] = crimes["TIME OCC"].astype(int)

Unique values in Weapon Desc

In [None]:
# List every distinct weapon description (missing values included).
print(pd.unique(crimes["Weapon Desc"]))

Fix inconsistent values in Vict Sex

In [None]:
# Frequency of each victim-sex code before cleaning; missing values are
# excluded from the counts.
crimes["Vict Sex"].value_counts(dropna=True)

In [None]:
# Recode the stray "H" code as "M". Series.replace matches whole cell values,
# so only cells equal to "H" are changed (str.replace would rewrite any "H"
# substring), and missing values are left untouched.
crimes["Vict Sex"] = crimes["Vict Sex"].replace("H", "M")

In [None]:
# Re-check the victim-sex frequencies after the recoding step; missing
# values are excluded from the counts.
crimes["Vict Sex"].value_counts(dropna=True)

# 5. Feature engineering

Create hour column

In [None]:
# Left-pad the time to four characters (e.g. 635 -> "0635") so that the
# first two characters are always the hour, then keep that hour as an int.
crimes["TIME OCC"] = crimes["TIME OCC"].astype(str).str.zfill(4)
crimes["HEURE"] = crimes["TIME OCC"].str.slice(stop=2).astype(int)

Create age groups for victims

In [None]:
# Bracket edges are right-closed, e.g. (17, 25] -> "18-25"; include_lowest
# makes the first bracket also contain age 0. Ages outside the edges
# (below 0) are left as NaN by pd.cut.
labels = ["0-17", "18-25", "26-34", "35-44", "45-54", "55-64", "65+"]
bins = [0, 17, 25, 34, 44, 54, 64, np.inf]
crimes["Age Group"] = pd.cut(
    x=crimes["Vict Age"],
    bins=bins,
    labels=labels,
    include_lowest=True,
    right=True,
)

# 6. Statistical analysis

Outlier detection

In [None]:
# IQR rule on victim age: keep only rows whose age lies strictly between
# Q1 - 1.5 * IQR and Q3 + 1.5 * IQR.
twenty_fifth, seventy_fifth = crimes["Vict Age"].quantile([0.25, 0.75])
iqr = seventy_fifth - twenty_fifth
lower = twenty_fifth - (1.5 * iqr)
upper = seventy_fifth + (1.5 * iqr)
no_outliers = crimes.loc[crimes["Vict Age"].gt(lower) & crimes["Vict Age"].lt(upper)]

# 7. Exploratory Data Analysis (EDA)

Number of crimes by area

In [None]:
# Bar chart with one bar per area; area names are rotated so they stay
# readable along the x axis.
g = sns.countplot(data=crimes, x="AREA NAME")
g.set_xlabel("Area Name")
g.set_ylabel("Count of Crimes")
g.set_title("Count of Crimes in each Area")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Same summary as in the overview section, re-run after the type
# conversions and after the HEURE column has been added.
crimes.describe()

Crime distribution by hour

In [None]:
# Horizontal bars: one bar per hour of day (HEURE), length = number of crimes.
sns.countplot(data=crimes, y="HEURE")

In [None]:
# Hour of day with the most recorded crimes, derived from the data instead of
# being read off the chart and hard-coded. Cast to a plain int so the value
# keeps the same type as the previous literal.
peak_crime_hour = int(crimes["HEURE"].value_counts().idxmax())

Night crimes analysis (10pm - 3:59am)

In [None]:
# The night window wraps past midnight: hour 22 or later, or earlier than hour 4.
crimes_nuit = crimes.loc[crimes["HEURE"].ge(22) | crimes["HEURE"].lt(4)]
crimes_nuit.head(n=5)

In [None]:
# Bind the new axes to `g` so the axis labels below are applied to this
# night-crime chart rather than to the axes left over from an earlier plot.
g = sns.countplot(data=crimes_nuit, x="AREA NAME", palette="Purples")
plt.xticks(rotation=90)
g.set(xlabel="Area Name", ylabel="Count of Night Crimes")
plt.title("Count of Night Crimes in each Area")
plt.show()

In [None]:
# Area with the most night crimes
peak_night_crime_location = "Central"
print(f"The area with the largest volume of night crime is {peak_night_crime_location}")

Victim age group distribution

In [None]:
# Number of victims in each age bracket, listed in bracket order
# (youngest to oldest) rather than by frequency.
victim_ages = crimes["Age Group"].value_counts(sort=False).sort_index()
victim_ages