<h1> MIDTERM HANDS ON EXAM - Chicago Crimes Data Analytics</h1>
<hr>
<h3>Analyst: John Oliver A. Liwanag</h3>

In [None]:
# Standard library
import warnings

# Third-party analysis and plotting stack
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Silence every warning for the rest of the notebook.
# NOTE(review): this blanket filter also hides pandas' SettingWithCopyWarning
# and FutureWarning messages, so real problems can go unnoticed.
warnings.filterwarnings('ignore')

In [None]:
# Load the raw Chicago crimes export into a DataFrame.
# A forward-slash path is used because Python accepts it on Windows, macOS and
# Linux alike, whereas a back-slash separator only resolves on Windows.
chi_crimes = pd.read_csv('datasets/Chicago_Crimes.csv')
chi_crimes

In [None]:
# List the column labels of the loaded dataset.
chi_crimes.columns

<h1>Fill Null Values</h1>

In [None]:
# Number of missing values in each column before any filling is done.
missing_mask = chi_crimes.isna()
missing_mask.sum()

In [None]:
# Normalise the raw 'Date' text and parse it into datetimes.
# Anything that cannot be parsed becomes NaT because of errors='coerce'.
raw_dates = chi_crimes['Date'].astype(str)

# Series.replace('/', '-') only swaps cells whose ENTIRE value equals '/', so
# it never touched the separators inside a date string; .str.replace works on
# substrings, which is what the normalisation needs.
raw_dates = raw_dates.str.replace('/', '-', regex=False).str.strip()

# NOTE(review): dayfirst=True assumes the CSV stores day before month —
# confirm against the raw file, otherwise ambiguous dates are misread.
chi_crimes['Date'] = pd.to_datetime(raw_dates, dayfirst=True, errors='coerce')

In [None]:
# Label missing values in these columns with an explicit placeholder.
# NOTE(review): if 'X Coordinate' / 'Y Coordinate' are numeric in the CSV, the
# string placeholder turns them into object columns — confirm this is intended.
placeholder_columns = [
    'Location Description',
    'Community Area',
    'X Coordinate',
    'Y Coordinate',
    'Location',
]
for placeholder_col in placeholder_columns:
    chi_crimes[placeholder_col] = chi_crimes[placeholder_col].fillna('unaccounted')

# Fill each coordinate with ITS OWN most frequent value. The original filled
# 'Longitude' with the mode of 'Latitude', placing those rows at an invalid
# position.
for coord_col in ['Latitude', 'Longitude']:
    chi_crimes[coord_col] = chi_crimes[coord_col].fillna(chi_crimes[coord_col].mode()[0])

# Re-check the remaining null counts after filling.
chi_crimes.isnull().sum()

<h1>Extracting Date Information</h1>

In [None]:
# Make sure 'Date' is a datetime column before the .dt accessors are used;
# unparseable values end up as NaT.
parsed_dates = pd.to_datetime(chi_crimes['Date'], dayfirst=True, errors='coerce')
chi_crimes['Date'] = parsed_dates

In [None]:
# Derive calendar features from the parsed 'Date' column.
# Maps each new column to the .dt attribute it is read from; 'Time' holds the
# hour of the day and 'DayOfWeek' is pandas' dayofweek number.
date_parts = {
    'Month': 'month',
    'Day': 'day',
    'DayOfWeek': 'dayofweek',
    'Time': 'hour',
}
date_accessor = chi_crimes['Date'].dt
for new_column, part_name in date_parts.items():
    chi_crimes[new_column] = getattr(date_accessor, part_name)

chi_crimes

<h1>Change Data Types</h1>

In [None]:
# Columns holding repeated labels are stored as pandas categoricals.
categorical_columns = [
    'Block',
    'IUCR',
    'Primary Type',
    'Description',
    'Location Description',
    'District',
    'Community Area',
    'FBI Code',
]
for column_name in categorical_columns:
    chi_crimes[column_name] = chi_crimes[column_name].astype('category')

# Show the resulting dtype of every column.
chi_crimes.dtypes

<hr>
<h1>Questions:</h1>
<h2>1. How many crimes are recorded for each year?</h2>

In [None]:
# Number of recorded crimes per year.
# value_counts() orders by frequency; sort_index() puts the years in
# chronological order regardless of which year has more records.
per_year = chi_crimes['Year'].value_counts().sort_index()
per_year

In [None]:
# Bar chart of the yearly totals; rot=0 keeps the year labels horizontal.
year_ax = per_year.plot.bar(rot=0)
year_ax.set_title('Recorded Crimes in Chicago (2024-2025)')
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Number of Crimes')
plt.show()

<h1>INSIGHTS:</h1>
<ol style='font-size: 19px'>
    <li>There are 188,918 crimes recorded in 2024 and 60,205 in 2025, for a total of 249,123 records across the whole dataset.</li>
    <li>There is a big gap between the first and second year: the number of records drops by 68.13% from 2024 to 2025.</li>
</ol>
<hr>
<h2>2. Which year has the higher arrest rate?</h2>

In [None]:
# Count records for every (Year, Arrest) combination.
year_arrest_groups = chi_crimes.groupby(['Year', 'Arrest'])
arrest_rate = year_arrest_groups.size()
arrest_rate

In [None]:
# Split the records by year.
arrest_2024 = chi_crimes.loc[chi_crimes['Year'] == 2024]
arrest_2025 = chi_crimes.loc[chi_crimes['Year'] == 2025]

# Pie-chart options shared by both years; only the start angle differs.
arrest_pie_style = dict(kind='pie', autopct='%1.1f%%', colors=['cornflowerblue', 'orange'])

# Share of arrests vs. non-arrests in 2024
ar_2024 = arrest_2024['Arrest'].value_counts().plot(startangle=110, **arrest_pie_style)
ar_2024.set_title('Arrests in 2024')
ar_2024.set_ylabel('')
plt.show()

# Share of arrests vs. non-arrests in 2025
ar_2025 = arrest_2025['Arrest'].value_counts().plot(startangle=120, **arrest_pie_style)
ar_2025.set_title('Arrests in 2025')
ar_2025.set_ylabel('')
plt.show()

<h1>INSIGHTS:</h1>
<ol start='3' style='font-size: 19px'>
    <li>2025 has the higher arrest rate, 4.1% more than the previous year.</li>
    <li>In both years, most crimes do not result in an arrest: the share without an arrest is still over 80% in each of them.</li>
</ol>
<hr>
<h2>3. What are the most common types of crime?</h2>

In [None]:
# Six most frequent primary crime types (value_counts is already sorted
# from most to least frequent).
primary_type_counts = chi_crimes['Primary Type'].value_counts()
crimes = primary_type_counts.iloc[:6]
crimes

In [None]:
# Bar chart of the most common crime types with horizontal tick labels.
crimes_ax = crimes.plot.bar(figsize=(13, 6), rot=0)
crimes_ax.set_title('Most Common Types of Crime')
crimes_ax.set_xlabel('Type of Crime')
crimes_ax.set_ylabel('Number of Records')
plt.show()

<h1>INSIGHTS:</h1>
<ol start='5' style='font-size: 19px'>
    <li>The most common types of crime in Chicago are Theft, Battery, Criminal Damage, Assault, Motor Vehicle Theft and Other Offense.</li>
    <li>Theft is the most common of all the types, with 59,201 records across the whole dataset.</li>
    <li>Theft and Motor Vehicle Theft together cover nearly 31.6% of all reported crimes, making theft the most common kind of crime in the dataset.</li>
</ol>
<hr>
<h2>4. What are the common locations where crimes occur most?</h2>

In [None]:
# Five most frequent location descriptions.
location_counts = chi_crimes['Location Description'].value_counts()
common_loc = location_counts.iloc[:5]
common_loc

In [None]:
# Bar chart of the five most common crime locations.
common_loc.plot(kind='bar', figsize=(13, 6))
plt.title('Most Common Locations for Crimes')
# The bars are location descriptions, not crime types, so the x-axis label
# was corrected from the copy-pasted 'Type of Crime'.
plt.xlabel('Location')
plt.ylabel('Number of Records')
plt.xticks(rotation=0)
plt.show()

<h1>INSIGHTS:</h1>
<ol start='8' style='font-size: 19px'>
    <li>The most common locations for crimes in Chicago are the Street, Apartment, Residence, Sidewalk and Small Retail Store.</li>
    <li>Crimes most often happen on the street, with 66,040 records over the past two years.</li>
    <li>Public spaces like streets and sidewalks are common because they are open and accessible, making them easier targets for crimes like theft or robbery.</li>
</ol>
<hr>
<h2>5. In what hour of the day do most crimes usually happen?</h2>

In [None]:
# Number of records for each value of 'Time' (the hour taken from 'Date').
records_by_hour = chi_crimes.groupby('Time')
hour = records_by_hour.size()
hour

In [None]:
# Line chart of the number of crimes recorded in each hour of the day.
hour.plot(marker='o', figsize=(10, 5))
plt.title('Number of Crimes Recorded Each Hour')
# 'Time' holds the hour component of 'Date'; label the axes explicitly so the
# figure can be read on its own, and tick every hour.
plt.xlabel('Hour of Day (0-23)')
plt.ylabel('Number of Records')
plt.xticks(range(24))
plt.grid(alpha=0.3)
plt.show()

<h1>INSIGHTS:</h1>
<ol start='11' style='font-size: 19px'>
    <li>Most crimes happen in the afternoon and at night: apart from the spike at midnight, the hourly count does not go above 12,000 before 12:00 PM (noon).</li>
    <li>12:00 AM (midnight) has the highest number of recorded crimes, with 16,752 records in that hour.</li>
    <li>5:00 AM is the hour with the fewest records, with only 4,551 recorded in that hour over the past two years.</li>
</ol>
<hr>
<h2>6. Do robberies become more common in certain months in the year?</h2>

In [None]:
# Keep only the robbery records.
is_robbery = chi_crimes['Primary Type'] == 'ROBBERY'
rob = chi_crimes[is_robbery]

# Count robberies per (Year, Month), then move Year into the columns so that
# each year becomes its own series indexed by month.
monthly_robberies = rob.groupby(['Year', 'Month']).size()
rob_df = monthly_robberies.unstack(level=0)
rob_df

In [None]:
# One line per year, with month numbers 1-12 relabelled as month names.
month_labels = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

rob_ax = rob_df.plot(marker='o', figsize=(10, 5), title='Robberies Reported Each Month')
rob_ax.set_xticks(range(1, 13))
rob_ax.set_xticklabels(month_labels)
rob_ax.grid(alpha=0.3)
plt.show()

<h1>INSIGHTS:</h1>
<ol start='14' style='font-size: 19px'>
    <li>In 2024, robberies increase by 26.5% in the month of April and continue to rise over the following months.</li>
    <li>In 2025, robberies drop by about 81%, going from 357 in March down to only 68 in April.</li>
    <li>In both years, there is a big change after the month of March.</li>
</ol>
<hr>
<h2>7. What types of crime have the fewest records, and do most of them result in an arrest?</h2>

In [None]:
# Table of record counts: one row per primary crime type, one column per
# value of 'Arrest'.
type_arrest_counts = chi_crimes.groupby(['Primary Type', 'Arrest']).size()
arrest_type = type_arrest_counts.unstack()
arrest_type

In [None]:
# Restrict the data to the five least frequently recorded crime types.
least_common_types = ['GAMBLING', 'HUMAN TRAFFICKING', 'NON-CRIMINAL', 'OBSCENITY', 'PUBLIC INDECENCY']

# .copy() makes low_arrest an independent frame, so the column assignment
# below does not write into a slice of chi_crimes (SettingWithCopyWarning).
low_arrest = chi_crimes[chi_crimes['Primary Type'].isin(least_common_types)].copy()

# 'Primary Type' is categorical at this point; casting to plain strings keeps
# the grouped table limited to the types that are actually present.
low_arrest['Primary Type'] = low_arrest['Primary Type'].astype('string')

# Record counts per crime type, split by whether an arrest was made.
arrest_type = low_arrest.groupby(['Primary Type', 'Arrest']).size().unstack()

In [None]:
# Grouped bars: one group per crime type, one bar per arrest outcome.
low_ax = arrest_type.plot.bar(figsize=(10,6), rot=0)
low_ax.set_title('Least Common Crimes with Arrest Outcomes')
low_ax.set_xlabel('Crime Type')
low_ax.set_ylabel('Number of Records')
low_ax.legend(title='Arrested')
plt.show()

<h1>INSIGHTS:</h1>
<ol start='17' style='font-size: 19px'>
    <li>The crimes with the fewest records are Gambling, Human Trafficking, Non-Criminal, Obscenity and Public Indecency.</li>
    <li>Non-Criminal has the fewest records of all the types, with a combined count of only 4.</li>
    <li>Gambling has a 100% arrest rate: all 18 of its records resulted in an arrest.</li>
    <li>The opposite is true for Human Trafficking, which has a 0% arrest rate: none of its 24 records resulted in an arrest.</li>
</ol>
<hr>
<h2>8. Are the crimes in the most common locations domestic or non-domestic?</h2>

In [None]:
# Restrict the data to the five most common crime locations.
top_locations = ['STREET', 'APARTMENT', 'RESIDENCE', 'SIDEWALK', 'SMALL RETAIL STORE']

# .copy() makes dom_loc an independent frame, so the column assignment below
# does not write into a slice of chi_crimes (SettingWithCopyWarning).
dom_loc = chi_crimes[chi_crimes['Location Description'].isin(top_locations)].copy()

# 'Location Description' is categorical at this point; casting to plain
# strings keeps the grouped table limited to the selected locations.
dom_loc['Location Description'] = dom_loc['Location Description'].astype('string')

# Record counts per location, split by the 'Domestic' flag.
arresto_d = dom_loc.groupby(['Location Description', 'Domestic']).size().unstack()
arresto_d

In [None]:
# Grouped bars: one group per location, one bar per value of 'Domestic'.
arresto_d.plot(kind='bar', figsize=(10,6))
plt.title('Are the crimes in the most common locations domestic or non-domestic?')
# The rows are location descriptions and the columns are the 'Domestic' flag,
# so the copy-pasted 'Crime Type' / 'Arrested' labels were corrected.
plt.xlabel('Location')
plt.ylabel('Number of Records')
plt.legend(title='Domestic')
plt.xticks(rotation=0)
plt.show()

<h1>INSIGHTS:</h1>
<ol start='21' style='font-size: 19px'>
    <li>Most of the crimes happening in these locations are non-domestic.</li>
    <li>Apartments experience the most domestic crimes, with 22,370 recorded, and have the smallest gap between domestic and non-domestic crimes.</li>
    <li>Apartments and Residences have the most domestic crimes because this is where people live, so domestic crimes naturally occur in these settings.</li>
</ol>
<hr>
<h2>9. What are the most common descriptions for the crimes in Chicago?</h2>

In [None]:
# Six most frequent crime descriptions.
description_counts = chi_crimes['Description'].value_counts()
common_desc = description_counts.iloc[:6]
common_desc

In [None]:
# Horizontal count plot limited to (and ordered by) the most common
# descriptions computed above.
desc_fig, desc_ax = plt.subplots(figsize=(11,6))
sns.countplot(y='Description', data=chi_crimes, order=common_desc.index, ax=desc_ax)
plt.show()

<h1>INSIGHTS:</h1>
<ol start='24' style='font-size: 19px'>
    <li>'Simple' is the most common description for crimes, with 30,526 records given this description.</li>
    <li>This is followed by 'Domestic Battery Simple', with 19,810 records.</li>
</ol>
<hr>
<h2>10. Is there any difference in the number of crimes between weekdays and weekends? </h2>

In [None]:
# Number of records for each day-of-week number.
records_by_weekday = chi_crimes.groupby('DayOfWeek')
whichdays = records_by_weekday.size()
whichdays

In [None]:
# Day names in the order of the day-of-week numbers 0-6 (Monday first).
day_names = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

week_ax = whichdays.plot(kind='bar', figsize=(13, 6))
week_ax.set_title('Daily Crime Records Throughout the Week')
week_ax.set_xlabel('Day of Week')
week_ax.set_ylabel('Number of Records')
# Replace the numeric tick labels with the day names, kept horizontal.
week_ax.set_xticks(range(0, 7))
week_ax.set_xticklabels(day_names, rotation=0)

plt.show()

<h1>INSIGHTS:</h1>
<ol start='26' style='font-size: 19px'>
    <li>There is not much difference between the days of the week, as all of them stay over 35,000 records.</li>
    <li>Throughout the week, Friday has the highest number of records, with 37,498 crimes reported on that day of the week.</li>
    <li>This is followed by Monday, with 36,262 records.</li>
</ol>
<hr>
<h2>11. Are there more domestic crimes in 2024 or 2025? </h2>

In [None]:
# Split the records by year (the variable names are reused from the arrest
# analysis, but here they feed the domestic / non-domestic comparison).
arrest_2024 = chi_crimes.loc[chi_crimes['Year'] == 2024]
arrest_2025 = chi_crimes.loc[chi_crimes['Year'] == 2025]

# Pie-chart options shared by both years; only the start angle differs.
domestic_pie_style = dict(kind='pie', autopct='%1.1f%%', colors=['cornflowerblue', 'orange'])

# Share of domestic vs. non-domestic records in 2024
ar_2024 = arrest_2024['Domestic'].value_counts().plot(startangle=110, **domestic_pie_style)
ar_2024.set_title('Domestic VS Non-Domestic (2024)')
ar_2024.set_ylabel('')
plt.show()

# Share of domestic vs. non-domestic records in 2025
ar_2025 = arrest_2025['Domestic'].value_counts().plot(startangle=120, **domestic_pie_style)
ar_2025.set_title('Domestic VS Non-Domestic (2025)')
ar_2025.set_ylabel('')
plt.show()

<h1>INSIGHTS:</h1>
<ol start='29' style='font-size: 19px'>
    <li>2024 and 2025 are almost the same, with only a 0.8% difference.</li>
    <li>In both years, there are more non-domestic crimes than domestic ones: non-domestic crimes are still above 80% of all the records.</li>
    <li>The share of domestic crimes actually increases in 2025, even though that year has fewer crimes recorded.</li>
</ol>
<hr>
<h2>12. Which FBI Code has the most records for 2024?</h2>

In [None]:
# Ten most frequent FBI codes.
# NOTE(review): the question heading asks about 2024 only, but this counts
# records from every year in chi_crimes — confirm which scope is intended.
fbi_code_counts = chi_crimes['FBI Code'].value_counts()
code = fbi_code_counts.iloc[:10]
code

In [None]:
# Horizontal bar chart of the most common FBI codes.
code.plot(kind='barh', figsize=(13, 6))
plt.title('Most Common FBI Codes Recorded in Chicago')
# In a horizontal bar chart the counts run along the x-axis and the
# categories along the y-axis, so the two labels were corrected accordingly.
plt.xlabel('Number of Records')
plt.ylabel('FBI Code')
plt.xticks(rotation=0)
plt.show()

<h1>INSIGHTS:</h1>
<ol start='32' style='font-size: 19px'>
    <li>'06' is the most common FBI Code, with 60,380 records carrying that code.</li>
    <li>This is followed by '08B', with 37,090 records.</li>
</ol>
<hr>
<h2>13. Which crimes are most common in the most active blocks?</h2>

In [None]:
# Blocks selected as the most active ones for this question.
most_active_blocks = [
    '001XX N STATE ST',
    '0000X N STATE ST',
    '0000X W TERMINAL ST',
    '044XX N BROADWAY',
    '076XX S CICERO AVE',
    '011XX S CANAL ST',
]

# .copy() makes active_blocks an independent frame, so column assignments on
# it do not write into a slice of chi_crimes (SettingWithCopyWarning).
active_blocks = chi_crimes[chi_crimes['Block'].isin(most_active_blocks)].copy()

# 'Block' is categorical at this point; casting to plain strings keeps the
# grouped table limited to the selected blocks.
active_blocks['Block'] = active_blocks['Block'].astype('string')

# Record counts per crime type (rows) for each selected block (columns).
crime_count = active_blocks.groupby(['Block', 'Primary Type']).size().unstack(level=0)
crime_count

In [None]:
# 'Primary Type' is categorical at this point; casting to plain strings keeps
# the grouped table limited to the crime types present in these blocks.
active_blocks['Primary Type'] = active_blocks['Primary Type'].astype('string')

# Record counts per block (rows) for each crime type (columns).
active = active_blocks.groupby(['Block', 'Primary Type']).size().unstack()

# One stacked bar per block, with one segment per crime type.
active.plot(kind='bar', stacked=True, figsize=(14,8))
plt.title('Crimes in the Most Active Blocks')
plt.xlabel('Block')
plt.ylabel('Number of Records')
plt.xticks(rotation=0)
# Single legend, placed outside the axes. The earlier
# plt.legend(title='Arrested') call was dropped: it was replaced immediately
# by this one and its title did not describe the stacked crime types.
plt.legend(title='Primary Type', bbox_to_anchor = (1,1), loc='upper left')
plt.show()

<h1>INSIGHTS:</h1>
<ol start='34' style='font-size: 19px'>
    <li>All the blocks show the same pattern, with theft as their most common recorded crime.</li>
    <li>001XX N STATE ST is the most active block, with the highest number of thefts and the highest combined total: over 600 crimes in that block alone.</li>
</ol>
<hr>