In [None]:
# Read the 2023 crimes CSV export into a pandas DataFrame and preview it.
import pandas as pd

df = pd.read_csv("Crimes_-_2023.csv")
# head() returns the first five rows by default.
print(df.head())

In [None]:
# Basic information about the dataset (columns, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() would add a stray "None" line
# after the report.
df.info()

In [None]:
# DataFrame.shape is an attribute holding a (row_count, column_count) tuple.
dataset_shape = df.shape
print(dataset_shape)

In [None]:
# Descriptive statistics for the numerical columns of the DataFrame:
# count, mean, standard deviation, minimum, quartiles and maximum.
# Useful as a first look at how the numerical data is distributed.
numeric_summary = df.describe()
print(numeric_summary)

In [None]:
import sqlite3
from contextlib import closing

import pandas as pd
import matplotlib.pyplot as plt

# Crime Type Analysis: count the rows of the `crimes` table per "Primary Type",
# most frequent type first.
# NOTE(review): this assumes crime_data.db already contains a `crimes` table;
# no visible cell creates it — confirm it is built before this cell runs.
crime_type_query = """
    SELECT "Primary Type", COUNT(*) AS "Crime Count"
    FROM crimes
    GROUP BY "Primary Type"
    ORDER BY "Crime Count" DESC
"""

# closing() guarantees the connection is closed even when the query raises.
# (A bare `with sqlite3.connect(...)` only manages transactions; it does not
# close the connection.)
with closing(sqlite3.connect('crime_data.db')) as conn:
    crime_type_analysis_df = pd.read_sql_query(crime_type_query, conn)

# Express each crime type's count as a percentage of all counted crimes.
total_crimes = crime_type_analysis_df['Crime Count'].sum()
crime_type_analysis_df['Crime Distribution (%)'] = (
    crime_type_analysis_df['Crime Count'] / total_crimes
) * 100

# Bar chart of the percentage share per crime type.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(
    crime_type_analysis_df['Primary Type'],
    crime_type_analysis_df['Crime Distribution (%)'],
)
ax.set_xlabel('Crime Type')
ax.set_ylabel('Distribution (%)')
ax.set_title('Distribution of Crime Types')
# The labels are rotated vertically so neighbouring category names do not overlap.
ax.tick_params(axis='x', rotation=90)
fig.tight_layout()
plt.show()

In [None]:
import sqlite3
from contextlib import closing

import pandas as pd

# Location-Based Analysis: count the rows of the `crimes` table per
# "Location Description", most frequent location first.
# NOTE(review): this assumes crime_data.db already contains a `crimes` table;
# no visible cell creates it — confirm it is built before this cell runs.
location_based_query = """
    SELECT "Location Description", COUNT(*) AS "Crime Count"
    FROM crimes
    GROUP BY "Location Description"
    ORDER BY "Crime Count" DESC
"""

# closing() guarantees the connection is closed even when the query raises.
# (A bare `with sqlite3.connect(...)` only manages transactions; it does not
# close the connection.)
with closing(sqlite3.connect('crime_data.db')) as conn:
    location_based_analysis_df = pd.read_sql_query(location_based_query, conn)

# Display the results
print("\nLocation-Based Analysis:")
print(location_based_analysis_df)

In [None]:
from pyspark.sql import SparkSession
import folium

# Spatial analysis: load the crime coordinates with Spark, then plot every
# located crime on a Folium map.

# Initialize Spark session
spark = SparkSession.builder \
    .appName("SpatialAnalysis") \
    .getOrCreate()

try:
    # The Spark DataFrame gets its own name: rebinding `df` here would replace
    # the pandas DataFrame loaded in the first cell with a Spark DataFrame
    # that can no longer be used once the session below is stopped.
    crimes_spark_df = spark.read.csv('Crimes_-_2023.csv', header=True, inferSchema=True)

    # Keep only the coordinate columns.
    location_df = crimes_spark_df.select('Latitude', 'Longitude')

    # Drop rows where either coordinate is missing.
    location_df = location_df.filter(
        location_df['Latitude'].isNotNull() & location_df['Longitude'].isNotNull()
    )

    # Convert to pandas for visualization; this materializes all remaining rows
    # locally.
    location_pd_df = location_df.toPandas()
finally:
    # Stop the Spark session even when loading or conversion fails.
    spark.stop()

# Create a Folium map centered on the mean latitude and longitude.
crime_map = folium.Map(
    location=[location_pd_df['Latitude'].mean(), location_pd_df['Longitude'].mean()],
    zoom_start=10,
)

# Add one marker per crime location. Iterating the two columns directly avoids
# building a pandas Series for every row, which is what iterrows() does.
# NOTE(review): one marker is added per row; if the CSV is large the rendered
# map may become very heavy — consider sampling or a clustered/heat-map layer.
for latitude, longitude in zip(location_pd_df['Latitude'], location_pd_df['Longitude']):
    folium.CircleMarker(
        location=[latitude, longitude],
        radius=2,
        color='red',
        fill=True,
        fill_color='red',
    ).add_to(crime_map)

# Display the map
crime_map