# Crime Data

## API Data

### Import libraries

In [1]:
# import libraries
import pandas as pd
import geopandas as gpd
from shapely.geometry import shape
import requests
import warnings
from datetime import datetime, timedelta
warnings.filterwarnings('ignore')

### Prepare Data

#### Import

In [None]:
def create_sub_polygons(large_polygon, divisions=3):
    """
    Split the bounding box of a polygon into a regular grid of rectangles.

    Parameters:
    large_polygon (list): Points of the polygon; items 0 and 1 of each point
        are read as its two coordinates (named lat and lng here).
    divisions (int): Number of grid cells along each axis, giving
        divisions * divisions rectangles in total.

    Returns:
    list: One closed five-point ring per grid cell (the first point is
        repeated as the last), ordered by the first coordinate, then the second.
    """
    lat_values = [point[0] for point in large_polygon]
    lng_values = [point[1] for point in large_polygon]
    lat_min, lng_min = min(lat_values), min(lng_values)

    lat_step = (max(lat_values) - lat_min) / divisions
    lng_step = (max(lng_values) - lng_min) / divisions

    # Grid lines along each axis; consecutive pairs bound one cell
    lat_edges = [lat_min + k * lat_step for k in range(divisions + 1)]
    lng_edges = [lng_min + k * lng_step for k in range(divisions + 1)]

    return [
        [
            [lat_lo, lng_lo],
            [lat_lo, lng_hi],
            [lat_hi, lng_hi],
            [lat_hi, lng_lo],
            [lat_lo, lng_lo],
        ]
        for lat_lo, lat_hi in zip(lat_edges, lat_edges[1:])
        for lng_lo, lng_hi in zip(lng_edges, lng_edges[1:])
    ]

In [None]:
def _month_starts(start_date, end_date):
    """Return the first day of each month from start_date's month up to end_date."""
    months = []
    if start_date > end_date:
        return months

    # Normalise to day 1 first: stepping 31 days from e.g. Jan 31 lands in
    # March and would silently skip February.
    current = start_date.replace(day=1)
    while current <= end_date:
        months.append(current)
        # From day 1, +31 days always falls inside the following month
        current = (current + timedelta(days=31)).replace(day=1)
    return months


def fetch_crime_data(sub_polygons, start_date, end_date, endpoint="https://data.police.uk/api/crimes-street/all-crime", timeout=30):
    """
    Fetch crime data from the Police API for a set of sub-polygons over a date range.

    One GET request is sent per (sub-polygon, month) pair. Requests that fail
    (network error or non-200 status) are reported with print() and skipped,
    so the result may be incomplete rather than the whole run aborting.

    Parameters:
    sub_polygons (list): A list of sub-polygons, each a list of [lat, lng] pairs.
    start_date (datetime): The start date; its month is the first one fetched.
    end_date (datetime): The end date; months starting after it are not fetched.
    endpoint (str): The API endpoint for fetching crime data.
    timeout (float): Seconds to wait for each request before giving up.

    Returns:
    DataFrame: A DataFrame containing the collected crime data (empty if
        nothing was retrieved).
    """
    all_crimes = []

    # The month list does not depend on the polygon, so build it once
    months = [month.strftime("%Y-%m")
              for month in _month_starts(start_date, end_date)]
    if not sub_polygons or not months:
        return pd.DataFrame(all_crimes)

    # A session reuses the connection across the many requests
    with requests.Session() as session:
        for sub_polygon in sub_polygons:
            polygon_str = ":".join(f"{lat},{lng}" for lat, lng in sub_polygon)

            for date_str in months:
                # Make the API call using the sub-polygon
                api_url = f"{endpoint}?date={date_str}&poly={polygon_str}"

                try:
                    response = session.get(api_url, timeout=timeout)
                except requests.RequestException as exc:
                    # Keep what has been collected so far instead of crashing
                    print(f"Request failed for {date_str}: {exc}")
                    continue

                if response.status_code == 200:
                    all_crimes.extend(response.json())
                elif response.status_code == 503:
                    # NOTE(review): the API is understood to answer 503 when
                    # an area holds too many crimes - confirm; a larger
                    # `divisions` value would then be needed.
                    print(
                        f"Request exceeded limit for {date_str} in sub-polygon"
                    )
                else:
                    print(
                        f"Failed to retrieve data for {date_str}: {response.status_code}"
                    )

    # Convert the collected data to a DataFrame
    return pd.DataFrame(all_crimes)

In [None]:
def clean_crime_data(df):
    """
    Clean and extract relevant fields from the crime data DataFrame.

    The input frame is left untouched; a new frame is returned.

    Parameters:
    df (DataFrame): Raw crime data with 'month', 'category' and 'location'
        columns, where each 'location' value is a dict holding 'latitude'
        and 'longitude'.

    Returns:
    DataFrame: A new DataFrame with the columns 'month', 'category',
        'latitude' and 'longitude'. Coordinate values are passed through
        unchanged (presumably strings as delivered by the API - confirm
        before doing arithmetic on them).
    """
    relevant_columns = ['month', 'category', 'latitude', 'longitude']

    # Nothing was fetched: return an empty frame with the expected schema
    # instead of failing on the missing 'location' column.
    if df.empty:
        return pd.DataFrame(columns=relevant_columns)

    # Work on a copy so the caller's DataFrame is not modified
    cleaned = df[['month', 'category']].copy()

    # Extract latitude and longitude from the 'location' dictionary
    cleaned['latitude'] = df['location'].map(lambda loc: loc['latitude'])
    cleaned['longitude'] = df['location'].map(lambda loc: loc['longitude'])

    return cleaned[relevant_columns]

In [None]:
def wrangle_crime(large_polygon, divisions=3, start_date=datetime(2021, 6, 1), end_date=datetime(2023, 12, 31)):
    """
    Run the full crime-data pipeline: split the area, fetch, then clean.

    Parameters:
    large_polygon (list): A list of coordinates defining the large polygon.
    divisions (int): The number of divisions along each axis used to create
        sub-polygons. Default is 3.
    start_date (datetime): The start date for fetching data. Default is 1 June 2021.
    end_date (datetime): The end date for fetching data. Default is the last day of 2023.

    Returns:
    DataFrame: The result of clean_crime_data applied to the fetched data.
    """
    # Step 1: break the area into smaller request-sized pieces
    grid_cells = create_sub_polygons(large_polygon, divisions)

    # Step 2: download the raw records for every piece and month
    raw_crimes = fetch_crime_data(grid_cells, start_date, end_date)

    # Step 3: reduce the raw records to the fields used for analysis
    return clean_crime_data(raw_crimes)

In [None]:
# Bounding box intended to cover Merseyside, as latitude / longitude limits
north_lat, south_lat = 53.6967, 53.3700
west_lng, east_lng = -3.2603, -2.5500

# Closed rectangular ring: the first corner is repeated at the end
large_polygon = [
    [north_lat, west_lng],  # Northwest corner
    [south_lat, west_lng],  # Southwest corner
    [south_lat, east_lng],  # Southeast corner
    [north_lat, east_lng],  # Northeast corner
    [north_lat, west_lng],  # Back to the Northwest corner
]

# Fetch and clean the crime data for the whole box (default grid and dates)
crime_data_df = wrangle_crime(large_polygon)

# Show the first rows as a quick sanity check
crime_data_df.head()

## Downloaded Data

In [None]:
import os
import glob
import shutil


def copy_merseyside_csv_files(base_dir, dest_dir, pattern='*-merseyside-street.csv', folders=None):
    """
    Identify and copy merseyside-street CSV files to a new destination folder.

    Each folder is searched recursively. Files are copied flat into dest_dir,
    so a file with the same name as an earlier one overwrites it.

    Parameters:
    base_dir (str): The base directory containing the crime data folders.
    dest_dir (str): The destination directory to store the identified CSV files.
    pattern (str): The pattern to match CSV files in each subdirectory. Default is '*-merseyside-street.csv'.
    folders (iterable of str): Names of the subdirectories of base_dir to
        search. Default is the three archive folders
        '2013-01 to 2017-04', '2017-05 to 2020-04' and '2020-05 to 2023-12'.

    Returns:
    None
    """
    if folders is None:
        folders = (
            '2013-01 to 2017-04',
            '2017-05 to 2020-04',
            '2020-05 to 2023-12',
        )

    # Create the destination directory if it doesn't exist
    os.makedirs(dest_dir, exist_ok=True)

    for folder in folders:
        full_folder_path = os.path.join(base_dir, folder)
        print(f"Checking folder: {full_folder_path}")  # Debug print

        # Match CSV files in the current folder and its subdirectories;
        # sorted so the copy (and log) order is deterministic.
        csv_files = sorted(glob.glob(
            os.path.join(full_folder_path, '**', pattern), recursive=True))

        if not csv_files:
            print(
                f"No files found in {full_folder_path} with pattern {pattern}")
            continue

        for csv_path in csv_files:
            # Copy the file to the destination directory
            shutil.copy(csv_path, dest_dir)
            print(f"Copied {csv_path} to {dest_dir}")


# Example usage
# Folder that holds the downloaded crime data archives
base_directory = 'raw_data'
# Folder that receives the collected merseyside-street CSV files
destination_directory = 'raw_data/merseyside_csv_files'

copy_merseyside_csv_files(
    base_dir=base_directory,
    dest_dir=destination_directory,
)