# Predicting Property Prices


## Import Libraries


In [1]:
# import libraries
import pandas as pd
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime, timedelta
import geopandas as gpd
from shapely.geometry import shape
import requests
import time
import warnings

In [2]:
warnings.filterwarnings('ignore')

## Prepare Data


### Utility Variables


In [3]:
# relevant area codes
area_codes = [
    'E08000011', 'E08000012', 'E11000002', 'E08000014', 'E08000013',
    'E08000007', 'E06000007', 'E08000010', 'E08000015'
]

# Regions in Merseyside
regions = [
    'Prenton', 'Newton-Le-Willows', 'Birkenhead',
    'Wirral', 'Bootle', 'St Helens', 'Wallasey', 'Southport',
    'Prescot', 'Wigan', 'Widnes', 'Neston', 'Warrington',
    'Ellesmere Port', 'Wilmslow', 'Coniston', 'Stockport', 'Northwood',
    'Crewe', 'Winsford', 'Merseyside', 'Sefton', 'Wirral', 'Liverpool', 'Knowsley'
]

### Utility Functions


In [4]:
def standardise_column(df, column_name, mapping, remove_values=None, new_dtype='category'):
    """
    Standardise a DataFrame column by applying a mapping, optionally removing specific values, 
    and changing the data type.

    Parameters:
    df (DataFrame): The input DataFrame.
    column_name (str): The name of the column to standardize.
    mapping (dict): A dictionary mapping old values to new values.
    remove_values (list, optional): A list of values to remove from the column. Default is None.
    new_dtype (str, optional): The new data type for the column. Default is 'category'.

    Returns:
    DataFrame: The DataFrame with the standardized column.
    """

    # Optionally remove rows with specific values
    if remove_values:
        df = df[~df[column_name].isin(remove_values)]

    # Apply the mapping to replace old values with new values
    df[column_name] = df[column_name].replace(mapping).astype(new_dtype)

    return df

In [5]:
def rename_columns(df, new_column_names):
    """
    Rename the columns of a DataFrame using a provided list of new column names.

    Parameters:
    df (DataFrame): The input DataFrame whose columns need to be renamed.
    new_column_names (list): A list of new column names.

    Returns:
    DataFrame: The DataFrame with renamed columns.
    """
    # Check if the number of new column names matches the number of columns in the DataFrame
    if len(new_column_names) != df.shape[1]:
        raise ValueError(
            "The number of new column names must match the number of columns in the DataFrame.")

    # Rename the columns
    df.columns = new_column_names

    return df

### Import


#### Prices Paid Data


In [6]:
def bulk_lookup_postcodes(postcodes, batch_size=100):
    """
    Perform a bulk lookup for postcodes using the postcodes.io API.

    Parameters:
    postcodes (list): A list of postcodes to lookup.
    batch_size (int): The number of postcodes to include in each batch (max 100).

    Returns:
    dict: A dictionary mapping postcodes to their respective data, including latitude, longitude, and termination data if applicable.
    """
    url = "https://api.postcodes.io/postcodes"
    headers = {'Content-Type': 'application/json'}
    results = {}

    # Process the postcodes in batches
    for i in range(0, len(postcodes), batch_size):
        batch = postcodes[i:i+batch_size]
        data = {"postcodes": batch}

        response = requests.post(url, json=data, headers=headers)

        if response.status_code == 200:
            response_data = response.json()
            for result in response_data['result']:
                postcode = result['query']
                if result['result'] is not None:
                    results[postcode] = {
                        'longitude': result['result']['longitude'],
                        'latitude': result['result']['latitude'],
                        'is_terminated': False
                    }
                else:
                    # If the postcode is terminated, set to None and mark as terminated
                    results[postcode] = {
                        'longitude': None,
                        'latitude': None,
                        'is_terminated': True
                    }
        else:
            print(
                f"Failed to retrieve data for batch starting at {i}: {response.status_code}")
        time.sleep(0.1)  # To avoid rate limiting

    return results

In [7]:

def handle_terminated_postcodes(postcode_data):
    """
    Handle postcodes that are marked as terminated by checking the terminated postcode API.

    Parameters:
    postcode_data (dict): A dictionary of postcodes with their data, including termination status.

    Returns:
    dict: Updated dictionary with longitude and latitude for terminated postcodes if available.
    """
    terminated_postcodes = [postcode for postcode,
                            data in postcode_data.items() if data['is_terminated']]

    if not terminated_postcodes:
        return postcode_data  # No terminated postcodes to handle

    # Query terminated postcodes in bulk (max 100 at a time)
    url = "https://api.postcodes.io/terminated_postcodes"
    headers = {'Content-Type': 'application/json'}
    batch_size = 100

    for i in range(0, len(terminated_postcodes), batch_size):
        batch = terminated_postcodes[i:i+batch_size]
        data = {"postcodes": batch}

        response = requests.post(url, json=data, headers=headers)

        if response.status_code == 200:
            response_data = response.json()
            for result in response_data['result']:
                postcode = result['query']
                if result['result'] is not None:
                    postcode_data[postcode]['longitude'] = result['result']['longitude']
                    postcode_data[postcode]['latitude'] = result['result']['latitude']
                else:
                    # If still no data, keep as None
                    print(f"No data found for terminated postcode: {postcode}")
        else:
            print(
                f"Failed to retrieve data for terminated postcodes batch starting at {i}: {response.status_code}")
        time.sleep(0.1)  # To avoid rate limiting

    return postcode_data

In [8]:
def add_long_lat_columns(df, postcode_column='postcode'):
    """
    Add longitude and latitude columns to the DataFrame based on the postcode column using Bulk Lookup.

    Parameters:
    df (DataFrame): The input DataFrame with a postcode column.
    postcode_column (str): The name of the column containing postcodes. Default is 'postcode'.

    Returns:
    DataFrame: The DataFrame with added 'longitude' and 'latitude' columns.
    """
    postcodes = df[postcode_column].unique().tolist()
    postcode_data = bulk_lookup_postcodes(postcodes)
    postcode_data = handle_terminated_postcodes(postcode_data)

    # Map the longitude and latitude back to the original DataFrame
    df['longitude'] = df[postcode_column].map(
        lambda x: postcode_data[x]['longitude'])
    df['latitude'] = df[postcode_column].map(
        lambda x: postcode_data[x]['latitude'])

    return df

In [9]:
# define wrangle function for prices paid data
def wrangle_prices_paid(filepath):

    # import file
    df = pd.read_csv(filepath)

    # drop complete duplicates from house_data
    df = df.drop_duplicates()

    # add column names to df
    column_names = [
        'transaction_id', 'price', 'transfer_date',
        'postcode', 'property_type', 'is_old_or_new',
        'property_tenure', 'house_number_or_name', 'unit_number',
        'street', 'locality', 'town', 'district', 'county',
        'ppd_transaction_category', 'record_status_monthly_file_only'
    ]
    df = rename_columns(df, column_names)
    
    # convert transfer date to datetime
    df['transfer_date'] = pd.to_datetime(df['transfer_date'])
    
    # sort data by teansaction date
    df = df.sort_values('transfer_date', ascending=True)
    
    # filter data for freehold transactions in merseyside from 2013 to 2023
    df = df[(df['county'] == 'MERSEYSIDE') & (
        df['transfer_date'].dt.year >= 2013) & (df['transfer_date'].dt.year <= 2023)]

    # convert price to float type
    df['price'] = df['price'].astype(float)
    
    # remove outliers in price by values in the bottom and top 5% of properties
    # low, high = df['price'].quantile([0.05, 0.95])
    # mask_area = df['price'].between(low, high)
    # df = df[mask_area]

    # convert ppd_transaction_category to category type
    df['ppd_transaction_category'] = df['ppd_transaction_category'].astype(
        'category')

    # define mappings for replacement
    property_type_mapping = {'T': 'Terraced', 'D': 'Detached', 'F': 'Flats/Maisonettes',
                             'S': 'Semi-Detached', 'O': 'Other'}
    old_or_new_mapping = {'N': 'Old', 'Y': 'New'}
    property_tenure_mapping = {'F': 'Freehold', 'L': 'Leasehold'}

    # standardize 'property_type' column
    df = standardise_column(df, 'property_type', property_type_mapping)
    # standardize 'is_old_or_new' column
    df = standardise_column(df, 'is_old_or_new', old_or_new_mapping)
    # standardize 'property_tenure' column and remove rows with 'U' before standardising
    df = standardise_column(df, 'property_tenure',
                            property_tenure_mapping, remove_values=['U'])

    # convert capital case columns to title case
    df['town'] = df['town'].str.title()
    df['district'] = df['district'].str.title()
    df['county'] = df['county'].str.title()
    
    # exclude rows with null postcode values
    df = df[~df['postcode'].isnull()]
    
    # created latitude and longitude columns
    df = add_long_lat_columns(df)

    # drop redundant columns
    df.drop(columns=['house_number_or_name', 'unit_number', 'locality',
                     'street', 'record_status_monthly_file_only', 'postcode'],
            inplace=True
            )

    return df

In [10]:
# Wrangle the prices paid data
prices_paid_data_df = wrangle_prices_paid(
    filepath="raw_data/prices_paid.csv")

# Display the first few rows to verify
prices_paid_data_df.head()

Failed to retrieve data for terminated postcodes batch starting at 0: 404
Failed to retrieve data for terminated postcodes batch starting at 100: 404
Failed to retrieve data for terminated postcodes batch starting at 200: 404


Unnamed: 0,transaction_id,price,transfer_date,property_type,is_old_or_new,property_tenure,town,district,county,ppd_transaction_category,longitude,latitude
18351985,{EF89E3A8-2BD1-4347-8B9B-F2CEEB2E62DC},75000.0,2013-01-02,Flats/Maisonettes,Old,Leasehold,Prenton,Wirral,Merseyside,A,-3.038801,53.384498
18251630,{CD1FD346-02E2-40B9-AD20-AF02A78999D1},113000.0,2013-01-02,Semi-Detached,Old,Freehold,Liverpool,Sefton,Merseyside,A,-2.946158,53.519373
18458777,{3008D681-978E-4FB2-B152-5077F80C64CF},79500.0,2013-01-02,Terraced,Old,Freehold,Birkenhead,Wirral,Merseyside,A,-3.019912,53.376715
18512520,{554C7E6D-FB60-4BF3-AEF0-F18802D4C110},385000.0,2013-01-02,Detached,Old,Freehold,Newton-Le-Willows,St Helens,Merseyside,A,-2.635595,53.479038
18275462,{3A27DE8C-0D42-41CC-8501-7367F7E98993},115000.0,2013-01-02,Semi-Detached,Old,Freehold,Liverpool,Liverpool,Merseyside,A,-2.904433,53.412073


#### Employment Data


In [11]:
# define wrangle function for employment data
def wrangle_employment(filepath):

    # import file
    df = pd.read_csv(filepath)

    # add column names to df
    column_names = ['area_code', 'area_name', 'year', 'employment_rate',
                    'confidence_interval_lower', 'confidence_interval_upper']
    df = rename_columns(df, column_names)
    
    # filter for employment rate from 2013 to 2023
    df = df[(df['year'] >= 2013) & (df['year'] <= 2023)]

    # relevant columns
    relevant_columns = ['area_code', 'area_name', 'year', 'employment_rate']
    # select relevant columns
    df = df[relevant_columns]

    # filter for relevant area codes and names
    df = df[(df['area_name'].isin(
        regions)) | (df['area_code'].isin(
            area_codes))].sort_values('year')
    
    # Convert the year column to string, then to datetime
    df['year'] = df['year'].astype(str)
    df['year'] = pd.to_datetime(
        df['year'], format='%Y')
    # to keep only the year part
    df['year'] = df['year'].dt.year
        
    # Convert employment rate from percentatage to rates
    df['employment_rate'] = df['employment_rate']/100
    
    # aggregate the data
    df = df.groupby(['area_code', 'area_name', 'year']).agg({
        'employment_rate': 'mean'
    }).reset_index()
    
    # drop rows with any missing values
    df = df.dropna()
    
    return df

In [12]:
# Wrangle the employment data
employment_data_df = wrangle_employment(filepath="raw_data/employment_data.csv")

# Display the first few rows to verify
employment_data_df.head()

Unnamed: 0,area_code,area_name,year,employment_rate
0,E06000007,Warrington,2013,0.782
1,E06000007,Warrington,2014,0.779
2,E06000007,Warrington,2015,0.779
3,E06000007,Warrington,2016,0.762
4,E06000007,Warrington,2017,0.774


#### UK HPI Data

In [13]:
# define wrangle function for hpi data
def wrangle_hpi(filepath):

    # import file
    df = pd.read_csv(filepath)
    
    # select relevant columns
    relevant_columns = [
        'Date', 'RegionName', 'AreaCode', 'AveragePrice', 'Index', '1m%Change', '12m%Change', 'SalesVolume'
    ]
    df = df[relevant_columns]
    
    # change column names
    column_names = ['date', 'region_name', 'area_code', 'average_price',
                    'index', '1m%_change', '12m%_change', 'sales_volume']
    df = rename_columns(df, column_names)
    
    # change date to dattime instead of object
    df['date'] = pd.to_datetime(df['date'])
    
    # Filter for the date range
    df = df[(df['date'] >= '2013-01-01') & (
        df['date'] <= '2023-12-31')]
    
    # filter for relevant area codes and names
    df = df[(df['region_name'].isin(
        regions)) | (df['area_code'].isin(
            area_codes))].sort_values('date')
    
    # drop complete duplicates
    df = df.drop_duplicates()
    
    # drop rows with any missing values
    df = df.dropna()
    
    return df

In [14]:
# Wrangle the HPI data
hpi_data_df = wrangle_hpi(filepath="raw_data/uk_hpi.csv")

# Display the first few rows to verify
hpi_data_df.head()

Unnamed: 0,date,region_name,area_code,average_price,index,1m%_change,12m%_change,sales_volume
86092,2013-01-01,Knowsley,E08000011,105390.6729,97.958916,0.304925,-1.857656,67.0
86105,2013-01-01,Liverpool,E08000012,105140.5433,93.986552,-0.314031,-2.0034,292.0
86115,2013-01-01,Merseyside,E11000002,118307.79,95.814439,-0.183908,-1.070033,891.0
86196,2013-01-01,Sefton,E08000014,135940.7468,98.603056,0.431996,-0.460236,176.0
86225,2013-01-01,St Helens,E08000013,107387.2971,96.951534,0.002608,0.660812,103.0


#### Income Data

In [15]:
# define wrangle function for hpi data
def wrangle_income(filepath):

    # import file
    df = pd.read_csv(filepath)
    
    # change column names
    column_names = ['area_code', 'area_name', 'year',
                        'gross_median_weekly_pay', 'confidence_interval_lower', 'confidence_interval_upper']
    df = rename_columns(df, column_names)
    

    # relevant columns
    relevant_columns = ['area_code', 'area_name',
                        'year', 'gross_median_weekly_pay']
    # select relevant columns
    df = df[relevant_columns]
    
    
    # Convert the year column to string, then to datetime
    df['year'] = df['year'].astype(str)
    df['year'] = pd.to_datetime(
        df['year'], format='%Y')

    # to keep only the year part
    df['year'] = df['year'].dt.year

    # filter for relevant area codes and names
    df = df[(df['area_name'].isin(
        regions)) | (df['area_code'].isin(
            area_codes))].sort_values('year')
    
    # aggregate the data
    df = df.groupby(['area_code', 'area_name', 'year']).agg({
        'gross_median_weekly_pay': 'mean'
    }).reset_index()

    # drop rows with any missing values
    df = df.dropna()
    
    return df

In [16]:
# Wrangle the income data
income_data_df = wrangle_income(filepath= "raw_data/income_data.csv")

# Display the first few rows to verify
income_data_df.head()

Unnamed: 0,area_code,area_name,year,gross_median_weekly_pay
0,E06000007,Warrington,2008,410.2
1,E06000007,Warrington,2009,424.2
2,E06000007,Warrington,2010,428.5
3,E06000007,Warrington,2011,402.7
4,E06000007,Warrington,2012,411.8


#### Crime Data

In [17]:
def create_sub_polygons(large_polygon, divisions=3):
    """
    Divide a large polygon into smaller sub-polygons.

    Parameters:
    large_polygon (list): A list of coordinates defining the large polygon.
    divisions (int): The number of divisions along each axis.

    Returns:
    list: A list of sub-polygons.
    """
    lat_min, lat_max = min(pt[0] for pt in large_polygon), max(
        pt[0] for pt in large_polygon)
    lng_min, lng_max = min(pt[1] for pt in large_polygon), max(
        pt[1] for pt in large_polygon)

    lat_step = (lat_max - lat_min) / divisions
    lng_step = (lng_max - lng_min) / divisions

    sub_polygons = []
    for i in range(divisions):
        for j in range(divisions):
            sub_polygon = [
                [lat_min + i * lat_step, lng_min + j * lng_step],
                [lat_min + i * lat_step, lng_min + (j + 1) * lng_step],
                [lat_min + (i + 1) * lat_step, lng_min + (j + 1) * lng_step],
                [lat_min + (i + 1) * lat_step, lng_min + j * lng_step],
                [lat_min + i * lat_step, lng_min + j * lng_step],
            ]
            sub_polygons.append(sub_polygon)

    return sub_polygons

In [18]:
def fetch_crime_data(sub_polygons, start_date, end_date, endpoint="https://data.police.uk/api/crimes-street/all-crime"):
    """
    Fetch crime data from the Police API for a set of sub-polygons over a date range.

    Parameters:
    sub_polygons (list): A list of sub-polygons.
    start_date (datetime): The start date for fetching data.
    end_date (datetime): The end date for fetching data.
    endpoint (str): The API endpoint for fetching crime data.

    Returns:
    DataFrame: A DataFrame containing the collected crime data.
    """
    all_crimes = []

    for sub_polygon in sub_polygons:
        polygon_str = ":".join([f"{lat},{lng}" for lat, lng in sub_polygon])

        current_date = start_date
        while current_date <= end_date:
            date_str = current_date.strftime("%Y-%m")

            # Make the API call using the sub-polygon
            api_url = f"{endpoint}?date={date_str}&poly={polygon_str}"

            # Send the request to the API
            response = requests.get(api_url)

            # Check if the request was successful
            if response.status_code == 200:
                crimes = response.json()
                all_crimes.extend(crimes)
            elif response.status_code == 503:
                print(
                    f"Request exceeded limit for {date_str} in sub-polygon"
                )
            else:
                print (
                    f"Failed to retrieve data for {date_str}: {response.status_code}"
                )
                
            # Move to the next month
            current_date += timedelta(days=31)
            # Ensure we start at the beginning of the next month
            current_date = current_date.replace(day=1)

    # Convert the collected data to a DataFrame
    df = pd.DataFrame(all_crimes)
    return df

In [19]:
def clean_crime_data(df):
    """
    Clean and extract relevant fields from the crime data DataFrame.

    Parameters:
    df (DataFrame): The input DataFrame containing raw crime data.

    Returns:
    DataFrame: A cleaned DataFrame with only relevant fields.
    """
    # Extract latitude and longitude from the 'location' dictionary
    df['latitude'] = df['location'].apply(
        lambda x: x['latitude'])
    df['longitude'] = df['location'].apply(
        lambda x: x['longitude'])

    # Extract the 'id' and 'name' from the 'street' dictionary within the 'location' dictionary
    df['street_id'] = df['location'].apply(
        lambda x: x['street']['id'])
    df['street_name'] = df['location'].apply(
        lambda x: x['street']['name'])

    # Extract 'category' and 'date' from the 'outcome_status' dictionary
    df['outcome_category'] = df['outcome_status'].apply(
        lambda x: x['category'] if pd.notnull(x) else None)
    df['outcome_date'] = df['outcome_status'].apply(
        lambda x: x['date'] if pd.notnull(x) else None)

    # Drop the original 'location' and 'outcome_status' columns if they're no longer needed
    df.drop(columns=['location', 'outcome_status'], inplace=True)

    # Extract only relevant columns for crime location and category
    df = df[[
        'month', 'category', 'latitude', 'longitude']]

    return df


In [20]:
def wrangle_crime(large_polygon, divisions=3, start_date=datetime(2021, 6, 1), end_date=datetime(2023, 12, 31)):
    """
    Complete wrangling process for crime data, including fetching and cleaning.

    Parameters:
    large_polygon (list): A list of coordinates defining the large polygon.
    divisions (int): The number of divisions to create sub-polygons. Default is 3.
    start_date (datetime): The start date for fetching data. Default is June 2021.
    end_date (datetime): The end date for fetching data. Default is he last day of 2023.

    Returns:
    DataFrame: A cleaned DataFrame ready for analysis.
    """
    # Divide the large polygon into smaller sub-polygons
    sub_polygons = create_sub_polygons(large_polygon, divisions)

    # Fetch the crime data
    df = fetch_crime_data(sub_polygons, start_date, end_date)

    # Clean the crime data
    df = clean_crime_data(df)

    return df

In [21]:
# Define a large polygon that covers Merseyside
large_polygon = [
    [53.6967, -3.2603],  # Northwest corner (near Southport)
    [53.3700, -3.2603],  # Southwest corner (near Wirral)
    [53.3700, -2.5500],  # Southeast corner (near Warrington)
    [53.6967, -2.5500],  # Northeast corner (near Wigan)
    [53.6967, -3.2603],  # Closing the polygon back at the Northwest corner
]

# Wrangle the crime data
crime_data_df = wrangle_crime(large_polygon)

# Display the first few rows to verify
crime_data_df.head()

Failed to retrieve data for 2021-06: 404
Failed to retrieve data for 2021-06: 404
Failed to retrieve data for 2021-06: 404
Failed to retrieve data for 2021-06: 404
Failed to retrieve data for 2021-06: 404
Failed to retrieve data for 2021-06: 404
Failed to retrieve data for 2021-06: 404
Failed to retrieve data for 2021-06: 404
Failed to retrieve data for 2021-06: 404


Unnamed: 0,month,category,latitude,longitude
0,2021-07,anti-social-behaviour,53.384044,-3.04957
1,2021-07,anti-social-behaviour,53.404007,-3.116186
2,2021-07,anti-social-behaviour,53.403171,-3.058976
3,2021-07,anti-social-behaviour,53.422767,-3.0323
4,2021-07,anti-social-behaviour,53.394599,-3.025633


#### Flood Data

In [22]:
def fetch_flood_data(api_url="https://environment.data.gov.uk/flood-monitoring/id/floodAreas?&_limit=5000"):
    """
    Fetch flood area data from the Environment Agency API.

    Parameters:
    api_url (str): The API endpoint to fetch flood data. Default is set to the flood areas endpoint.

    Returns:
    list: A list of dictionaries containing flood area data.
    """
    response = requests.get(url=api_url)

    if response.status_code == 200:
        # Convert the response to JSON format and extract relevant data
        flood_data = response.json()
        flood_areas = flood_data.get('items', [])
        return flood_areas
    else:
        print(f"Failed to retrieve data: {response.status_code}")
        return []


In [23]:
def prepare_flood_data(flood_areas):
    """
    Prepare and clean flood area data by extracting relevant fields and converting to a DataFrame.

    Parameters:
    flood_areas (list): A list of dictionaries containing flood area data.

    Returns:
    DataFrame: A DataFrame containing the prepared flood area data.
    """
    flood_areas_list = []
    for area in flood_areas:
        flood_areas_list.append({
            'county': area.get('county'),
            'description': area.get('description'),
            'eaAreaName': area.get('eaAreaName'),
            'lat': area.get('lat'),
            'long': area.get('long'),
            'riverOrSea': area.get('riverOrSea'),
            'polygon': area.get('polygon')
        })

    # Convert to DataFrame
    df = pd.DataFrame(flood_areas_list)

    # Rename columns to more descriptive names
    column_names = ['county', 'text_description',
                    'area_name', 'lat', 'lon', 'water_source', 'polygon']
    df = rename_columns(df, column_names)
    
    df = df[df['area_name'].str.contains(
        'mersey', case=False, na=False)]
    
    return df

In [24]:
def fetch_geojson(uri):
    """
    Fetch and parse GeoJSON data from a polygon URI.

    Parameters:
    uri (str): The URI pointing to the GeoJSON polygon.

    Returns:
    geometry: A Shapely geometry object created from the GeoJSON data.
    """
    response = requests.get(uri)
    if response.status_code == 200:
        geojson = response.json()
        # Access the 'geometry' from the first feature
        if 'features' in geojson and len(geojson['features']) > 0:
            geometry = geojson['features'][0]['geometry']
            return shape(geometry)  # Convert GeoJSON to Shapely geometry
        else:
            print(f"No features found in GeoJSON data from {uri}")
            return None
    else:
        print(f"Failed to retrieve GeoJSON data from {uri}")
        return None


In [25]:

def convert_to_geodataframe(flood_areas_df):
    """
    Convert the DataFrame to a GeoDataFrame by applying the GeoJSON fetching function.

    Parameters:
    flood_areas_df (DataFrame): The input DataFrame containing flood area data.

    Returns:
    GeoDataFrame: A GeoDataFrame with Shapely geometries for each flood area.
    """
    # Apply the fetch_geojson function to the 'polygon' column
    flood_areas_df['geometry'] = flood_areas_df['polygon'].apply(fetch_geojson)

    # Convert the DataFrame to a GeoDataFrame
    flood_areas_gdf = gpd.GeoDataFrame(flood_areas_df, geometry='geometry')

    return flood_areas_gdf



In [26]:
def wrangle_flood_data():
    """
    Complete wrangling process for flood data, including fetching, cleaning, filtering, and conversion to GeoDataFrame.

    Returns:
    GeoDataFrame: A cleaned and filtered GeoDataFrame ready for spatial analysis.
    """
    # Fetch flood data from the API
    flood_areas = fetch_flood_data()

    # Prepare and clean the data
    df = prepare_flood_data(flood_areas)

    # Convert to a GeoDataFrame
    flood_areas_gdf = convert_to_geodataframe(df)

    return flood_areas_gdf


In [27]:
# Wrangle the flood data and prepare it for analysis
flood_data_gdf = wrangle_flood_data()

# Display the first few rows to verify
flood_data_gdf.head()

Unnamed: 0,county,text_description,area_name,lat,lon,water_source,polygon,geometry
28,Manchester,Land adjacent to the River Mersey at West Dids...,Gtr Mancs Mersey and Ches,53.411,-2.2416,River Mersey,http://environment.data.gov.uk/flood-monitorin...,"POLYGON ((-2.24472 53.4153, -2.24508 53.41532,..."
29,"Cheshire East, Manchester, Salford, Stockport,...",The Middle River Mersey catchment includes Mic...,Gtr Mancs Mersey and Ches,53.43479,-2.31497,River Mersey,http://environment.data.gov.uk/flood-monitorin...,"MULTIPOLYGON (((-2.10956 53.32586, -2.10994 53..."
51,Manchester,Areas at risk include land and properties arou...,Gtr Mancs Mersey and Ches,53.51869,-2.22357,River Irk,http://environment.data.gov.uk/flood-monitorin...,"MULTIPOLYGON (((-2.22293 53.51522, -2.22231 53..."
52,"Bolton, Bury, Manchester, Oldham, Rochdale, Sa...",The Lower River Irwell catchment also includes...,Gtr Mancs Mersey and Ches,53.48905,-2.28848,River Irwell,http://environment.data.gov.uk/flood-monitorin...,"MULTIPOLYGON (((-2.23183 53.46984, -2.23233 53..."
64,"Manchester, Stockport","Areas in the locality of Mauldeth Road, includ...",Gtr Mancs Mersey and Ches,53.43947,-2.23322,Cringle Brook,http://environment.data.gov.uk/flood-monitorin...,"MULTIPOLYGON (((-2.22034 53.43457, -2.22038 53..."


### Merge Data

In [28]:
def merge_dataframes(prices_paid_df, employment_df, hpi_df, income_df, crime_df, flood_gdf):
    """
    Merge multiple dataframes and a geodataframe on relevant keys and spatial relationships.

    Parameters:
    prices_paid_df (DataFrame): Cleaned prices paid data.
    employment_df (DataFrame): Employment data.
    hpi_df (DataFrame): House price index (HPI) data.
    income_df (DataFrame): Income data.
    crime_df (DataFrame): Crime data.
    flood_gdf (GeoDataFrame): Flood risk area data.

    Returns:
    DataFrame: A merged dataframe combining all relevant data.
    """
    # Merge employment data based on town and year
    prices_paid_df['year'] = prices_paid_df['transfer_date'].dt.year
    merged_df = pd.merge(prices_paid_df, employment_df, left_on=[
                        'town', 'year'], right_on=['area_name', 'year'], how='left')

    # Merge HPI data based on area_code and date (rounding date to nearest month)
    merged_df = pd.merge(merged_df, hpi_df, left_on=[
                         'district', 'transfer_date'], right_on=['region_name', 'date'], how='left')

    # Merge income data based on area_code and year
    merged_df = pd.merge(merged_df, income_df, left_on=[
                        'district', 'year'], right_on=['area_name', 'year'], how='left')

    # Merge crime data based on nearest location (latitude and longitude)
    # For this, we'll need to create spatial points from the latitude and longitude
    prices_paid_gdf = gpd.GeoDataFrame(merged_df, geometry=gpd.points_from_xy(
        merged_df.longitude, merged_df.latitude))
    crime_gdf = gpd.GeoDataFrame(crime_df, geometry=gpd.points_from_xy(
        crime_df.longitude, crime_df.latitude))

    # Spatial join to find the nearest crime data points
    merged_gdf = gpd.sjoin_nearest(
        prices_paid_gdf, crime_gdf, how='left', distance_col="crime_distance")

    # Merge flood risk area data based on spatial join
    merged_gdf = gpd.sjoin(merged_gdf, flood_gdf, how='left', op='intersects')

    # Convert the final GeoDataFrame back to a DataFrame if needed
    final_df = pd.DataFrame(merged_gdf.drop(columns='geometry'))

    return final_df


# Example usage
merged_data = merge_dataframes(
    prices_paid_df, employment_df, hpi_df, income_df, crime_df, flood_gdf)

# Display the first few rows of the merged dataframe
print(merged_data.head())

NameError: name 'prices_paid_df' is not defined

In [None]:
prices_copy = prices_paid_data_df.copy()
employment_copy = employment_data_df.copy()
income_copy = income_data_df.copy()
flood_copy = flood_data_gdf.copy()
hpi_copy = hpi_data_df.copy()
crime_copy = crime_data_df.copy()

In [None]:
# Merge employment data based on area_code and year
prices_copy['year'] = prices_copy['transfer_date'].dt.year
# Merge using district instead of town
merged_prices_employment_copy = pd.merge(
    prices_copy,
    employment_copy,
    left_on=['district', 'year'],
    right_on=['area_name', 'year'],
    how='left'
)

# Forward fill any missing values in the merged dataframe
merged_prices_employment_copy.fillna(method='ffill', inplace=True)

# Display the first few rows to verify
print(merged_prices_employment_copy.head())

In [None]:
merged_prices_employment_copy.head()

In [None]:
hpi_copy.head()

In [None]:
# Round transfer_date to the nearest month to match HPI data
merged_prices_employment_copy['transfer_date_month'] = merged_prices_employment_copy['transfer_date'].dt.to_period(
    'M')

# Convert HPI date to the same period format for merging
hpi_copy['date_month'] = pd.to_datetime(hpi_copy['date']).dt.to_period('M')

# Merge the dataframes
merged_prices_employment_hpi_copy = pd.merge(
    merged_prices_employment_copy,
    hpi_copy,
    left_on=['district', 'transfer_date_month'],
    right_on=['region_name', 'date_month'],
    how='left'
)

# Forward fill any missing values in the merged dataframe
merged_prices_employment_hpi_copy.fillna(method='ffill', inplace=True)

# Drop the temporary 'transfer_date_month' and 'date_month' columns if needed
merged_prices_employment_hpi_copy.drop(
    columns=['transfer_date_month', 'date_month'], inplace=True)

# Display the first few rows to verify the merge
merged_prices_employment_hpi_copy.head()

In [None]:
merged_prices_employment_hpi_copy.info()

In [None]:
# Assuming merged_prices_employment_hpi_copy is the DataFrame after merging prices, employment, and HPI data

# Merge the dataframes based on district (area_name) and year
merged_prices_employment_hpi_income_copy = pd.merge(
    merged_prices_employment_hpi_copy,
    income_copy,
    left_on=['district', 'year'],
    right_on=['area_name', 'year'],
    how='left'
)

# Forward fill any missing values in the merged dataframe
merged_prices_employment_hpi_income_copy.fillna(method='ffill', inplace=True)

# Display the first few rows to verify the merge
merged_prices_employment_hpi_income_copy.head()

In [None]:
merged_prices_employment_hpi_income_copy.info()

In [None]:
crime_copy