In [1]:
import os
import re
import sys

import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch

sys.path.append('functions')
import preprocessing_fncs as ppf
import elastic_search_fncs as esf

# Connecting

In [2]:
# Connection details for the dataset.
# Each value can be overridden through an environment variable so that
# credentials do not have to live in the notebook; the defaults are the values
# that were previously hard-coded here, so behaviour is unchanged when the
# variables are not set.
db_host = os.environ.get('LDD_DB_HOST', 'https://athena.london.gov.uk')
db_user = os.environ.get('LDD_DB_USER', 'odbc_readonly')
db_pass = os.environ.get('LDD_DB_PASS', 'odbc_readonly')
db_port = os.environ.get('LDD_DB_PORT', '10099')
db_name = os.environ.get('LDD_DB_NAME', 'gla-ldd-external')

# Create the client: basic authentication over TLS, with the server
# certificate verified against the bundled certificate chain file.
es = Elasticsearch(
    [f"{db_host}:{db_port}"],
    basic_auth=(db_user, db_pass),
    verify_certs=True,
    ca_certs='athena_es_full_chain.crt'
)

# Check the connection and report the result.
if es.ping():
    print("Connected to Elasticsearch!")
else:
    print("Could not connect to Elasticsearch.")

Connected to Elasticsearch!


# Get the data from 2015 to 2019

## Residential Data (filtered by valid date)

In [4]:
# One cleaned DataFrame per year is collected in this list and concatenated
# into a single frame after the loop.
all_years_df = []

for year in range(2015, 2020):  # 2015–2019
    query = {
        "query": {
            "bool": {
                # Conditions that must be met
                "must": [
                    {
                        "range": {
                            # valid_date falls inside the current year
                            "valid_date": {
                                "gte": f"01/01/{year}",
                                "lt": f"01/01/{year + 1}"
                            }
                        }
                    }
                ],
                # Conditions of which at least one should be met
                "should": [
                    {
                        "range": {
                            "application_details.residential_details.total_no_existing_residential_units": {
                                "gte": 1
                            }
                        }
                    },
                    {
                        "range": {
                            "application_details.residential_details.total_no_proposed_residential_units": {
                                "gte": 1
                            }
                        }
                    }
                ],
                "minimum_should_match": 1  # at least one "should" clause must match
            }
        },
        # Only these fields are returned for each hit
        "_source": [
            "valid_date",
            "borough",
            "application_details.residential_details.total_no_existing_residential_units",
            "application_details.residential_details.total_no_proposed_residential_units",
            "street_name",
            "site_name",
            "polygon",
            "wgs84_polygon",  # geo
            "description"  # main target
        ]
    }

    # Open a scroll cursor and read the first page (up to 10000 hits).
    response = es.search(index="applications", body=query, scroll="2m", size=10000)
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    all_hits = []
    try:
        # Keep paging until a page comes back empty.
        while hits:
            all_hits.extend(hits)
            response = es.scroll(scroll_id=scroll_id, scroll="2m")
            scroll_id = response['_scroll_id']
            hits = response['hits']['hits']
    finally:
        # Release the server-side scroll context explicitly instead of leaving
        # it open until the "2m" keep-alive expires.
        es.clear_scroll(scroll_id=scroll_id)

    # Flatten the nested hits into columns, clean them with the project helper
    # and tag every row with the year it was queried for.
    df_raw = pd.json_normalize(all_hits)
    df_cleaned = ppf.format_df(df_raw)
    df_cleaned['year'] = year

    all_years_df.append(df_cleaned)

# Combine the per-year frames into one
df_london_all = pd.concat(all_years_df, ignore_index=True)


  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)


In [5]:
# Print the combined frame's (rows, columns) followed by a preview of its
# first rows.
for summary in (df_london_all.shape, df_london_all.head()):
    print(summary)

(9727, 15)
      site_name valid_date                                 polygon.geometries  \
0          None 2015-07-20  [{'coordinates': [[[525219.5, 191405.95], [525...   
1     101 - 109 2015-11-11  [{'coordinates': [[[516873.0468631, 179643.639...   
2               2015-12-21  [{'coordinates': [[[531625.2932364, 185303.246...   
3  Oculus House 2015-09-21  [{'coordinates': [[[544276.1, 184398.4], [5442...   
4             3 2015-02-23  [{'coordinates': [[[517123.7478508, 181282.847...   

         polygon.type                          wgs84_polygon.coordinates  \
0  GeometryCollection  [[[-0.193111, 51.6075766], [-0.1929922, 51.607...   
1  GeometryCollection  [[[-0.3174902, 51.503653], [-0.3172725, 51.503...   
2  GeometryCollection  [[[-0.10294429999999999, 51.5512749], [-0.1029...   
3  GeometryCollection  [[[0.0790239, 51.540053], [0.0790787, 51.54002...   
4  GeometryCollection  [[[-0.3133355, 51.5183339], [-0.31333639999999...   

  wgs84_polygon.type  total_no_proposed_resid

## Residential Data (filtered by decision date)

In [13]:
# One cleaned DataFrame per year is collected in this list and concatenated
# into a single frame after the loop.
all_years_df2 = []

for year in range(2015, 2020):  # 2015–2019
    query = {
        "query": {
            "bool": {
                # Conditions that must be met
                "must": [
                    {
                        "range": {
                            # decision_date falls inside the current year
                            "decision_date": {
                                "gte": f"01/01/{year}",
                                "lt": f"01/01/{year + 1}"
                            }
                        }
                    }
                ],
                # Conditions of which at least one should be met
                "should": [
                    {
                        "range": {
                            "application_details.residential_details.total_no_existing_residential_units": {
                                "gte": 1
                            }
                        }
                    },
                    {
                        "range": {
                            "application_details.residential_details.total_no_proposed_residential_units": {
                                "gte": 1
                            }
                        }
                    }
                ],
                "minimum_should_match": 1  # at least one "should" clause must match
            }
        },
        # Only these fields are returned for each hit
        "_source": [
            "decision_date",
            "borough",
            "application_details.residential_details.total_no_existing_residential_units",
            "application_details.residential_details.total_no_proposed_residential_units",
            "street_name",
            "site_name",
            "polygon",
            "wgs84_polygon",  # geo
            "description"  # main target
        ]
    }

    # Open a scroll cursor and read the first page (up to 10000 hits).
    response = es.search(index="applications", body=query, scroll="2m", size=10000)
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    all_hits = []
    try:
        # Keep paging until a page comes back empty.
        while hits:
            all_hits.extend(hits)
            response = es.scroll(scroll_id=scroll_id, scroll="2m")
            scroll_id = response['_scroll_id']
            hits = response['hits']['hits']
    finally:
        # Release the server-side scroll context explicitly instead of leaving
        # it open until the "2m" keep-alive expires.
        es.clear_scroll(scroll_id=scroll_id)

    # Flatten the nested hits into columns, clean them with the project helper
    # and tag every row with the year it was queried for.
    df_raw = pd.json_normalize(all_hits)
    df_cleaned = ppf.format_df(df_raw)
    df_cleaned['year'] = year

    all_years_df2.append(df_cleaned)

# Combine the per-year frames into one
df_london_all2 = pd.concat(all_years_df2, ignore_index=True)

  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)


In [14]:
# Print (rows, columns) of the frame built from the decision_date query.
shape_by_decision_date = df_london_all2.shape
print(shape_by_decision_date)

(31718, 15)


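There is a large gap between the number of records returned when filtering by decision date (31,718 rows) and when filtering by valid date (9,727 rows).

1. Switch the filter to the decision date?

2. Stick with the valid date, but extend the range to a longer time period?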

## All applications between 2015 - 2019

In [7]:
# One cleaned DataFrame per year is collected in this list and concatenated
# into a single frame after the loop.
all_years_df3 = []

for year in range(2015, 2020):  # 2015–2019
    query = {
        "query": {
            "bool": {
                # Conditions that must be met
                "must": [
                    {
                        "range": {
                            # valid_date falls inside the current year
                            "valid_date": {
                                "gte": f"01/01/{year}",
                                "lt": f"01/01/{year + 1}"
                            }
                        }
                    }
                ],
            }
        },
        # Only these fields are returned for each hit
        "_source": [
            "valid_date",
            "decision_date",
            "borough",
            "application_details.residential_details.total_no_existing_residential_units",
            "application_details.residential_details.total_no_proposed_residential_units",
            "street_name",
            "site_name",
            "polygon",
            "wgs84_polygon",  # geo
            "description"  # main target
        ]
    }

    # Open a scroll cursor and read the first page (up to 10000 hits).
    response = es.search(index="applications", body=query, scroll="2m", size=10000)
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    all_hits = []
    try:
        # Keep paging until a page comes back empty.
        while hits:
            all_hits.extend(hits)
            response = es.scroll(scroll_id=scroll_id, scroll="2m")
            scroll_id = response['_scroll_id']
            hits = response['hits']['hits']
    finally:
        # Release the server-side scroll context explicitly instead of leaving
        # it open until the "2m" keep-alive expires.
        es.clear_scroll(scroll_id=scroll_id)

    # Flatten the nested hits into columns, clean them with the project helper
    # and tag every row with the year it was queried for.
    df_raw = pd.json_normalize(all_hits)
    df_cleaned = ppf.format_df(df_raw)
    df_cleaned['year'] = year

    all_years_df3.append(df_cleaned)

# Combine the per-year frames into one
df_london_all3 = pd.concat(all_years_df3, ignore_index=True)

  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)


In [8]:
# Print (rows, columns) of the frame holding every application in the range.
shape_all_applications = df_london_all3.shape
print(shape_all_applications)

(192685, 16)


# Data Cleaning

- Select the text (the `description` column; drop rows where it is missing)
- Clean the text (remove excessive spaces and special characters)
- Split the descriptions into sentences
- Vectorize the sentences using SBERT

In [None]:
import re
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
import nltk

nltk.download('punkt')      # standard sentence-tokenizer model
nltk.download('punkt_tab')  # (not strictly required, but downloading it works around a specific bug)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [28]:
from sentence_transformers import SentenceTransformer

In [20]:
# Display the column index of the combined frame to reconfirm which columns
# are available before cleaning.
df_london_all3.columns 

Index(['site_name', 'decision_date', 'valid_date', 'polygon.geometries',
       'polygon.type', 'wgs84_polygon.coordinates', 'wgs84_polygon.type',
       'description', 'borough', 'street_name',
       'total_no_proposed_residential_units',
       'total_no_existing_residential_units', 'polygon', 'wgs84_polygon',
       'polygon.coordinates', 'year'],
      dtype='object')

In [21]:
# Build the working frame: keep only rows that have a description, and take a
# copy so that later edits to `df` never touch df_london_all3.
has_description = df_london_all3['description'].notna()
df = df_london_all3[has_description].copy()

In [22]:
# Collapse every run of whitespace into a single space, then trim leading and
# trailing whitespace from each description.
whitespace_run = r'\s+'
collapsed = df['description'].str.replace(whitespace_run, ' ', regex=True)
df['description'] = collapsed.str.strip()

In [26]:
# Split each description into a list of sentences and store the lists in a
# new `sentences` column.
sentence_lists = df['description'].apply(sent_tokenize)
df['sentences'] = sentence_lists

In [None]:
# Load the sentence-embedding model by name.
model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name)
