In [5]:
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch
import re

import sys
sys.path.append('functions')
import preprocessing_fncs as ppf
import elastic_search_fncs as esf

# Connecting

In [9]:
# Details of the dataset
db_host = 'https://athena.london.gov.uk'
db_user = 'odbc_readonly'
db_pass = 'odbc_readonly'
db_port = '10099'
db_name = 'gla-ldd-external'

# Creates connection to the dataset
es = Elasticsearch(
    [f"{db_host}:{db_port}"],
    basic_auth=(db_user, db_pass),
    verify_certs=True,
    ca_certs='athena_es_full_chain.crt'
)

# Check connection
if es.ping():
    print("Connected to Elasticsearch!")
else:
    print("Could not connect to Elasticsearch.")

Connected to Elasticsearch!


# Get the data from 2015 to 2019

In [10]:
all_years_df = [] # save the data into this dataframe

for year in range(2015, 2020):  # 2015–2019
    query = {
        "query": {
            "bool": {
                # conditions that must be met
                "must": [ 
                    {
                        "range": {
                            # valid data between 2015-019
                            "valid_date": {
                                "gte": f"01/01/{year}",
                                "lt": f"01/01/{year + 1}"
                            }
                        }
                    }
                ],
                # The conditions that should be met
                "should": [
                    {
                        "range": {
                            "application_details.residential_details.total_no_existing_residential_units": {
                                "gte": 1
                            }
                        }
                    },
                    {
                        "range": {
                            "application_details.residential_details.total_no_proposed_residential_units": {
                                "gte": 1
                            }
                        }
                    }
                ],
                "minimum_should_match": 1 # At least meet one of the condition
            }
        },
        "_source": [
            "valid_date",
            "borough",
            "application_details.residential_details.total_no_existing_residential_units",
            "application_details.residential_details.total_no_proposed_residential_units",
            "street_name",
            "site_name",
            "polygon", 
            "wgs84_polygon", # geo
            "description" # main target
        ]
    }
    
    # Elasticsearch query
    response = es.search(index="applications", body=query, scroll="2m", size=10000)
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    all_hits = []
    all_hits.extend(hits)

    while len(hits) > 0:
        response = es.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = response['_scroll_id']
        hits = response['hits']['hits']
        all_hits.extend(hits)

    df_raw = pd.json_normalize(all_hits)
    df_cleaned = ppf.format_df(df_raw)
    df_cleaned['year'] = year

    all_years_df.append(df_cleaned)

# combined all the data
df_london_all = pd.concat(all_years_df, ignore_index=True)


  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)


In [12]:
print(df_london_all.shape)
print(df_london_all.head())

(9717, 15)
          site_name valid_date  \
0  Residential Unit 2015-06-23   
1                   2015-05-11   
2      2 Elers Road 2015-05-05   
3        Rose Court 2015-08-21   
4                   2015-09-17   

                                  polygon.geometries        polygon.type  \
0  [{'coordinates': [[[529220.0027867, 186434.103...  GeometryCollection   
1  [{'coordinates': [[[529924.4937875, 186185.904...  GeometryCollection   
2  [{'coordinates': [[[517284.89757, 179818.14684...  GeometryCollection   
3  [{'coordinates': [[[524961.5441693, 171136.692...  GeometryCollection   
4                                                NaN                 NaN   

                           wgs84_polygon.coordinates wgs84_polygon.type  \
0  [[[-0.137201, 51.5619933], [-0.137201699999999...            Polygon   
1  [[[-0.1271351, 51.5596012], [-0.1271822, 51.55...            Polygon   
2  [[[-0.3115011, 51.5051362], [-0.311508, 51.505...            Polygon   
3  [[[-0.2040145, 51.425470

In [13]:
all_years_df2 = [] # save the data into this dataframe

for year in range(2015, 2020):  # 2015–2019
    query = {
        "query": {
            "bool": {
                # conditions that must be met
                "must": [ 
                    {
                        "range": {
                            # desition data between 2015-019
                            "decision_date": {
                                "gte": f"01/01/{year}",
                                "lt": f"01/01/{year + 1}"
                            }
                        }
                    }
                ],
                # The conditions that should be met
                "should": [
                    {
                        "range": {
                            "application_details.residential_details.total_no_existing_residential_units": {
                                "gte": 1
                            }
                        }
                    },
                    {
                        "range": {
                            "application_details.residential_details.total_no_proposed_residential_units": {
                                "gte": 1
                            }
                        }
                    }
                ],
                "minimum_should_match": 1 # At least meet one of the condition
            }
        },
        "_source": [
            "decision_date",
            "borough",
            "application_details.residential_details.total_no_existing_residential_units",
            "application_details.residential_details.total_no_proposed_residential_units",
            "street_name",
            "site_name",
            "polygon", 
            "wgs84_polygon", # geo
            "description" # main target
        ]
    }
    
    # Elasticsearch query
    response = es.search(index="applications", body=query, scroll="2m", size=10000)
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    all_hits = []
    all_hits.extend(hits)

    while len(hits) > 0:
        response = es.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = response['_scroll_id']
        hits = response['hits']['hits']
        all_hits.extend(hits)

    df_raw = pd.json_normalize(all_hits)
    df_cleaned = ppf.format_df(df_raw)
    df_cleaned['year'] = year

    all_years_df2.append(df_cleaned)

# combined all the data
df_london_all2 = pd.concat(all_years_df2, ignore_index=True)

  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)


In [14]:
print(df_london_all2.shape)

(31718, 15)


There is a large gap between the decision date and the valid date.

1. change into decision date?

2. stick to valid date but change the range to a longer time period?

# All applications between 2015 - 2019

In [17]:
all_years_df3 = [] # save the data into this dataframe

for year in range(2015, 2020):  # 2015–2019
    query = {
        "query": {
            "bool": {
                # conditions that must be met
                "must": [ 
                    {
                        "range": {
                            "valid_date": {
                                "gte": f"01/01/{year}",
                                "lt": f"01/01/{year + 1}"
                            }
                        }
                    }
                ],
            }
        },
        "_source": [
            "valid_date",
            "decision_date",
            "borough",
            "application_details.residential_details.total_no_existing_residential_units",
            "application_details.residential_details.total_no_proposed_residential_units",
            "street_name",
            "site_name",
            "polygon", 
            "wgs84_polygon", # geo
            "description" # main target
        ]
    }
    
    # Elasticsearch query
    response = es.search(index="applications", body=query, scroll="2m", size=10000)
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    all_hits = []
    all_hits.extend(hits)

    while len(hits) > 0:
        response = es.scroll(scroll_id=scroll_id, scroll="2m")
        scroll_id = response['_scroll_id']
        hits = response['hits']['hits']
        all_hits.extend(hits)

    df_raw = pd.json_normalize(all_hits)
    df_cleaned = ppf.format_df(df_raw)
    df_cleaned['year'] = year

    all_years_df3.append(df_cleaned)

# combined all the data
df_london_all3 = pd.concat(all_years_df3, ignore_index=True)

  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)
  response = es.search(index="applications", body=query, scroll="2m", size=10000)


In [18]:
print(df_london_all3.shape)

(192635, 16)
