In [5]:
import pandas as pd
import numpy as np
from elasticsearch import Elasticsearch
import re

import sys
sys.path.append('functions')
import preprocessing_fncs as ppf
import elastic_search_fncs as esf

import json

In [6]:
# Details of the dataset
db_host = 'https://athena.london.gov.uk'
db_user = 'odbc_readonly'
db_pass = 'odbc_readonly'
db_port = '10099'
db_name = 'gla-ldd-external'

# Creates connection to the dataset
es = Elasticsearch(
    [f"{db_host}:{db_port}"],
    basic_auth=(db_user, db_pass),
    verify_certs=True,
    ca_certs='athena_es_full_chain.crt'
)

# Check connection
if es.ping():
    print("Connected to Elasticsearch!")
else:
    print("Could not connect to Elasticsearch.")

Connected to Elasticsearch!


In [7]:
mapping = es.indices.get_mapping(index="applications")
print(mapping["applications"]["mappings"]["properties"])

{'actual_commencement_date': {'type': 'date', 'format': 'dd/MM/yyyy'}, 'actual_completion_date': {'type': 'date', 'format': 'dd/MM/yyyy'}, 'appeal_decision': {'type': 'text', 'fields': {'raw': {'type': 'keyword'}}}, 'appeal_decision_date': {'type': 'date', 'format': 'dd/MM/yyyy'}, 'appeal_start_date': {'type': 'date', 'format': 'dd/MM/yyyy'}, 'appeal_status': {'type': 'text', 'fields': {'raw': {'type': 'keyword'}}}, 'application_details': {'dynamic': 'strict', 'properties': {'3d_model': {'type': 'boolean'}, 'affordable_housing_fast_track': {'type': 'boolean'}, 'affordable_housing_inlieu_payment': {'type': 'integer'}, 'air_quality_assessment': {'type': 'boolean'}, 'borough_cil_liability_notice_issued': {'type': 'boolean'}, 'building_age': {'type': 'text', 'fields': {'raw': {'type': 'keyword'}}}, 'building_details': {'type': 'nested', 'dynamic': 'strict', 'properties': {'building_ref': {'type': 'text'}, 'max_height': {'type': 'float'}, 'no_storeys': {'type': 'integer'}}}, 'building_type'

no column related to lsoa, so we need `wgs84_polygon.coordinates` or `polygon.geometries` to match lsoa

In [8]:
# Obtain the field definition section
properties = mapping["applications"]["mappings"]["properties"]

# Recursively extract the field paths and types
def extract_fields(mapping_dict, prefix=""):
    fields = []
    for key, value in mapping_dict.items():
        full_key = f"{prefix}.{key}" if prefix else key
        if "type" in value:
            fields.append({"field": full_key, "type": value["type"]})
        elif "properties" in value:
            fields.extend(extract_fields(value["properties"], full_key))
    return fields

flat_fields = extract_fields(properties)

# turn into DataFrame
df_fields = pd.DataFrame(flat_fields)

# save to CSV
df_fields.to_csv("elasticsearch_mapping_fields.csv", index=False)

print("save to elasticsearch_mapping_fields.csv")


save to elasticsearch_mapping_fields.csv


In [9]:
# create query
query = {
    "query": {
        "range": {
            "valid_date": {
                "gte": "01/01/2015",
                "lte": "31/12/2019"
            }
        }
    },
    "size": 10 
}

In [10]:
res = es.search(index="applications", body=query)

# total number of project
count_query = {
    "query": {
        "range": {
            "valid_date": {
                "gte": "01/01/2015",
                "lte": "31/12/2019"
            }
        }
    }
}
total = es.count(index="applications", body=count_query)
print("2015–2019 total number：", total["count"])

2015–2019 total number： 192627


In [11]:
query2 = {
    "query": {
        "range": {
            "valid_date": {
                "gte": "01/01/2015",
                "lte": "31/12/2015" # try one year
            }
        }
    },
    "size": 10000,  
    "_source": [
        "description",
        "borough",
        "valid_date",
        "application_details.residential_details.total_no_existing_residential_units",
    ]
}

res = es.search(index="applications", body=query2)
docs = [hit["_source"] for hit in res["hits"]["hits"]]
df = pd.DataFrame(docs)

# Extract the digital segments of the existing residential units
df["existing_units"] = df["application_details"].apply(
    lambda x: x.get("residential_details", {}).get("total_no_existing_residential_units", 0)
    if isinstance(x, dict) else 0
)

# Filter out projects with residential properties
df_resi = df[df["existing_units"] > 0]

print("total number related to existing units:", len(df_resi))


total number related to existing units: 202


In [12]:
# try to extand the grasping volume
from elasticsearch.helpers import scan

def scroll_query(es, index="applications", query=None, scroll_size=1000, scroll_time="2m", max_docs=None):

    docs = []

    for i, hit in enumerate(scan(
        client=es,
        index=index,
        query=query,
        scroll=scroll_time,
        size=scroll_size,
        preserve_order=True
    )):
        docs.append(hit["_source"])
        if max_docs and len(docs) >= max_docs:
            break
        if len(docs) % 5000 == 0:
            print(f"fetch {len(docs)} ...")

    print(f"already fetch {len(docs)} projects.")
    return pd.DataFrame(docs)


In [14]:
query = {
    "query": {
        "range": {
            "valid_date": {
                "gte": "01/01/2015",
                "lte": "31/12/2019"
            }
        }
    },
    "_source": [
        "description",
        "borough",
        "valid_date",
        "application_details.residential_details.total_no_existing_residential_units",
        "wgs84_polygon",
        "application_details.residential_details.dwelling_density",
    ]
}

# fetch
df_all = scroll_query(es, index="applications", query=query, max_docs=max)

# save to csv
df_all.to_csv("planning_applications_2015_2019.csv", index=False)

TypeError: '>=' not supported between instances of 'int' and 'builtin_function_or_method'

In [None]:
# only residential
df_all["existing_units"] = df_all["application_details"].apply(
    lambda x: x.get("residential_details", {}).get("total_no_existing_residential_units", 0)
    if isinstance(x, dict) else 0
)

df_resi = df_all[df_all["existing_units"] > 0]
df_resi.head()
len(df_resi)

4570

In [None]:
# filter out density
df_all["dew_density"] = df_all["application_details"].apply(
    lambda x: x.get("residential_details", {}).get("dwelling_density", 0)
    if isinstance(x, dict) else 0
)

df_den = df_all[df_all["dew_density"] > 0]
len(df_den)

7544

In [None]:
df_both = df_all[(df_all["existing_units"] > 0) | (df_all["dew_density"] > 0)]
len(df_both)

8600

In [None]:
# filter out prosposed
df_all["pros_res"] = df_all["application_details"].apply(
    lambda x: x.get("residential_details", {}).get("total_no_proposed_residential_units", 0)
    if isinstance(x, dict) else 0
)

df_pro = df_all[df_all["pros_res"] > 0]
len(df_pro)
# how can it be 0???

0

In [None]:
# try tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

# 1. 预处理 description 字段
df_both["description_clean"] = df_both["description"].fillna("")

# 2. 初始化 TF-IDF 向量器
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",  # 去掉常见词
    lowercase=True,
    max_features=1000,     # 只保留 top 1000 个重要词
    token_pattern=r"\b[a-zA-Z]{3,}\b"  # 仅匹配英文单词（3个字母以上）
)

# 3. 拟合 TF-IDF
tfidf_matrix = tfidf_vectorizer.fit_transform(df_both["description_clean"])

# 4. 提取词和对应 IDF 值
idf_scores = tfidf_vectorizer.idf_
tfidf_df = pd.DataFrame({
    "word": tfidf_vectorizer.get_feature_names_out(),
    "idf": idf_scores
}).sort_values(by="idf", ascending=False).reset_index(drop=True)

# 5. 展示最有区分度的词（高 IDF → 出现少但有代表性）
tfidf_df.head(30)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_both["description_clean"] = df_both["description"].fillna("")


Unnamed: 0,word,idf
0,tintern,9.366487
1,rugby,9.366487
2,packington,8.961021
3,grafton,8.961021
4,approach,8.961021
5,sherwood,8.961021
6,crossways,8.961021
7,clubhouse,8.961021
8,bath,8.961021
9,caledonian,8.673339


In [None]:
# try LDA
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# 预处理文本
df_both["description_clean"] = df_both["description"].fillna("")

# 向量化（稀疏词频矩阵）
vectorizer = CountVectorizer(
    stop_words='english',
    lowercase=True,
    token_pattern=r'\b[a-zA-Z]{3,}\b',
    max_features=1000  # 限制词汇量
)
doc_term_matrix = vectorizer.fit_transform(df_both["description_clean"])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_both["description_clean"] = df_both["description"].fillna("")


In [None]:
# 设置主题数
n_topics = 6

lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
lda.fit(doc_term_matrix)

In [None]:
def display_topics(model, feature_names, top_words=10):
    for idx, topic in enumerate(model.components_):
        print(f"\n Topic #{idx+1}:")
        top_features = topic.argsort()[::-1][:top_words]
        print(" + ".join([feature_names[i] for i in top_features]))

display_topics(lda, vectorizer.get_feature_names_out())


 Topic #1:
storage + associated + cycle + refuse + storey + flats + parking + self + contained + single

 Topic #2:
units + residential + parking + associated + buildings + landscaping + planning + car + use + site

 Topic #3:
storey + existing + erection + demolition + building + bedroom + dwelling + detached + new + basement

 Topic #4:
use + house + class + change + single + hmo + dwelling + multiple + occupation + generis

 Topic #5:
use + class + residential + change + contained + self + floor + flats + existing + certificate

 Topic #6:
rear + extension + floor + roof + conversion + ground + flats + storey + single + contained
