In [11]:
import os
import re
import sys

import numpy as np
import pandas as pd
from elasticsearch import Elasticsearch

# Add the 'functions' directory to the import path for the two local modules below.
sys.path.append('functions')
import preprocessing_fncs as ppf
import elastic_search_fncs as esf

In [12]:
# Connection details for the Elasticsearch instance.
# Each value can be overridden through an environment variable so credentials
# need not live in the notebook; the fallbacks are the values this notebook
# previously hard-coded, so an unconfigured environment behaves as before.
db_host = os.environ.get('ATHENA_ES_HOST', 'https://athena.london.gov.uk')
db_user = os.environ.get('ATHENA_ES_USER', 'odbc_readonly')
db_pass = os.environ.get('ATHENA_ES_PASS', 'odbc_readonly')
db_port = os.environ.get('ATHENA_ES_PORT', '10099')
db_name = 'gla-ldd-external'

# Creates connection to the dataset.
# TLS verification is on; the CA bundle path is relative to the working directory.
es = Elasticsearch(
    [f"{db_host}:{db_port}"],
    basic_auth=(db_user, db_pass),
    verify_certs=True,
    ca_certs='athena_es_full_chain.crt'
)

# Check connection: ping() returns a boolean, so a failure is only reported
# here and does not stop the notebook.
if es.ping():
    print("Connected to Elasticsearch!")
else:
    print("Could not connect to Elasticsearch.")

Connected to Elasticsearch!


In [None]:
# Download every application whose valid_date falls in 2020-2022, one calendar
# year per query, clean each year with ppf.format_df, and combine the results.
years_20_df = []  # list of cleaned per-year DataFrames, concatenated after the loop

for year in range(2020, 2023):  # 2020–2022 (the upper bound is exclusive)
    query = {
        "query": {
            "bool": {
                # conditions that must be met
                "must": [
                    {
                        "range": {
                            "valid_date": {
                                # Half-open window: from 1 Jan of `year` up to,
                                # but not including, 1 Jan of the next year.
                                # NOTE(review): these strings must match the date
                                # format the index expects for valid_date — confirm.
                                "gte": f"01/01/{year}",
                                "lt": f"01/01/{year + 1}"
                            }
                        }
                    }
                ],
            }
        },
        # Only these fields are returned for each hit.
        "_source": [
            "valid_date",
            "decision_date",
            "borough",
            "application_details.residential_details.total_no_existing_residential_units",
            "application_details.residential_details.total_no_proposed_residential_units",
            "street_name",
            "site_name",
            "polygon",
            "wgs84_polygon", # geo
            "description" # main target
        ]
    }

    # Elasticsearch query: open a scroll cursor and fetch the first page.
    response = es.search(index="applications", body=query, scroll="2m", size=10000)
    scroll_id = response['_scroll_id']
    hits = response['hits']['hits']

    all_hits = []
    try:
        # Keep paging until a page comes back empty.
        while hits:
            all_hits.extend(hits)
            response = es.scroll(scroll_id=scroll_id, scroll="2m")
            scroll_id = response['_scroll_id']
            hits = response['hits']['hits']
    finally:
        # Release the server-side scroll context even if paging is interrupted
        # or fails, instead of leaving it open until the 2m keep-alive expires.
        es.clear_scroll(scroll_id=scroll_id)

    # Flatten the raw hits into columns, clean them, and tag the rows with the year.
    df_raw = pd.json_normalize(all_hits)
    df_cleaned = ppf.format_df(df_raw)
    df_cleaned['year'] = year

    years_20_df.append(df_cleaned)

# combine all the per-year frames into one table with a fresh index
df_20_22_all = pd.concat(years_20_df, ignore_index=True)

  response = es.search(index="applications", body=query)


KeyboardInterrupt: 

In [None]:
# Show the (rows, columns) size of the combined table, then write it to CSV
# in the working directory without the index column.
output_csv = "cleaned_projects_20_22.csv"
print(df_20_22_all.shape)
df_20_22_all.to_csv(output_csv, index=False)

In [None]:
# import torch
# from tqdm import tqdm
# from sentence_transformers import SentenceTransformer
# from nltk.tokenize import sent_tokenize
# import joblib
# import os
# import nltk
# nltk.download('punkt')      # Normal Sentence Segmentation Model
# nltk.download('punkt_tab')  

In [None]:
# # reconfirm the columns
# df_20_22_all.columns 

In [None]:
# # set a new copy
# df = df_20_22_all.copy()
# # keep only the non-empty text
# df = df[df['description'].notna()]
# df['description'] = df['description'].str.replace(r'\s+', ' ', regex=True).str.strip()
# df['sentences'] = df['description'].apply(sent_tokenize)
# all_sentences = df['sentences'].explode().tolist()