In [None]:
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Polygon
from shapely.geometry import Point
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
# Load the gentrification labels that are later merged onto the vectorised text.
# Whether the text's 'polygon' or 'wgs84_polygon' coordinates should be used
# depends on the coordinate system of this label layer, so its CRS is shown
# as the cell output.
gentri_label_path = "data/gentri_data/london_gentri_labeled_25_only.shp"
lsoa_label = gpd.read_file(gentri_label_path)
lsoa_label.crs

In [None]:
# Load the structural indicators table and attach it to the label layer.
stru_data = pd.read_csv("data/all_stru_data_added.csv")
# Rename the key column so it matches the 'LSOA code' key used for the merge.
stru_data = stru_data.rename(columns={'LSOA Code': 'LSOA code'})
# `lsoa_label` is created in an earlier cell and overwritten here. Re-running
# this cell on its own would otherwise merge the same columns a second time;
# pandas would suffix them with _x/_y and the column selections in the spatial
# joins below would fail with KeyError. Only bring in columns that the label
# layer does not already carry, which makes this step safe to re-run.
stru_new_cols = ["LSOA code"] + [
    col for col in stru_data.columns
    if col != "LSOA code" and col not in lsoa_label.columns
]
if len(stru_new_cols) > 1:
    lsoa_label = lsoa_label.merge(stru_data[stru_new_cols], on="LSOA code", how="left")
# Read the encoded text records.
text = pd.read_parquet("sbert_encoded_data.parquet")
# Rows without polygon coordinates cannot be placed on the map; drop them.
has_coords = text['wgs84_polygon.coordinates'].notnull()
text_valid = text.loc[has_coords].copy()
# Turn each coordinate entry into a shapely Polygon built from its first element.
# NOTE(review): any further elements (e.g. interior rings) are ignored - confirm
# this is intended.
text_valid["geometry"] = text_valid["wgs84_polygon.coordinates"].map(
    lambda rings: Polygon(rings[0])
)
# Create the GeoDataFrame and declare the coordinates as WGS84 (EPSG:4326).
gdf_text = gpd.GeoDataFrame(text_valid, geometry="geometry", crs="EPSG:4326")
# Reproject to EPSG:27700 for the spatial joins that follow.
gdf_27700 = gdf_text.to_crs("EPSG:27700")
# Spatial join: attach LSOA attributes to every text polygon that lies entirely
# within an LSOA polygon (predicate="within"); unmatched rows are kept
# (how="left") with missing LSOA attributes.
lsoa_attrs = lsoa_label[["LSOA code", "geometry", "gentrified", "pop_growth_rate",
                         "avg_den", "senior_per", "minority_per",
                         'all_dwelling', 'shared_per', 'converted_per',
                         'level_4_pop', 'level_4_per',
                         'prof_occ_count', 'prof_occ_per',
                         'sale_avg',
                         'single_per',
                         'less_10km_per']]
# sjoin only warns when the two layers use different CRSs and then compares raw
# coordinates, which yields meaningless matches. Align the label layer with the
# text layer explicitly instead of relying on a manual CRS check.
if lsoa_attrs.crs is not None and lsoa_attrs.crs != gdf_27700.crs:
    lsoa_attrs = lsoa_attrs.to_crs(gdf_27700.crs)
joined = gpd.sjoin(gdf_27700, lsoa_attrs, how="left", predicate="within")
# Compute the centroid of each text polygon (in the projected EPSG:27700 frame).
gdf_27700["centroid"] = gdf_27700.geometry.centroid
# NOTE(review): this join carries fewer LSOA attributes than the polygon-based
# join above - confirm that is intended.
lsoa_centroid_attrs = lsoa_label[["LSOA code", "geometry",
                                  "gentrified",
                                  "avg_den", "senior_per", "minority_per"]]
# sjoin only warns on a CRS mismatch and then compares raw coordinates; align
# the label layer with the text layer explicitly before joining.
if lsoa_centroid_attrs.crs is not None and lsoa_centroid_attrs.crs != gdf_27700.crs:
    lsoa_centroid_attrs = lsoa_centroid_attrs.to_crs(gdf_27700.crs)
# Spatial join on the centroids: each record is assigned to the LSOA polygon
# that contains its centroid; unmatched records are kept with missing values.
joined_centroid = gpd.sjoin(gdf_27700.set_geometry("centroid"), lsoa_centroid_attrs,
                            how="left", predicate="within")
# Keep only the records that received a gentrification label from the join.
joined_cleaned = joined_centroid[joined_centroid["gentrified"].notna()].copy()
# Attach the shop / metro / bus increase columns from the POI table to the
# labelled records, matching on the LSOA code.
# NOTE(review): assumes the POI table has one row per 'LSOA code'; duplicate
# codes would multiply rows in the result - verify.
df_poi = pd.read_csv("data/poi_15_19.csv")
poi_columns = ['LSOA code', 'shop_increase', 'metro_increase', 'bus_increase']
final_df = joined_cleaned.merge(df_poi[poi_columns], how="left", on="LSOA code")