# Text based
- add label to text
- set anchor text
- cosine similarity
- cross validation/AUC

In [20]:
import geopandas as gpd
import numpy as np
import pandas as pd
from shapely.geometry import Polygon
from shapely.geometry import Point
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

## Gentrified Texts

In [None]:
# merge the gentri label to the vectorization text
# use 'polygon' or 'wgs84_polygon', depend on what coordinate the label is
# load gentri label
lsoa_label = gpd.read_file("data/gentri_data/london_gentri_labeled.shp")
lsoa_label.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [5]:
# use the wgs84_polygon to join the label with text
# read the text
text = pd.read_parquet("sbert_encoded_data.parquet")
print(text.columns)

Index(['site_name', 'decision_date', 'valid_date', 'polygon.geometries',
       'polygon.type', 'wgs84_polygon.coordinates', 'wgs84_polygon.type',
       'description', 'borough', 'street_name',
       ...
       'sbert_374', 'sbert_375', 'sbert_376', 'sbert_377', 'sbert_378',
       'sbert_379', 'sbert_380', 'sbert_381', 'sbert_382', 'sbert_383'],
      dtype='object', length=401)


In [9]:
# turn polygon coordinate into shapely 
# make sure to drop the null geodata
text_valid = text[text['wgs84_polygon.coordinates'].notnull()].copy()
text_valid["geometry"] = text_valid["wgs84_polygon.coordinates"].apply(lambda coords: Polygon(coords[0]))

# creat GeoDataFrame， set WGS84 coordinate
gdf_text = gpd.GeoDataFrame(text_valid, geometry="geometry", crs="EPSG:4326")

In [10]:
joined = gpd.sjoin(gdf_text, lsoa_label[["geometry", "gentrified"]], how="left", predicate="within")

In [15]:
print("the percentage of texts being labeled:", joined['gentrified'].notna().mean())

the percentage of texts being labeled: 0.8843049615743271


Noramlly all the text should be labeled, so this might have two causes.
- set the wrong coordinate
- text's polygon cross more than one lsoa  

In [None]:
# check the geometry
gdf_unmatched = joined[joined['gentrified'].isna()]
gdf_unmatched = gdf_unmatched.to_crs("EPSG:4326")
print(gdf_unmatched.geometry.centroid.head())

2     POINT (-0.20623 51.57753)
9     POINT (-0.10164 51.51549)
15    POINT (-0.09637 51.53113)
19    POINT (-0.10233 51.53794)
60    POINT (-0.12107 51.56635)
dtype: geometry



  print(gdf_unmatched.geometry.centroid.head())


In [None]:
# check the texts
print("total texts:", len(gdf_text))
print("text within lsoa:", joined['gentrified'].notna().sum())

total texts: 155495
text within lsoa: 137505


Geometry seems fine, and not all texts are within lsoas, meaning some text polygon cross over more than one lsoas.

Two solutions:
1. drop the texts that cross several lsoas
2. change within to intersects, but this would lead to one text have more than one label

For now i choose to drop 

In [17]:
joined_labeled = joined.dropna(subset=['gentrified']).copy()

In [19]:
joined_labeled['gentrified'].value_counts(normalize=True)

gentrified
False    0.786488
True     0.213512
Name: proportion, dtype: float64

The percentage of gentrified texts (0.21) are similar with the gentrified lsoas (0.24).
Here I tried a baseline model (without anchor text).

In [21]:
joined_labeled.columns

Index(['site_name', 'decision_date', 'valid_date', 'polygon.geometries',
       'polygon.type', 'wgs84_polygon.coordinates', 'wgs84_polygon.type',
       'description', 'borough', 'street_name',
       ...
       'sbert_377', 'sbert_378', 'sbert_379', 'sbert_380', 'sbert_381',
       'sbert_382', 'sbert_383', 'geometry', 'index_right', 'gentrified'],
      dtype='object', length=404)

In [28]:
# convert the SBERT list vectors into numpy arrays 
# select all SBERT columns
sbert_cols = [col for col in joined_labeled.columns if col.startswith("sbert_")]

# combine these columns into a matrix X (with N rows and 384 columns)
X = joined_labeled[sbert_cols].values  # shape = (N, 384)
y = joined_labeled["gentrified"].astype(int).values

# set train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# initialize the model and handle the imbalance using class_weight
model = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# prediction and evaluation
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]

print("=== Classification Report ===")
print(classification_report(y_test, y_pred))
print("AUC Score:", roc_auc_score(y_test, y_prob))

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.83      0.63      0.71     21629
           1       0.27      0.52      0.36      5872

    accuracy                           0.60     27501
   macro avg       0.55      0.57      0.54     27501
weighted avg       0.71      0.60      0.64     27501

AUC Score: 0.6088463358370781


The model has some predicting capabilities (with an AUC value slightly over 0.6), but its ability to predict gentrified = 1 is still insufficient.

## Anchor texts