General comments about changes to support xgboost model files:

-Environment changes needed: pandas -> 1.5.2, xgboost -> 1.5.1

-Load and predict through xgboost library

-Check ok to delete country_iso & row_ids from features for prediction

-Confirm same vectorizer for both model types

-Need to handle output differently

In [41]:
from instant_match.loader.assets_loader import extract_model_to_gcs, download_gcs_folder_to_local

In [42]:
dataset_name="achilles"
model_name="model_TW_xgboost_v5_geo"
gcs_bucket="dh-gsd-achilles-dev"
gcs_path="xgboost_model_test"

extract_model_to_gcs(dataset_name=dataset_name, model_name=model_name, gcs_bucket=gcs_bucket, gcs_path=gcs_path)

Model model_TW_xgboost_v5_geo extracted to gs://dh-gsd-achilles-dev/xgboost_model_test successfully


In [43]:
gcs_bucket="dh-gsd-achilles-dev"
gcs_folder_path="xgboost_model_test/model_TW_xgboost_v5_geo"
local_folder_path="."

download_gcs_folder_to_local(gcs_bucket=gcs_bucket, gcs_folder_path=gcs_folder_path, local_folder_path=local_folder_path)

xgboost_model_test/model_TW_xgboost_v5_geo/model.json
Downloaded xgboost_model_test/model_TW_xgboost_v5_geo/model.json to .//model.json


In [45]:
import xgboost as xgb

model = xgb.Booster()
model.load_model('model.json')

In [46]:
model

<xgboost.core.Booster at 0x30bc18070>

In [47]:
import pandas as pd
import pandas_gbq
from google.oauth2 import service_account

credentials = service_account.Credentials.from_service_account_file(
    '../../../secret.json',
    scopes=["https://www.googleapis.com/auth/cloud-platform"],
)

In [48]:
query = f"""
select * 
from `dh-global-sales-data-dev.leadgen_sf_match_vertex_raw.candidates_FP_TW`
limit 5000
"""

df_candidates = pandas_gbq.read_gbq(query, project_id=credentials.project_id, credentials=credentials)

Downloading: 100%|[32m█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████[0m|[0m


In [49]:
df_candidates.columns

Index(['country_iso', 'left_row_id', 'left_name', 'left_name_local',
       'left_name_stop', 'left_name_stop_phonetic',
       'left_name_local_transliterated', 'left_street', 'left_street_stop',
       'left_street_stop_phonetic', 'left_phone_number', 'left_lat',
       'left_lng', 'right_row_id', 'right_name', 'right_name_local',
       'right_name_legal', 'right_name_stop', 'right_name_stop_phonetic',
       'right_name_local_transliterated', 'right_street', 'right_street_stop',
       'right_street_stop_phonetic', 'right_phone_number', 'right_lat',
       'right_lng', 'haversine', 'tokenset_name_stop', 'tokenset_street_stop',
       'tokenset_name_local', 'tokenset_name_local_transliterated',
       'tokenset_name_legal'],
      dtype='object')

In [50]:
from instant_match import INSTANT_MATCH_ROOT

model_id = "TW"
model_version = "5"

model_catalogue = json.loads(
    (
        INSTANT_MATCH_ROOT
        / "instant_match/instant_match/assets/model_catalogue.json"
    ).read_text()
)

key = f"{model_id}_{model_version}"

model_config = model_catalogue[key]

In [51]:
model_config

{'model_type': 'geo',
 'model_name': 'TW',
 'model_version': '5',
 'features': ['haversine',
  'jaro_winkler_name_local',
  'jaro_winkler_name_local_transliterated',
  'jw_name_local_nonascii_only',
  'levenshtein_street_stop_phonetic',
  'same_phone',
  'tokenset_name_local_ascii_only',
  'tokenset_name_local_nonascii_only',
  'tokenset_name_local_transliterated',
  'tokenset_name_stop',
  'tokenset_street_number',
  'wratio_name',
  'wratio_street_number'],
 'threshold': 0.6,
 'date_modified': 1692921600000,
 'model_source': 'XGBoost'}

In [52]:
_COLS_TO_ADD = [
    "country_iso",
    "left_row_id",
    "right_row_id",
]

features = _COLS_TO_ADD + model_config["features"] 

In [53]:
from instant_match.loader.startup import get_vectorizer_path

local_vectorizer_path = get_vectorizer_path(model_id, model_version)

In [54]:
from match.achilles.model import Achilles

feature_pipeline = Achilles(
        model_id=model_id,
        model_version=model_version,
        local_vectoriser_path=local_vectorizer_path,
        feature_list=features,
    )

In [55]:
from instant_match.components import (
    build_features
)

In [61]:
features_df = build_features(df_candidates, feature_pipeline)

2024-11-25 18:01:22,157 - match.achilles - DEBUG - <class 'match.achilles.feature.RowDistanceMaker'>
2024-11-25 18:01:22,173 - match.achilles - DEBUG - <class 'match.achilles.feature.RowDistanceMaker'>
2024-11-25 18:01:22,182 - match.achilles - DEBUG - <class 'match.achilles.feature.RowDistanceMaker'>
2024-11-25 18:01:22,214 - match.achilles - DEBUG - <class 'match.achilles.feature.RowDistanceMaker'>
2024-11-25 18:01:22,223 - match.achilles - DEBUG - <class 'match.achilles.feature.RowDistanceMaker'>
2024-11-25 18:01:22,232 - match.achilles - DEBUG - <class 'match.achilles.feature.RowDistanceMaker'>
2024-11-25 18:01:22,256 - match.achilles - DEBUG - <class 'match.achilles.feature.RowDistanceMaker'>
2024-11-25 18:01:22,267 - match.achilles - DEBUG - <class 'match.achilles.feature.RowDistanceMaker'>
2024-11-25 18:01:22,274 - match.achilles - DEBUG - <class 'match.achilles.feature.HaversineDistance'>
2024-11-25 18:01:22,277 - match.achilles - DEBUG - <class 'match.achilles.feature.RowDista

In [64]:
features_df = features_df.drop(columns=["country_iso", "left_row_id", "right_row_id"])

In [65]:
features_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 13 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   haversine                               4088 non-null   float64
 1   jaro_winkler_name_local                 5000 non-null   float64
 2   jaro_winkler_name_local_transliterated  5000 non-null   float64
 3   jw_name_local_nonascii_only             5000 non-null   float64
 4   levenshtein_street_stop_phonetic        4997 non-null   float64
 5   same_phone                              5000 non-null   int64  
 6   tokenset_name_local_ascii_only          5000 non-null   float64
 7   tokenset_name_local_nonascii_only       5000 non-null   float64
 8   tokenset_name_local_transliterated      5000 non-null   float64
 9   tokenset_name_stop                      5000 non-null   float64
 10  tokenset_street_number                  1810 non-null   floa

In [66]:
dtest = xgb.DMatrix(features_df)
predictions = model.predict(dtest)

In [60]:
predictions

array([0.00617153, 0.02306852, 0.03042689, ..., 0.00750858, 0.00250992,
       0.00282061], dtype=float32)

In [68]:
# Now check if Google's predictor module can load xgboost model file

from instant_match.bq_predictor import Predictor

test = Predictor.from_path(".")

XGBoostError: [18:13:07] /private/var/folders/pt/q9f5cng91njf3hmhhhn0_7mr0000gq/T/pip-install-vjh31kpp/xgboost_f7497d8f1908443789cd208fc7a1b4d2/build/temp.macosx-11.1-arm64-cpython-39/xgboost/dmlc-core/src/io/local_filesys.cc:209: Check failed: allow_null:  LocalFileSystem::Open "./model.bst": No such file or directory
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000303ee97b4 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x0000000304089658 dmlc::io::LocalFileSystem::Open(dmlc::io::URI const&, char const*, bool) + 788
  [bt] (2) 3   libxgboost.dylib                    0x00000003040725d8 dmlc::Stream::Create(char const*, char const*, bool) + 68
  [bt] (3) 4   libxgboost.dylib                    0x0000000303ee3dec XGBoosterLoadModel + 364
  [bt] (4) 5   libffi.8.dylib                      0x00000001032ec04c ffi_call_SYSV + 76
  [bt] (5) 6   libffi.8.dylib                      0x00000001032e9834 ffi_call_int + 1404
  [bt] (6) 7   _ctypes.cpython-39-darwin.so        0x000000010331c10c _ctypes_callproc + 792
  [bt] (7) 8   _ctypes.cpython-39-darwin.so        0x0000000103316a58 PyCFuncPtr_call + 236
  [bt] (8) 9   python3.9                           0x0000000102abd2d0 _PyEval_EvalFrameDefault + 54472

