In [21]:
import pandas as pd
import numpy as np
import iris
import time
import os
from sentence_transformers import SentenceTransformer

Get database cursor

In [22]:
# Connection settings for the IRIS instance. Every value can be overridden
# through an environment variable so credentials do not have to live in the
# notebook; the defaults reproduce the previous hardcoded demo configuration.
username = os.getenv('IRIS_USERNAME', 'demo')
password = os.getenv('IRIS_PASSWORD', 'demo')
hostname = os.getenv('IRIS_HOSTNAME', 'localhost')
port = os.getenv('IRIS_PORT', '1972')
namespace = os.getenv('IRIS_NAMESPACE', 'USER')

# Connection target in "host:port/namespace" form; it contains no credentials,
# so it is safe to echo.
CONNECTION_STRING = f"{hostname}:{port}/{namespace}"
print(CONNECTION_STRING)

# Open the connection and obtain the cursor used by the cells below.
conn = iris.connect(CONNECTION_STRING, username, password)
cursor = conn.cursor()

localhost:1972/USER


In [40]:
# Sentence-embedding model used to turn text into vectors.
vectorizer = SentenceTransformer('all-MiniLM-L6-v2')

# Search phrases for three severity tiers. Only the "high" phrase is embedded
# and queried in this cell.
# NOTE(review): "aggresor" is misspelled ("aggressor"); it is left untouched
# here because correcting it changes the embedding and therefore the rows
# returned -- confirm before fixing.
searchPhraseHigh = "violent aggresor and sexual assault mugged on street, arson, harassment"
searchPhraseMid = "pickpocket, anti social loitering, loud disturbance"
searchPhraseLow = "house robbery, loud disturbance, vandalism, drug"

tableName = "SafeRoute.CrimeDataSample"

# Embed the phrase as a unit-length vector and convert it to a plain list so
# it can be passed to the database as a string parameter.
searchVectorHigh = vectorizer.encode(searchPhraseHigh, normalize_embeddings=True).tolist()

# Rank rows by dot product against the search vector and keep the top matches.
# The row limit and the vector are bound as parameters; the table name is a
# constant defined above and is interpolated into the statement.
sql = f"""
    SELECT TOP ? ContextVector
    FROM {tableName}
    ORDER BY VECTOR_DOT_PRODUCT(ContextVector, TO_VECTOR(?)) DESC
    """

numberOfResults = 100

cursor.execute(sql, [numberOfResults, str(searchVectorHigh)])
results = cursor.fetchall()

# Fail with a clear message instead of a bare IndexError when the table is
# empty; later cells need at least one row.
assert results, f"No rows returned from {tableName}"
print(results[0])

('.068094633519649505616,.077210649847984313964,.0081191509962081909179,.076431825757026672363,.018038064241409301757,.087142668664455413818,-.030162297189235687256,.0075939875096082687377,.015243197791278362274,.038865014910697937011,.075689397752285003662,.010351593606173992156,-.0042481580749154090881,-.012982134707272052764,.059658221900463104248,-.064257353544235229492,.063087590038776397706,-.042173355817794799804,-.024559341371059417724,.027928924188017845153,.045776147395372390747,-.0040774010121822357177,.12102079391479492187,-.037675686180591583251,-.084656938910484313964,-.031777221709489822387,.0097643351182341575622,.063562840223312377929,-.038751617074012756347,.039677679538726806641,-.010945607908070087432,.055890697985887527466,.081395626068115234375,.067734926939010620117,-.029590306803584098816,-.061009660363197326661,.015857379883527755737,.019118100404739379882,.11675021052360534667,-.0025196834467351436614,.033485267311334609986,-.079512991011142730712,.04476225748

In [41]:
# Column 0 of every fetched row is a comma-separated string of numbers.
# Parse each string into a float32 vector, collecting one vector per row.
vector_list = []
for record in results:
    components = [float(token) for token in record[0].split(',')]
    vector_list.append(np.array(components, dtype=np.float32))

# Combine the per-row 1D vectors into a single 2D array (one row per result).
transformedResults = np.vstack(vector_list)
print(transformedResults[0])


[ 6.80946335e-02  7.72106498e-02  8.11915100e-03  7.64318258e-02
  1.80380642e-02  8.71426687e-02 -3.01622972e-02  7.59398751e-03
  1.52431978e-02  3.88650149e-02  7.56893978e-02  1.03515936e-02
 -4.24815807e-03 -1.29821347e-02  5.96582219e-02 -6.42573535e-02
  6.30875900e-02 -4.21733558e-02 -2.45593414e-02  2.79289242e-02
  4.57761474e-02 -4.07740101e-03  1.21020794e-01 -3.76756862e-02
 -8.46569389e-02 -3.17772217e-02  9.76433512e-03  6.35628402e-02
 -3.87516171e-02  3.96776795e-02 -1.09456079e-02  5.58906980e-02
  8.13956261e-02  6.77349269e-02 -2.95903068e-02 -6.10096604e-02
  1.58573799e-02  1.91181004e-02  1.16750211e-01 -2.51968345e-03
  3.34852673e-02 -7.95129910e-02  4.47622575e-02 -1.17922481e-02
 -4.04903144e-02  4.07044962e-02 -1.95300800e-03  3.43534537e-03
 -2.86006257e-02 -8.06367546e-02 -2.81627309e-02  6.06175996e-02
  6.44660965e-02  4.16093022e-02 -4.72222418e-02 -8.45388323e-02
  2.74485326e-03 -2.47324426e-02  4.99067754e-02  4.98857396e-03
  3.62810353e-03  7.51898

In [42]:
# Bounds of the uniform distribution the placeholder target scores are drawn from.
scoreDistLow = .7
scoreDistHigh = .9

# Seed a dedicated generator so the targets (and the model fitted on them)
# are identical on every Restart & Run All.
labelSeed = 42
rng = np.random.default_rng(labelSeed)

# One random score per retrieved vector. These targets depend only on the
# number of rows, not on the vector contents.
y = rng.uniform(
    scoreDistLow,
    scoreDistHigh,
    size=transformedResults.shape[0],
).astype(np.float32)

print(y)

[0.7163586  0.8116356  0.7397518  0.71133196 0.8404099  0.8885317
 0.77483684 0.846201   0.86440974 0.84002036 0.8581841  0.8235139
 0.8909077  0.76235753 0.797786   0.7895199  0.8568604  0.7101625
 0.7432157  0.70389146 0.8542718  0.8588161  0.715536   0.7636923
 0.7556209  0.8072867  0.86376566 0.8638723  0.75317174 0.70326054
 0.75147206 0.73916084 0.78645945 0.8921305  0.8373218  0.872711
 0.8474694  0.891756   0.87183386 0.74259776 0.7493658  0.73323375
 0.8483808  0.859982   0.7002381  0.72958255 0.873878   0.74976903
 0.8751281  0.79249984 0.75518405 0.73936576 0.72355086 0.8841522
 0.7016893  0.71281576 0.89891905 0.8448755  0.7070998  0.7433609
 0.8319262  0.8275122  0.7233414  0.7554847  0.74547464 0.7507219
 0.83875775 0.8293309  0.814802   0.8430221  0.7017178  0.7288917
 0.861541   0.8005675  0.85442674 0.8351977  0.7805335  0.7570866
 0.73067397 0.7195586  0.85607976 0.7827006  0.8142853  0.73403454
 0.71212286 0.80767065 0.8181821  0.8346734  0.8933896  0.761099
 0.87030

In [26]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable


In [53]:

import xgboost

# Hyperparameters for the gradient-boosted regressor, gathered in one place.
regressorParams = dict(
    objective="reg:squarederror",
    n_estimators=384,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,
)
model = xgboost.XGBRegressor(**regressorParams)

# Fit on the retrieved vectors (features) against the generated scores (targets).
model.fit(transformedResults, y)

# Probe phrase to score. Alternatives kept for experimentation:
#   "sexual mugging with arson and blue jays"
#   "walking steadily. Light. Boring."
test = ""

# Embed the probe phrase and shape it as a single-row float32 feature matrix,
# which is the 2D layout predict() is given here.
probeEmbedding = vectorizer.encode(test, normalize_embeddings=True)
res = np.array(probeEmbedding, dtype=np.float32).reshape(1, -1)

print(model.predict(res))




[0.78584534]
