Final Project: Fuzzy Matching

In [68]:
# Shared configuration for the notebook.
project_id = "group-5-448704"  # GCP project that owns the dataset and the connection
dataset = "fin_player_dedup"  # BigQuery dataset that holds the dedup tables
region = "us-central1"  # location used for the dataset and the connection
connection_id = "vertex-connection"  # connection name passed to `bq mk --connection`
embedding_model = "text-embedding-005"  # text-embedding endpoint name
gemini_model = "gemini-2.5-flash-preview-04-17"  # Gemini model name


Part 1: Setup

Create BQ dataset

In [69]:
from google.cloud import bigquery

# Bind the client to the configured project explicitly (as the clustering cell
# does) instead of relying on whatever default project the environment provides.
bq_client = bigquery.Client(project=project_id)

# Fully-qualified dataset, placed in the configured region.
dataset_id = bigquery.Dataset(f"{project_id}.{dataset}")
dataset_id.location = region

# exists_ok=True keeps this cell re-runnable when the dataset already exists.
bq_client.create_dataset(dataset_id, exists_ok=True)

Dataset(DatasetReference('group-5-448704', 'fin_player_dedup'))

Create a connection resource

In [None]:
# Create a CLOUD_RESOURCE connection in the configured project and region.
# Re-running this once the connection exists prints an "Already Exists" error
# (see the recorded output).
!bq mk --connection --location=$region --project_id=$project_id \
    --connection_type=CLOUD_RESOURCE $connection_id

BigQuery error in mk operation: Already Exists: Connection projects/746118679557/locations/us-
central1/connections/vertex-connection


In [None]:
# Show the connection details; the "properties" column of the output carries the
# connection's serviceAccountId.
# NOTE(review): the project number and location are hard-coded here rather than
# taken from the configuration variables.
!bq show --connection 746118679557.us-central1.vertex-connection

Connection 746118679557.us-central1.vertex-connection

                     name                      friendlyName   description    Last modified         type        hasCredential                                            properties                                            
 -------------------------------------------- -------------- ------------- ----------------- ---------------- --------------- ----------------------------------------------------------------------------------------------- 
  746118679557.us-central1.vertex-connection                                30 Apr 22:25:08   CLOUD_RESOURCE   False           {"serviceAccountId": "bqcx-746118679557-62yt@gcp-sa-bigquery-condel.iam.gserviceaccount.com"}  



In [None]:
# Grant roles/aiplatform.user on the project to the connection's service account
# (the serviceAccountId printed by `bq show`).
# Presumably this is what lets the remote model call Vertex AI - confirm.
# NOTE(review): the service-account address is hard-coded.
!gcloud projects add-iam-policy-binding $project_id --member='serviceAccount:bqcx-746118679557-62yt@gcp-sa-bigquery-condel.iam.gserviceaccount.com' \
  --role='roles/aiplatform.user' --no-user-output-enabled

In [70]:
%%bigquery
-- Register a remote model over the Vertex connection; it is used later by
-- ML.GENERATE_EMBEDDING.
-- NOTE(review): the connection path and endpoint are hard-coded and must be kept
-- in sync with the configuration cell by hand.
CREATE OR REPLACE MODEL fin_player_dedup.embedding_model
REMOTE WITH CONNECTION `projects/group-5-448704/locations/us-central1/connections/vertex-connection`
OPTIONS (endpoint = 'text-embedding-005');


Query is running:   0%|          |

Sample input data

In [None]:
%%bigquery
-- Preview the staging table, sorted by name so repeated rows for the same
-- player appear next to each other.
SELECT * FROM football_dataset_stg.renamed_2024_player_predictions
ORDER BY player_name;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,player_name,height,weight,college,_data_source,_load_time
0,A.J. Brown,73,226,Mississippi,Kaggle,2025-02-07 02:08:07.176792+00:00
1,A.J. Brown,73,226,Mississippi,Kaggle,2025-02-07 03:02:59.850039+00:00
2,Aaron Jones,69,208,UTEP,Kaggle,2025-02-07 02:08:07.176792+00:00
3,Aaron Jones,69,208,UTEP,Kaggle,2025-02-07 03:02:59.850039+00:00
4,Aaron Rodgers,74,223,California,Kaggle,2025-02-07 02:08:07.176792+00:00
...,...,...,...,...,...,...
347,Zamir White,72,215,Georgia,Kaggle,2025-02-07 03:02:59.850039+00:00
348,Zay Flowers,69,175,Boston College,Kaggle,2025-02-07 02:08:07.176792+00:00
349,Zay Flowers,69,175,Boston College,Kaggle,2025-02-07 03:02:59.850039+00:00
350,Zay Jones,74,200,East Carolina,Kaggle,2025-02-07 02:08:07.176792+00:00


Create a copy of the table with a UUID column, since the original table has no unique row identifier

In [None]:
%%bigquery
-- Copy the staging table and add a generated uuid per row so individual rows
-- can be referenced by the later steps.
-- Note: re-running this cell regenerates every uuid, so all downstream tables
-- must be rebuilt afterwards.
CREATE OR REPLACE TABLE football_dataset_stg.renamed_2024_player_predictions_with_uuid AS
SELECT
  GENERATE_UUID() AS uuid,
  *
FROM football_dataset_stg.renamed_2024_player_predictions;

Query is running:   0%|          |

Part 2: Create the embeddings

Create the embeddings on the football player's name, height, weight, and college.

In [None]:
%%bigquery

-- Build one text string per row (name, height, weight, college) and embed it
-- with the remote embedding model; the result keeps uuid, content and embedding.
-- NOTE(review): rows whose content is NULL are filtered out below; presumably
-- CONCAT yields NULL when any input column is NULL, in which case such rows
-- never receive an embedding - confirm.
CREATE OR REPLACE TABLE fin_player_dedup.player_embeddings AS (
  WITH player_content AS (
    SELECT
      uuid,
      CONCAT(player_name, ' height: ', height, ' weight: ', weight, ' college: ', college) AS content
    FROM football_dataset_stg.renamed_2024_player_predictions_with_uuid
  )
  SELECT
    uuid,
    content,
    ml_generate_embedding_result AS embedding
  FROM
    ML.GENERATE_EMBEDDING(
      MODEL fin_player_dedup.embedding_model,
      (SELECT uuid, content FROM player_content WHERE content IS NOT NULL),
      STRUCT('CLUSTERING' AS task_type)
    )
);



Query is running:   0%|          |

In [None]:
%%bigquery
-- Inspect the generated embeddings (uuid, content, embedding).
SELECT * FROM fin_player_dedup.player_embeddings

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,uuid,content,embedding
0,1c90ac62-cbfe-4514-808e-53d98ac9addc,A.J. Brown height: 73 weight: 226 college: Mis...,"[-0.005469650495797396, 0.0002118218399118632,..."
1,b29a3f62-7166-4ece-9bbc-44c3c0ff7f09,A.J. Brown height: 73 weight: 226 college: Mis...,"[-0.005469650495797396, 0.0002118218399118632,..."
2,613d888f-4682-4d90-9924-b35c8ae64ef9,Aaron Jones height: 69 weight: 208 college: UTEP,"[-0.024767093360424042, 0.010256818495690823, ..."
3,5a40432b-8282-4614-a934-d42466ae5c03,Aaron Jones height: 69 weight: 208 college: UTEP,"[-0.024767093360424042, 0.010256818495690823, ..."
4,bccceb6b-b8e2-4837-846f-e7a3cc8aee11,Aaron Rodgers height: 74 weight: 223 college: ...,"[-0.005709583405405283, 0.05527830123901367, -..."
...,...,...,...
347,2c91c95e-7193-49e3-8753-6f61b64e57eb,Zamir White height: 72 weight: 215 college: Ge...,"[-0.012699887156486511, 0.0005488800234161317,..."
348,20a34d57-3d0c-4fa2-b9d2-784c55e7150f,Zay Flowers height: 69 weight: 175 college: Bo...,"[-0.029004564508795738, 0.028970832005143166, ..."
349,446d9cd5-8622-4f99-87b0-4ff9444b5bd9,Zay Flowers height: 69 weight: 175 college: Bo...,"[-0.029004564508795738, 0.028970832005143166, ..."
350,35ddbe73-2fe7-40ce-86f2-b82c1e0709f1,Zay Jones height: 74 weight: 200 college: East...,"[-0.026201268658041954, 0.04591625928878784, 0..."


Part 3: Find the nearest neighbors based on cosine distance

In [None]:
%%bigquery

-- Search the embeddings table against itself with cosine distance and store,
-- for each record, the neighbouring record(s) other than itself.
-- TOP_K is 2 presumably because the closest hit for a row is normally the row
-- itself, which the WHERE clause then removes - confirm.
-- NOTE(review): the ORDER BY on a CREATE TABLE ... AS SELECT is not expected to
-- give the stored table a guaranteed row order - confirm.
CREATE OR REPLACE TABLE fin_player_dedup.nearest_neighbors AS
SELECT
  query.uuid AS uuid,
  base.uuid AS nearest_neighbor,
  distance
FROM
  VECTOR_SEARCH(
    TABLE fin_player_dedup.player_embeddings,
    'embedding',
    TABLE fin_player_dedup.player_embeddings,
    'embedding',
    TOP_K => 2,
    DISTANCE_TYPE => 'COSINE')
WHERE query.uuid != base.uuid
ORDER BY distance;



Query is running:   0%|          |

Since our dataset contains exact duplicate rows for each player, the distance between each record and its nearest neighbor is 0.

In [72]:
%%bigquery

-- Preview neighbour pairs whose cosine distance is at most 0.3, closest first.
-- NOTE(review): the clustering cell in Part 4 filters with distance <= 0.01,
-- so this preview uses a looser threshold than the one actually applied.
SELECT * FROM fin_player_dedup.nearest_neighbors
WHERE distance <= 0.3
ORDER BY distance;



Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,uuid,nearest_neighbor,distance
0,e9dc6aea-a003-474b-9630-e7e4741e5c50,035c0974-c756-4950-90b0-2420ba7c88cf,0.0
1,94cfaa7c-0052-47f9-8737-1acea7f8abe9,1a10959c-d182-4914-ae27-563c49dd6d82,0.0
2,9a52bfbb-8c73-4ac4-9058-77add930de4b,e23a4331-e032-4a68-a945-ec21369f17e3,0.0
3,92de37f2-7935-472f-ba5a-78f28b701222,4177cb87-73a1-4787-8328-264fd0c445a0,0.0
4,81ca1854-12e2-4ff6-97f6-11c1b6d5fa50,ab8711c6-f0dd-4147-ac7b-3a1ecaa6abdd,0.0
...,...,...,...
347,74ad78d5-cdc7-4bd5-9f55-b7b5071b504b,948dac8e-119c-4c69-86e0-3e0e727e267e,0.0
348,0e22ba29-7b77-4fd3-b719-2602088ea3ca,b60f548d-164e-4b39-8ba5-d270b3262bfd,0.0
349,e0d92b22-c6f3-456e-8e09-c953027448fd,24ce91cb-08a1-4306-baa3-f3ebd2c96817,0.0
350,fbfdc2c2-db21-4cd0-a081-caf46143bc51,be1a02a9-d63a-4394-9a3d-97c50a8a3de5,0.0


Part 4: Assign unique cluster ids to the pairs of nearest neighbors which fall within our distance threshold

In [None]:
from google.cloud import bigquery
import pandas as pd
import pandas_gbq

project_id = "group-5-448704"
dataset = "fin_player_dedup"

# Only neighbour pairs within this cosine distance are treated as duplicates.
query = f"""
SELECT uuid, nearest_neighbor
FROM `{project_id}.{dataset}.nearest_neighbors`
WHERE distance <= 0.01
"""

bq_client = bigquery.Client(project=project_id)
rows = bq_client.query(query).result()


def assign_clusters(pairs):
    """Group uuids linked by neighbour pairs into clusters.

    Uses union-find so that every uuid lands in exactly one cluster, however
    the pairs are ordered and even when pairs chain together
    (A-B and B-C end up in the same cluster).

    Args:
        pairs: iterable of (uuid, nearest_neighbor) tuples.

    Returns:
        A list of (uuid, cluster_id) tuples, one per distinct uuid. Cluster ids
        start at 1 and are numbered in order of first appearance.
    """
    parent = {}

    def find(node):
        # Register unseen nodes as their own root.
        parent.setdefault(node, node)
        root = node
        while parent[root] != root:
            root = parent[root]
        # Path compression: point every node on the walked path at the root.
        while parent[node] != root:
            parent[node], node = root, parent[node]
        return root

    for left, right in pairs:
        root_left, root_right = find(left), find(right)
        if root_left != root_right:
            parent[root_right] = root_left

    cluster_ids = {}
    assignments = []
    # list(...) so that path compression cannot disturb the iteration.
    for node in list(parent):
        root = find(node)
        if root not in cluster_ids:
            cluster_ids[root] = len(cluster_ids) + 1
        assignments.append((node, cluster_ids[root]))
    return assignments


output = assign_clusters((row["uuid"], row["nearest_neighbor"]) for row in rows)

# Persist the uuid -> cluster_id mapping for the ranking step.
df = pd.DataFrame(output, columns=["uuid", "cluster_id"])
pandas_gbq.to_gbq(df, f"{dataset}.clusters", project_id=project_id, if_exists="replace")


100%|██████████| 1/1 [00:00<00:00, 6403.52it/s]


Part 5: Rank the names within each cluster to select the ones to keep

In [None]:
import json
from itertools import groupby

import vertexai
from vertexai.generative_models import GenerativeModel

# Every clustered record joined with its cluster id. The ORDER BY matters:
# the grouping below relies on records of one cluster being consecutive.
query = """
SELECT c.cluster_id, p.*
FROM fin_player_dedup.clusters c
JOIN football_dataset_stg.renamed_2024_player_predictions_with_uuid p
  ON c.uuid = p.uuid
ORDER BY c.cluster_id;
"""

prompt = """Select the most complete and accurate player record.
Return as JSON: {"uuid": "uuid-value"}. No explanations.
"""

bq_client = bigquery.Client(project=project_id)
rows = bq_client.query(query).result()

vertexai.init(project=project_id, location=region)
model = GenerativeModel(gemini_model)


def pick_record_to_keep(records):
    """Ask Gemini which record of a single cluster should be kept.

    Args:
        records: non-empty list of dicts (uuid, player_name, height, weight,
            college) that all belong to the same cluster.

    Returns:
        The uuid of the record to keep. If the Gemini call fails, its answer
        cannot be parsed, or it names a uuid that is not in the cluster, the
        first record's uuid is returned instead. Without a keeper the discard
        step would remove every record of the cluster.
    """
    candidate_uuids = [record["uuid"] for record in records]
    input_str = ",".join(json.dumps(record) for record in records)
    try:
        resp = model.generate_content([input_str, prompt])
        # Strip optional Markdown code fences and newlines around the JSON.
        clean = resp.text.replace("```json", "").replace("```", "").replace("\n", "")
        chosen = json.loads(clean)["uuid"]
    except Exception as e:
        print("⚠️ Gemini failed:", e)
        return candidate_uuids[0]
    if chosen not in candidate_uuids:
        print("⚠️ Gemini returned a uuid outside the cluster:", chosen)
        return candidate_uuids[0]
    return chosen


combined_results = []

# One Gemini call per cluster, containing exactly that cluster's records.
for cluster_id, cluster_rows in groupby(rows, key=lambda row: row["cluster_id"]):
    records = [
        {
            "uuid": row["uuid"],
            "player_name": row["player_name"],
            "height": row["height"],
            "weight": row["weight"],
            "college": row["college"],
        }
        for row in cluster_rows
    ]
    combined_results.append({"uuid": pick_record_to_keep(records)})

# Upload results
df = pd.DataFrame(combined_results, columns=["uuid"])
pandas_gbq.to_gbq(df, f"{dataset}.player_keep", project_id=project_id, if_exists="replace")


100%|██████████| 1/1 [00:00<00:00, 2788.77it/s]


Compute the list of records to discard and build the final deduplicated table

In [None]:
%%bigquery

-- Find discarded records
-- (every clustered uuid that was not selected into player_keep).
CREATE OR REPLACE TABLE fin_player_dedup.player_discard AS
SELECT uuid FROM fin_player_dedup.clusters
EXCEPT DISTINCT
SELECT uuid FROM fin_player_dedup.player_keep;

-- Final deduplicated player table
-- (all rows of the uuid table except those listed in player_discard; rows that
-- were never clustered are therefore kept).
CREATE OR REPLACE TABLE fin_player_dedup.player_final AS
SELECT * FROM football_dataset_stg.renamed_2024_player_predictions_with_uuid
WHERE uuid NOT IN (
  SELECT uuid FROM fin_player_dedup.player_discard
);


Query is running:   0%|          |

Conclusion: Using embeddings and vector similarity, we identified and removed duplicate player records. This approach can catch near-duplicates that exact-matching methods (e.g., GROUP BY) would miss, because it captures subtle differences in player names and metadata. However, for our dataset it is redundant, since the duplicates were exact copies of each row. Thus, in this specific case, we would not incorporate this step into the int layer, because there are much simpler ways to remove exact duplicates. The use of embeddings could still be very effective on other tables that have inconsistent naming.