## The purprose of this notebook is to research on sql fine tuned embedding models from hugging face
* The main objective is to assess how well this model can provide an accurate cosine similarity score between two sql queries.
* We will compare this with continuous_eval sql metrics score and SequenceMatcher from difflib 

In [1]:
from typing import Dict, List
from difflib import SequenceMatcher
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from continuous_eval.metrics.code.sql import SQLASTSimilarity
from sqlglot import parse_one
from sqlglot.diff import diff

In [2]:
embedding_model_a = SentenceTransformer("s2593817/sft-sql-embedding")
embedding_model_b = SentenceTransformer("RaduGabriel/BGE-M3-SQL")
embedding_model_c = SentenceTransformer("dat-ai/bge-base-for_text2sql")
SQLAST_METRIC = SQLASTSimilarity()

In [3]:
def calculate_cosine_similarity(query_a:str, query_b:str, embedding_model: SentenceTransformer) -> float:
    embeddings = embedding_model.encode([query_a, query_b])
    cosine_similarity_score = embedding_model.similarity(embeddings, embeddings)
    return cosine_similarity_score[0][1]

def calculate_sequence_matcher_similarity(query_a:str, query_b:str) -> float:
    return SequenceMatcher(None, query_a, query_b).ratio()

def calculate_sql_ast_similarity(query_a: str, query_b: str) -> float:
    return SQLAST_METRIC.compute(answer=query_a, ground_truth_answers=query_b)["SQL_AST_Similarity"]

def compare_scores(queries: Dict[str, Dict[str, List[str]]],
                   embedding_models: Dict[str, SentenceTransformer]
                  ) -> pd.DataFrame:
    dfs = []
    for model_name, embedding_model in embedding_models.items():
        cosine_scores = []
        seq_matcher_scores = []
        ast_scores = []
        categories = []
        sub_categories = []
        queries_a = []
        queries_b = []
        for category, sub_category_queries in queries.items():
            for sub_category, query_list in sub_category_queries.items():
                query_a, query_b = query_list
                cosine_similarity_score = calculate_cosine_similarity(query_a, query_b, embedding_model)
                sequence_matcher_score = calculate_sequence_matcher_similarity(query_a, query_b)
                sql_ast_metric = calculate_sql_ast_similarity(query_a, query_b)
                cosine_scores.append(cosine_similarity_score)
                seq_matcher_scores.append(sequence_matcher_score)
                ast_scores.append(sql_ast_metric)
                categories.append(category)
                sub_categories.append(sub_category)
                queries_a.append(query_a)
                queries_b.append(query_b)
        df = pd.DataFrame({
            "model_name": [model_name]*len(cosine_scores),
            "category": categories,
            "sub_categories": sub_categories,
            "cosine_scores": cosine_scores,
            "ast_scores": ast_scores,
            "seq_matcher_score": seq_matcher_scores,
            "query_a": query_a,
            "query_b": query_b
        })
        dfs.append(df)
    return pd.concat(dfs)

In [4]:
QUERIES  = {
    "DIFFERENT MEANING":{
        "DIFFERENT_QUERY_STRUCTURE":[
            """
                SELECT
                    age,
                    hire_date,
                    department,
                    tenure
                FROM
                    employee_table
                QUALIFY max(salary) over (partition by hire_date) = salary
            """,
            """
                SELECT
                    max(salary)
                FROM 
                    employee_table
                GROUP BY age
            """
        ],
        "logic_filter_comparison":[
            """
                SELECT
                    DISTINCT ACCOUNT_ID
                FROM 
                    transactions
                WHERE
                    TRANSACTION_DATE > '2025-07-01'
            """,
             """
                SELECT
                    DISTINCT ACCOUNT_ID
                FROM 
                    transactions
                WHERE
                    TRANSACTION_DATE < '2025-07-01'
            """ 
        ],
        "logic_filter_equal_check":[
            """
                SELECT
                    DISTINCT ACCOUNT_ID
                FROM 
                    transactions
                WHERE
                    TRANSACTION_DATE <> '2025-07-01'
            """,
             """
                SELECT
                    DISTINCT ACCOUNT_ID
                FROM 
                    transactions
                WHERE
                    TRANSACTION_DATE = '2025-07-01'
            """
        ],
        "col name changes": [
            """
                SELECT
                    department
                FROM
                    employee_table
                QUALIFY max(salary) over (partition by hire_date) = salary
            """,
            """
                SELECT
                    employee_name 
                FROM
                    employee_table
                QUALIFY max(hire_date) over (partition by department) = hire_date
            """
        ],
        "having versus where":[
            "SELECT department FROM employees WHERE salary > 100000",
            "SELECT department FROM employees GROUP BY department HAVING AVG(salary) > 100000"
        ],
        "difference joins":[
            "SELECT e.name, d.name FROM employees e INNER JOIN departments d ON e.dept_id = d.id",
            "SELECT e.name, d.name FROM employees e LEFT JOIN departments d ON e.dept_id = d.id"
        ]
    },
    "SAME MEAING": {
        "column order change only":[
            """
                SELECT 
                    log_date,
                    device_name,
                    department,
                    application_name,
                    device_id
                FROM
                    device_tracking
                ORDER BY log_date, device_id
            """,
            """
                SELECT 
                    device_id,
                    device_name,
                    application_name,
                    department,
                    log_date
                FROM
                    device_tracking
                ORDER BY device_id, log_date
            """
        ],
        "aliases":[
            """
                SELECT 
                    tbl.log_date as "log date",
                    tbl.device_name as "device name",
                    tbl.department,
                    tbl.application_name,
                    tbl.device_id as "device id"
                FROM
                    device_tracking as tbl
                GROUP BY 1,2,3,4,5
            """,
            """
                SELECT 
                    device_id,
                    device_name,
                    application_name,
                    department,
                    log_date
                FROM
                    device_tracking
                GROUP BY ALL
            """                
        ],
        "date function between":[
            """
                SELECT
                    customer_name,
                    create_date,
                    last_login_time
                FROM 
                    customers 
                WHERE 
                    last_login_time >= '2025-07-11' and last_login_time <= '2025-07-15'
            """,
            """
                SELECT
                    customer_name,
                    create_date,
                    last_login_time
                FROM 
                    customers 
                WHERE 
                    last_login_time BETWEEN '2025-07-11' and '2025-07-15'
            """
        ],
        "date function custom":[
            """
                SELECT
                    customer_name,
                    create_date,
                    last_login_time
                FROM 
                    customers 
                WHERE 
                    last_login_time >= TRUNC(SYSDATE) - 7
            """,
            """
                SELECT
                    customer_name,
                    create_date,
                    last_login_time
                FROM 
                    customers 
                WHERE 
                    last_login_time >= CURRENT_DATE - INTERVAL '7' DAY
            """
        ],
        "extract versus day/year":[
            """
                SELECT
                    customer_name,
                    YEAR(create_date) as year,
                    DAY(last_login_time) as DAY
                FROM 
                    customers 
            """,
            """
                SELECT
                    customer_name,
                    EXTRACT(YEAR FROM create_date) as year,
                    EXTRACT(DAY FROM last_login_time) as DAY
                FROM 
                    customers 
            """
        ],
        "regex and str":[
            """
                SELECT
                    LEFT(DeviceName, 7),
                    RIGHT(DeviceName, 3),
                    IpSource || '-' || IpTarget,
                    COUNT(*)
                FROM servers
                WHERE DeviceName Like '%HOME%' and IpSource != '172.16.14.16'
            """,
            """
                SELECT
                    SUBSTRING(DeviceName, 1, 7),
                    SUBSTRING(DeviceName, LEN(DeviceName)-2, LEN(DeviceName)),
                    CONCAT(IpSource, '-', IpTarget),
                    COUNT(1),
                FROM servers
                WHERE DeviceName REGEXP '%HOME%' and IpSource <> '172.16.14.16'
            """ 
        ],
        "qualify versus cte":[
            """
                SELECT
                    USER_ID,
                    SIGN_UP_DATE,
                    STATE,
                    PURCHASE
                FROM 
                    CUSTOMER_PURCHASES
                QUALIFY MAX(SIGN_UP_DATE) OVER (PARTITION BY STATE) = SIGN_UP_DATE
            """,
            """
                WITH stateData As (
                    SELECT
                        USER_ID,
                        SIGN_UP_DATE,
                        STATE,
                        PURCHASE,
                        ROW_NUMBER() OVER (PARTITION BY STATE) as row_number
                    FROM
                        CUSTOMER_PURCHASES        
                )
                SELECT
                    USER_ID,
                    SIGN_UP_DATE,
                    STATE,
                    PURCHASE
                FROM stateData
                WHERE row_number = 1
            """
        ],
        "cte versus subquery":[
            """
                WITH stateData As (
                    SELECT
                        USER_ID,
                        SIGN_UP_DATE,
                        STATE,
                        PURCHASE,
                        ROW_NUMBER() OVER (PARTITION BY STATE) as row_number
                    FROM
                        CUSTOMER_PURCHASES        
                )
                SELECT
                    USER_ID,
                    SIGN_UP_DATE,
                    STATE,
                    PURCHASE
                FROM stateData
                WHERE row_number = 1
            """,
            """
                SELECT
                    USER_ID,
                    SIGN_UP_DATE,
                    STATE,
                    PURCHASE
                FROM (
                    SELECT
                        USER_ID,
                        SIGN_UP_DATE,
                        STATE,
                        PURCHASE,
                        ROW_NUMBER() OVER (PARTITION BY STATE) as row_number
                    FROM
                        CUSTOMER_PURCHASES
                ) 
                WHERE row_number = 1
            """
        ],
        "qualify versus subquery":[
             """
                SELECT
                    USER_ID,
                    SIGN_UP_DATE,
                    STATE,
                    PURCHASE
                FROM 
                    CUSTOMER_PURCHASES
                QUALIFY MAX(SIGN_UP_DATE) OVER (PARTITION BY STATE) = SIGN_UP_DATE
            """,
            """
                SELECT
                    USER_ID,
                    SIGN_UP_DATE,
                    STATE,
                    PURCHASE
                FROM (
                    SELECT
                        USER_ID,
                        SIGN_UP_DATE,
                        STATE,
                        PURCHASE,
                        ROW_NUMBER() OVER (PARTITION BY STATE) as row_number
                    FROM
                        CUSTOMER_PURCHASES
                ) 
                WHERE row_number = 1
            """
        ],
        "over versus group by":[
            "SELECT department, MAX(salary) FROM employees GROUP BY department",
            "SELECT department, MAX(salary) OVER (PARTITION BY department) FROM employees"
        ],
        "join versus subquery":[
            "SELECT name FROM students WHERE class_id IN (SELECT id FROM classes WHERE teacher = 'Smith')",
            "SELECT name FROM students s LEFT JOIN classes c ON s.class_id = c.id WHERE c.teacher = 'Smith'"
        ]      
    }
}

EMBEDDING_MODELS = {
    "s2593817/sft-sql-embedding": embedding_model_a,
    "RaduGabriel/BGE-M3-SQL": embedding_model_b,
    "dat-ai/bge-base-for_text2sql": embedding_model_c    
}


In [5]:
result = compare_scores(
    queries=QUERIES,
    embedding_models=EMBEDDING_MODELS
)

### Display results by models

In [6]:
result[result.model_name == "s2593817/sft-sql-embedding"].sort_values(by=["category", "sub_categories"])

Unnamed: 0,model_name,category,sub_categories,cosine_scores,ast_scores,seq_matcher_score,query_a,query_b
0,s2593817/sft-sql-embedding,DIFFERENT MEANING,DIFFERENT_QUERY_STRUCTURE,tensor(0.2638),0.431818,0.637615,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
3,s2593817/sft-sql-embedding,DIFFERENT MEANING,col name changes,tensor(0.9711),0.1875,0.786241,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
5,s2593817/sft-sql-embedding,DIFFERENT MEANING,difference joins,tensor(0.9739),0.875,0.957576,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
4,s2593817/sft-sql-embedding,DIFFERENT MEANING,having versus where,tensor(0.5572),0.6,0.746269,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
1,s2593817/sft-sql-embedding,DIFFERENT MEANING,logic_filter_comparison,tensor(0.9975),0.75,0.995122,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
2,s2593817/sft-sql-embedding,DIFFERENT MEANING,logic_filter_equal_check,tensor(0.9796),0.75,0.992701,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
7,s2593817/sft-sql-embedding,SAME MEAING,aliases,tensor(0.5448),0.257576,0.420891,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
6,s2593817/sft-sql-embedding,SAME MEAING,column order change only,tensor(0.9959),0.904762,0.476821,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
13,s2593817/sft-sql-embedding,SAME MEAING,cte versus subquery,tensor(0.9756),0.807692,0.699559,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
8,s2593817/sft-sql-embedding,SAME MEAING,date function between,tensor(0.9567),0.675,0.952862,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...


In [7]:
result[result.model_name == "RaduGabriel/BGE-M3-SQL"].sort_values(by=["category", "sub_categories"])

Unnamed: 0,model_name,category,sub_categories,cosine_scores,ast_scores,seq_matcher_score,query_a,query_b
0,RaduGabriel/BGE-M3-SQL,DIFFERENT MEANING,DIFFERENT_QUERY_STRUCTURE,tensor(0.8125),0.431818,0.637615,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
3,RaduGabriel/BGE-M3-SQL,DIFFERENT MEANING,col name changes,tensor(0.9006),0.1875,0.786241,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
5,RaduGabriel/BGE-M3-SQL,DIFFERENT MEANING,difference joins,tensor(0.9568),0.875,0.957576,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
4,RaduGabriel/BGE-M3-SQL,DIFFERENT MEANING,having versus where,tensor(0.8720),0.6,0.746269,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
1,RaduGabriel/BGE-M3-SQL,DIFFERENT MEANING,logic_filter_comparison,tensor(0.9968),0.75,0.995122,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
2,RaduGabriel/BGE-M3-SQL,DIFFERENT MEANING,logic_filter_equal_check,tensor(0.9937),0.75,0.992701,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
7,RaduGabriel/BGE-M3-SQL,SAME MEAING,aliases,tensor(0.8752),0.257576,0.420891,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
6,RaduGabriel/BGE-M3-SQL,SAME MEAING,column order change only,tensor(0.9963),0.904762,0.476821,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
13,RaduGabriel/BGE-M3-SQL,SAME MEAING,cte versus subquery,tensor(0.9704),0.807692,0.699559,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
8,RaduGabriel/BGE-M3-SQL,SAME MEAING,date function between,tensor(0.9825),0.675,0.952862,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...


In [8]:
result[result.model_name == "dat-ai/bge-base-for_text2sql"].sort_values(by=["category", "sub_categories"])

Unnamed: 0,model_name,category,sub_categories,cosine_scores,ast_scores,seq_matcher_score,query_a,query_b
0,dat-ai/bge-base-for_text2sql,DIFFERENT MEANING,DIFFERENT_QUERY_STRUCTURE,tensor(0.6551),0.431818,0.637615,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
3,dat-ai/bge-base-for_text2sql,DIFFERENT MEANING,col name changes,tensor(0.7830),0.1875,0.786241,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
5,dat-ai/bge-base-for_text2sql,DIFFERENT MEANING,difference joins,tensor(0.8712),0.875,0.957576,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
4,dat-ai/bge-base-for_text2sql,DIFFERENT MEANING,having versus where,tensor(0.7904),0.6,0.746269,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
1,dat-ai/bge-base-for_text2sql,DIFFERENT MEANING,logic_filter_comparison,tensor(0.9942),0.75,0.995122,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
2,dat-ai/bge-base-for_text2sql,DIFFERENT MEANING,logic_filter_equal_check,tensor(0.9924),0.75,0.992701,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
7,dat-ai/bge-base-for_text2sql,SAME MEAING,aliases,tensor(0.7127),0.257576,0.420891,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
6,dat-ai/bge-base-for_text2sql,SAME MEAING,column order change only,tensor(0.9822),0.904762,0.476821,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
13,dat-ai/bge-base-for_text2sql,SAME MEAING,cte versus subquery,tensor(0.7602),0.807692,0.699559,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
8,dat-ai/bge-base-for_text2sql,SAME MEAING,date function between,tensor(0.9396),0.675,0.952862,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...


## Result Analysis:  

* s2593817/sft-sql-embedding appear to be the best overall
* RaduGabriel/BGE-M3-SQL produces high score for all
* dat-ai/bge-base-for_text2sql appear to be doing better for the same meaing ones but does poorly in negative cases

### All embedding models does poorly in logic filters, column name changes in filtering/groupbys, and also joins

In [9]:
result[result.sub_categories.isin(["logic_filter_comparison", "logic_filter_equal_check", 
                                  "col name changes", "difference joins"])].sort_values(by="sub_categories")

Unnamed: 0,model_name,category,sub_categories,cosine_scores,ast_scores,seq_matcher_score,query_a,query_b
3,s2593817/sft-sql-embedding,DIFFERENT MEANING,col name changes,tensor(0.9711),0.1875,0.786241,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
3,RaduGabriel/BGE-M3-SQL,DIFFERENT MEANING,col name changes,tensor(0.9006),0.1875,0.786241,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
3,dat-ai/bge-base-for_text2sql,DIFFERENT MEANING,col name changes,tensor(0.7830),0.1875,0.786241,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
5,s2593817/sft-sql-embedding,DIFFERENT MEANING,difference joins,tensor(0.9739),0.875,0.957576,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
5,RaduGabriel/BGE-M3-SQL,DIFFERENT MEANING,difference joins,tensor(0.9568),0.875,0.957576,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
5,dat-ai/bge-base-for_text2sql,DIFFERENT MEANING,difference joins,tensor(0.8712),0.875,0.957576,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
1,s2593817/sft-sql-embedding,DIFFERENT MEANING,logic_filter_comparison,tensor(0.9975),0.75,0.995122,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
1,RaduGabriel/BGE-M3-SQL,DIFFERENT MEANING,logic_filter_comparison,tensor(0.9968),0.75,0.995122,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
1,dat-ai/bge-base-for_text2sql,DIFFERENT MEANING,logic_filter_comparison,tensor(0.9942),0.75,0.995122,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
2,s2593817/sft-sql-embedding,DIFFERENT MEANING,logic_filter_equal_check,tensor(0.9796),0.75,0.992701,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...


### All embedding models does inconsistently well in similar meaning ones especially in join versus subquery, over versus group by and alias

In [10]:
result[result.sub_categories.isin(["join versus subquery", "aliases", "over versus group by"])].sort_values(by="sub_categories")

Unnamed: 0,model_name,category,sub_categories,cosine_scores,ast_scores,seq_matcher_score,query_a,query_b
7,s2593817/sft-sql-embedding,SAME MEAING,aliases,tensor(0.5448),0.257576,0.420891,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
7,RaduGabriel/BGE-M3-SQL,SAME MEAING,aliases,tensor(0.8752),0.257576,0.420891,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
7,dat-ai/bge-base-for_text2sql,SAME MEAING,aliases,tensor(0.7127),0.257576,0.420891,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
16,s2593817/sft-sql-embedding,SAME MEAING,join versus subquery,tensor(0.5896),0.134615,0.688172,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
16,RaduGabriel/BGE-M3-SQL,SAME MEAING,join versus subquery,tensor(0.8486),0.134615,0.688172,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
16,dat-ai/bge-base-for_text2sql,SAME MEAING,join versus subquery,tensor(0.7393),0.134615,0.688172,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
15,s2593817/sft-sql-embedding,SAME MEAING,over versus group by,tensor(0.7444),0.708333,0.638298,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
15,RaduGabriel/BGE-M3-SQL,SAME MEAING,over versus group by,tensor(0.9446),0.708333,0.638298,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...
15,dat-ai/bge-base-for_text2sql,SAME MEAING,over versus group by,tensor(0.8470),0.708333,0.638298,SELECT name FROM students WHERE class_id IN (S...,SELECT name FROM students s LEFT JOIN classes ...


### The embedding model is good at handling minor syntactic variations where the keywords and structure remain similar.

### Conclusion:
This embedding model is effective at matching queries with minor syntactic differences but fails to capture the core logic of SQL. It cannot reliably distinguish between operators, join types, or clause functions (WHERE vs. HAVING), leading to dangerously incorrect assessments of similarity.