<a href="https://colab.research.google.com/github/orekhovsky/GenAI-mini-projects/blob/main/text2sql_examples.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install rouge-score -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


In [2]:
import pandas as pd
from google.colab import userdata
# Load the Text-to-SQL dataset (JSON Lines: one record per line) from the hf:// path into a DataFrame.
df = pd.read_json("hf://datasets/Clinton/Text-to-sql-v1/texttosqlv2.jsonl", lines=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Display the loaded DataFrame to inspect its columns and a sample of rows.
df

Unnamed: 0,instruction,input,response,source,text
0,Name the home team for carlton away team,CREATE TABLE table_name_77 (\n home_team VA...,SELECT home_team FROM table_name_77 WHERE away...,sql_create_context,Below are sql tables schemas paired with instr...
1,what will the population of Asia be when Latin...,"CREATE TABLE table_22767 (\n ""Year"" real,\n...","SELECT ""Asia"" FROM table_22767 WHERE ""Latin Am...",wikisql,Below are sql tables schemas paired with instr...
2,How many faculty members do we have for each g...,"CREATE TABLE Student (\n StuID INTEGER,\n ...","SELECT Sex, COUNT(*) FROM Faculty GROUP BY Sex...",nvbench,Below are sql tables schemas paired with instr...
3,List the record of 0-1 from the table?,CREATE TABLE table_14656147_2 (\n week VARC...,SELECT week FROM table_14656147_2 WHERE record...,sql_create_context,Below are sql tables schemas paired with instr...
4,"Which silver has a Gold smaller than 12, a Ran...",CREATE TABLE table_name_24 (\n silver VARCH...,SELECT silver FROM table_name_24 WHERE gold < ...,sql_create_context,Below are sql tables schemas paired with instr...
...,...,...,...,...,...
262203,Most viewed posts by tag w/o tag in post title.,"CREATE TABLE Comments (\n Id number,\n P...","SELECT Id AS ""post_link"", Tags, ViewCount, Cre...",sede,Below are sql tables schemas paired with instr...
262204,What is the time when ss12 is stage?,"CREATE TABLE table_24485 (\n ""Day"" text,\n ...","SELECT ""Time"" FROM table_24485 WHERE ""Stage"" =...",wikisql,Below are sql tables schemas paired with instr...
262205,count the number of patients whose year of dea...,CREATE TABLE demographic (\n subject_id tex...,SELECT COUNT(DISTINCT demographic.subject_id) ...,mimicsql_data,Below are sql tables schemas paired with instr...
262206,What was the styled used to defeat the opponen...,CREATE TABLE table_18598175_2 (\n method_of...,SELECT method_of_elimination FROM table_185981...,sql_create_context,Below are sql tables schemas paired with instr...


In [4]:
import nltk
# Fetch the NLTK WordNet corpora; presumably required by the METEOR scorer used later — TODO confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
import pandas as pd
import requests
import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

# Функция для вызова модели через API
def generate_sql_code(instruction, table_schema, api_key, model="meta-llama/llama-3.2-1b-instruct", timeout=60):
    """Ask an OpenRouter chat model to write SQL for a single instruction.

    Args:
        instruction: Natural-language request to turn into SQL.
        table_schema: Schema text interpolated into the prompt.
        api_key: OpenRouter API key, sent as a Bearer token.
        model: OpenRouter model identifier. Defaults to the model this
            function used to hard-code, so existing three-argument calls
            behave as before.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole evaluation loop.

    Returns:
        The ``content`` string of the first returned choice, or the literal
        ``'No content found'`` when the response carries no usable choice.

    Raises:
        RuntimeError: If the API answers with a non-200 status code.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    prompt = f"""
    You are an expert SQL query generator. Answer only with valid SQL code, nothing else.
    Instruction: {instruction}
    Table Schema: {table_schema}
    SQL code:
    """
    input_data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}]
    }
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        data=json.dumps(input_data),
        timeout=timeout,
    )
    if response.status_code != 200:
        raise RuntimeError(f"API request failed with status {response.status_code}: {response.text}")

    data = response.json()
    # `choices` may be missing, null or an empty list; fall back to an empty
    # choice so that indexing cannot raise IndexError.
    choices = data.get('choices') or [{}]
    message = choices[0].get('message') or {}
    # NOTE(review): the fallback string is scored like a real answer by callers.
    return message.get('content', 'No content found')

# Функция для вычисления метрик
def calculate_metrics(true_answer, model_response):
    """Score a generated text against its reference with BLEU, METEOR and ROUGE.

    Args:
        true_answer: Reference text.
        model_response: Generated text to score.

    Returns:
        Tuple ``(bleu, meteor, rouge1_f, rouge2_f, rougeL_f)`` where the ROUGE
        entries are the F-measures reported by the scorer.
    """
    rouge_kinds = ['rouge1', 'rouge2', 'rougeL']
    rouge = rouge_scorer.RougeScorer(rouge_kinds, use_stemmer=True).score(true_answer, model_response)

    # BLEU and METEOR both work on whitespace-separated tokens.
    reference_tokens = true_answer.split()
    candidate_tokens = model_response.split()

    bleu = sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method1,
    )
    meteor = meteor_score([reference_tokens], candidate_tokens)

    rouge_f = [rouge[kind].fmeasure for kind in rouge_kinds]
    return (bleu, meteor, *rouge_f)

import sqlite3

def test_sql_query(table_schema, sql_query):
    try:
        # Создаем временную базу данных SQLite в памяти
        conn = sqlite3.connect(":memory:")
        cursor = conn.cursor()

        # Выполняем создание таблицы из схемы
        cursor.executescript(table_schema)

        # Выполняем сгенерированный SQL-запрос
        cursor.execute(sql_query)
        result = cursor.fetchall()

        # Закрываем соединение
        conn.close()

        return {"success": True, "result": result}
    except Exception as e:
        return {"success": False, "error": str(e)}

# Основной код
def evaluate_model(df, api_key, model_name="SQL Query Generator", summary_df=None, n_rows=40):
    """Generate SQL for the leading rows of ``df`` and summarise the resulting metrics.

    For every evaluated row the response comes from ``generate_sql_code``, is
    executed against the row's schema by ``test_sql_query`` and is compared
    with the reference answer by ``calculate_metrics``. A row whose processing
    raises is reported with ``print`` and left out of the results.

    Args:
        df: DataFrame with ``instruction``, ``input`` (table schema) and
            ``response`` (reference SQL) columns.
        api_key: API key forwarded to ``generate_sql_code``.
        model_name: Label stored in the ``Model`` column of the summary; it is
            not forwarded to ``generate_sql_code``.
        summary_df: Optional summary to append to. ``None`` or a completely
            empty DataFrame starts a new summary.
        n_rows: Number of leading rows of ``df`` to evaluate.

    Returns:
        ``(metrics_df, summary_df)``: per-example metrics, and the summary with
        one appended row of averages. ``Correct Responses (%)`` is the share
        of rows whose query executed without error. When no row could be
        scored the averages are NaN instead of raising ``KeyError``.
    """
    metric_columns = ["BLEU", "METEOR", "ROUGE-1", "ROUGE-2", "ROUGE-L"]
    result_columns = ["Instruction", "Response", "Model Response", *metric_columns, "SQL Valid", "SQL Error"]

    metrics_data = []  # one dict per successfully processed row
    for i, row in df.head(n_rows).iterrows():
        instruction = row['instruction']
        table_schema = row['input']  # the table description doubles as the schema
        true_response = row['response']

        try:
            # Ask the model for SQL
            model_response = generate_sql_code(instruction, table_schema, api_key)

            # Check that the generated SQL executes
            test_result = test_sql_query(table_schema, model_response)
            sql_valid = test_result["success"]
            sql_error = test_result.get("error", "")

            # Text-similarity metrics against the reference answer
            bleu, meteor, rouge1, rouge2, rougeL = calculate_metrics(true_response, model_response)

            metrics_data.append({
                "Instruction": instruction,
                "Response": true_response,
                "Model Response": model_response,
                "BLEU": bleu,
                "METEOR": meteor,
                "ROUGE-1": rouge1,
                "ROUGE-2": rouge2,
                "ROUGE-L": rougeL,
                "SQL Valid": sql_valid,
                "SQL Error": sql_error
            })
        except Exception as e:
            print(f"Error processing row {i}: {e}")

    # Explicit columns keep the frame well-formed even when nothing was scored.
    metrics_df = pd.DataFrame(metrics_data, columns=result_columns)

    if metrics_df.empty:
        averages = {name: float("nan") for name in metric_columns}
        correct_responses_percentage = float("nan")
    else:
        averages = {name: metrics_df[name].mean() for name in metric_columns}
        correct_responses_percentage = (metrics_df['SQL Valid'].sum() / len(metrics_df)) * 100

    # Summary row for the current model
    summary_data = {"Model": model_name}
    for name in metric_columns:
        summary_data[f"Average {name}"] = averages[name]
    summary_data["Correct Responses (%)"] = correct_responses_percentage
    new_row = pd.DataFrame([summary_data])

    # A totally empty frame is treated like None so pandas is not asked to
    # concatenate an empty entry.
    if summary_df is None or summary_df.shape == (0, 0):
        summary_df = new_row
    else:
        summary_df = pd.concat([summary_df, new_row], ignore_index=True)

    return metrics_df, summary_df

# Example usage
OPENROUTER_API_KEY = userdata.get('OPEN_Router')

# Suppose a summary_df had been created earlier (here it starts out empty)
previous_summary_df = pd.DataFrame()

# Evaluate a new model and append its row to the summary
metrics_df, summary_df = evaluate_model(df, OPENROUTER_API_KEY, model_name="llama-3.2-1b-instruct", summary_df=previous_summary_df)

# Results
summary_df



Unnamed: 0,Model,Average BLEU,Average METEOR,Average ROUGE-1,Average ROUGE-2,Average ROUGE-L,Correct Responses (%)
0,llama-3.2-1b-instruct,0.347761,0.647689,0.780819,0.640046,0.751439,0.0


In [None]:
import pandas as pd
import requests
import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

# Функция для вызова модели через API
def generate_sql_code(instruction, table_schema, api_key, model, timeout=60):
    """Ask an OpenRouter chat model to write SQL for a single instruction.

    Args:
        instruction: Natural-language request to turn into SQL.
        table_schema: Schema text interpolated into the prompt.
        api_key: OpenRouter API key, sent as a Bearer token.
        model: OpenRouter model identifier placed in the request payload.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole evaluation loop.

    Returns:
        The ``content`` string of the first returned choice, or the literal
        ``'No content found'`` when the response carries no usable choice.

    Raises:
        RuntimeError: If the API answers with a non-200 status code.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    prompt = f"""
    You are an expert SQL query generator. Answer only with valid SQL code, nothing else.
    Instruction: {instruction}
    Table Schema: {table_schema}
    SQL code:
    """
    input_data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}]
    }
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        data=json.dumps(input_data),
        timeout=timeout,
    )
    if response.status_code != 200:
        raise RuntimeError(f"API request failed with status {response.status_code}: {response.text}")

    data = response.json()
    # `choices` may be missing, null or an empty list; fall back to an empty
    # choice so that indexing cannot raise IndexError.
    choices = data.get('choices') or [{}]
    message = choices[0].get('message') or {}
    # NOTE(review): the fallback string is scored like a real answer by callers.
    return message.get('content', 'No content found')

# Функция для вычисления метрик
def calculate_metrics(true_answer, model_response):
    """Compare a generated text with its reference using BLEU, METEOR and ROUGE.

    Args:
        true_answer: Reference text.
        model_response: Generated text to score.

    Returns:
        Tuple ``(bleu, meteor, rouge1_f, rouge2_f, rougeL_f)`` where the ROUGE
        entries are the F-measures reported by the scorer.
    """
    rouge_kinds = ['rouge1', 'rouge2', 'rougeL']
    rouge = rouge_scorer.RougeScorer(rouge_kinds, use_stemmer=True).score(true_answer, model_response)

    # BLEU and METEOR both work on whitespace-separated tokens.
    reference_tokens = true_answer.split()
    candidate_tokens = model_response.split()

    bleu = sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method1,
    )
    meteor = meteor_score([reference_tokens], candidate_tokens)

    rouge_f = [rouge[kind].fmeasure for kind in rouge_kinds]
    return (bleu, meteor, *rouge_f)

import sqlite3

def test_sql_query(table_schema, sql_query):
    try:
        # Создаем временную базу данных SQLite в памяти
        conn = sqlite3.connect(":memory:")
        cursor = conn.cursor()

        # Выполняем создание таблицы из схемы
        cursor.executescript(table_schema)

        # Выполняем сгенерированный SQL-запрос
        cursor.execute(sql_query)
        result = cursor.fetchall()

        # Закрываем соединение
        conn.close()

        return {"success": True, "result": result}
    except Exception as e:
        return {"success": False, "error": str(e)}

# Основной код
def evaluate_model(df, api_key, model="meta-llama/llama-3.2-1b-instruct", summary_df=None, n_rows=40):
    """Generate SQL with ``model`` for the leading rows of ``df`` and summarise the metrics.

    For every evaluated row the response comes from ``generate_sql_code``, is
    executed against the row's schema by ``test_sql_query`` and is compared
    with the reference answer by ``calculate_metrics``. A row whose processing
    raises is reported with ``print`` and left out of the results.

    Args:
        df: DataFrame with ``instruction``, ``input`` (table schema) and
            ``response`` (reference SQL) columns.
        api_key: API key forwarded to ``generate_sql_code``.
        model: Model identifier forwarded to ``generate_sql_code`` and stored
            in the ``Model`` column of the summary.
        summary_df: Optional summary to append to. ``None`` or a completely
            empty DataFrame starts a new summary.
        n_rows: Number of leading rows of ``df`` to evaluate.

    Returns:
        ``(metrics_df, summary_df)``: per-example metrics, and the summary with
        one appended row of averages. ``Correct Responses (%)`` is the share
        of rows whose query executed without error. When no row could be
        scored the averages are NaN instead of raising ``KeyError``.
    """
    metric_columns = ["BLEU", "METEOR", "ROUGE-1", "ROUGE-2", "ROUGE-L"]
    result_columns = ["Instruction", "Response", "Model Response", *metric_columns, "SQL Valid", "SQL Error"]

    metrics_data = []  # one dict per successfully processed row
    for i, row in df.head(n_rows).iterrows():
        instruction = row['instruction']
        table_schema = row['input']  # the table description doubles as the schema
        true_response = row['response']

        try:
            # Ask the model for SQL
            model_response = generate_sql_code(instruction, table_schema, api_key, model)

            # Check that the generated SQL executes
            test_result = test_sql_query(table_schema, model_response)
            sql_valid = test_result["success"]
            sql_error = test_result.get("error", "")

            # Text-similarity metrics against the reference answer
            bleu, meteor, rouge1, rouge2, rougeL = calculate_metrics(true_response, model_response)

            metrics_data.append({
                "Instruction": instruction,
                "Response": true_response,
                "Model Response": model_response,
                "BLEU": bleu,
                "METEOR": meteor,
                "ROUGE-1": rouge1,
                "ROUGE-2": rouge2,
                "ROUGE-L": rougeL,
                "SQL Valid": sql_valid,
                "SQL Error": sql_error
            })
        except Exception as e:
            print(f"Error processing row {i}: {e}")

    # Explicit columns keep the frame well-formed even when nothing was scored.
    metrics_df = pd.DataFrame(metrics_data, columns=result_columns)

    if metrics_df.empty:
        averages = {name: float("nan") for name in metric_columns}
        correct_responses_percentage = float("nan")
    else:
        averages = {name: metrics_df[name].mean() for name in metric_columns}
        correct_responses_percentage = (metrics_df['SQL Valid'].sum() / len(metrics_df)) * 100

    # Summary row for the current model
    summary_data = {"Model": model}
    for name in metric_columns:
        summary_data[f"Average {name}"] = averages[name]
    summary_data["Correct Responses (%)"] = correct_responses_percentage
    new_row = pd.DataFrame([summary_data])

    # A totally empty frame is treated like None so pandas is not asked to
    # concatenate an empty entry.
    if summary_df is None or summary_df.shape == (0, 0):
        summary_df = new_row
    else:
        summary_df = pd.concat([summary_df, new_row], ignore_index=True)

    return metrics_df, summary_df

# Example usage
OPENROUTER_API_KEY = userdata.get('OPEN_Router')

# Model to evaluate (this is the only place it needs to change)
model_path = "deepseek/deepseek-chat"

# Evaluate the model and get summary_df
metrics_df, summary_df = evaluate_model(df, OPENROUTER_API_KEY, model=model_path)

# Results
summary_df


Unnamed: 0,Model,Average BLEU,Average METEOR,Average ROUGE-1,Average ROUGE-2,Average ROUGE-L,Correct Responses (%)
0,deepseek/deepseek-chat,0.357821,0.649536,0.785041,0.645557,0.754927,0.0


In [5]:
import pandas as pd
import requests
import json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.translate.meteor_score import meteor_score
from rouge_score import rouge_scorer

# Функция для вызова модели через API
def generate_sql_code(instruction, table_schema, api_key, model, timeout=60):
    """Ask an OpenRouter chat model to write SQL for a single instruction.

    Args:
        instruction: Natural-language request to turn into SQL.
        table_schema: Schema text interpolated into the prompt.
        api_key: OpenRouter API key, sent as a Bearer token.
        model: OpenRouter model identifier placed in the request payload.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the whole evaluation loop.

    Returns:
        The ``content`` string of the first returned choice, or the literal
        ``'No content found'`` when the response carries no usable choice.

    Raises:
        RuntimeError: If the API answers with a non-200 status code.
        requests.exceptions.RequestException: On network failure or timeout.
    """
    prompt = f"""
    You are an expert SQL query generator. Answer only with valid SQL code, nothing else.
    Instruction: {instruction}
    Table Schema: {table_schema}
    SQL code:
    """
    input_data = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}]
    }
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        },
        data=json.dumps(input_data),
        timeout=timeout,
    )
    if response.status_code != 200:
        raise RuntimeError(f"API request failed with status {response.status_code}: {response.text}")

    data = response.json()
    # `choices` may be missing, null or an empty list; fall back to an empty
    # choice so that indexing cannot raise IndexError.
    choices = data.get('choices') or [{}]
    message = choices[0].get('message') or {}
    # NOTE(review): the fallback string is scored like a real answer by callers.
    return message.get('content', 'No content found')

# Функция для вычисления метрик
def calculate_metrics(true_answer, model_response):
    """Measure how close a generated text is to its reference (BLEU, METEOR, ROUGE).

    Args:
        true_answer: Reference text.
        model_response: Generated text to score.

    Returns:
        Tuple ``(bleu, meteor, rouge1_f, rouge2_f, rougeL_f)`` where the ROUGE
        entries are the F-measures reported by the scorer.
    """
    rouge_kinds = ['rouge1', 'rouge2', 'rougeL']
    rouge = rouge_scorer.RougeScorer(rouge_kinds, use_stemmer=True).score(true_answer, model_response)

    # BLEU and METEOR both work on whitespace-separated tokens.
    reference_tokens = true_answer.split()
    candidate_tokens = model_response.split()

    bleu = sentence_bleu(
        [reference_tokens],
        candidate_tokens,
        smoothing_function=SmoothingFunction().method1,
    )
    meteor = meteor_score([reference_tokens], candidate_tokens)

    rouge_f = [rouge[kind].fmeasure for kind in rouge_kinds]
    return (bleu, meteor, *rouge_f)

import sqlite3

def test_sql_query(table_schema, sql_query):
    try:
        # Создаем временную базу данных SQLite в памяти
        conn = sqlite3.connect(":memory:")
        cursor = conn.cursor()

        # Выполняем создание таблицы из схемы
        cursor.executescript(table_schema)

        # Выполняем сгенерированный SQL-запрос
        cursor.execute(sql_query)
        result = cursor.fetchall()

        # Закрываем соединение
        conn.close()

        return {"success": True, "result": result}
    except Exception as e:
        return {"success": False, "error": str(e)}

# Основной код
def evaluate_model(df, api_key, model_name="SQL Query Generator", summary_df=None, n_rows=30):
    """Generate SQL with ``model_name`` for the leading rows of ``df`` and summarise the metrics.

    For every evaluated row the response comes from ``generate_sql_code``, is
    executed against the row's schema by ``test_sql_query`` and is compared
    with the reference answer by ``calculate_metrics``. A row whose processing
    raises is reported with ``print`` and left out of the results.

    Args:
        df: DataFrame with ``instruction``, ``input`` (table schema) and
            ``response`` (reference SQL) columns.
        api_key: API key forwarded to ``generate_sql_code``.
        model_name: Forwarded to ``generate_sql_code`` as the model to query
            and stored in the ``Model`` column of the summary.
            NOTE(review): the default value does not look like a model
            identifier, so callers should pass one explicitly — confirm.
        summary_df: Optional summary to append to. ``None`` or a completely
            empty DataFrame starts a new summary.
        n_rows: Number of leading rows of ``df`` to evaluate.

    Returns:
        ``(metrics_df, summary_df)``: per-example metrics, and the summary with
        one appended row of averages. ``Correct Responses (%)`` is the share
        of rows whose query executed without error. When no row could be
        scored the averages are NaN instead of raising ``KeyError``.
    """
    metric_columns = ["BLEU", "METEOR", "ROUGE-1", "ROUGE-2", "ROUGE-L"]
    result_columns = ["Instruction", "Response", "Model Response", *metric_columns, "SQL Valid", "SQL Error"]

    metrics_data = []  # one dict per successfully processed row
    for i, row in df.head(n_rows).iterrows():
        instruction = row['instruction']
        table_schema = row['input']  # the table description doubles as the schema
        true_response = row['response']

        try:
            # Ask the model for SQL
            model_response = generate_sql_code(instruction, table_schema, api_key, model_name)

            # Check that the generated SQL executes
            test_result = test_sql_query(table_schema, model_response)
            sql_valid = test_result["success"]
            sql_error = test_result.get("error", "")

            # Text-similarity metrics against the reference answer
            bleu, meteor, rouge1, rouge2, rougeL = calculate_metrics(true_response, model_response)

            metrics_data.append({
                "Instruction": instruction,
                "Response": true_response,
                "Model Response": model_response,
                "BLEU": bleu,
                "METEOR": meteor,
                "ROUGE-1": rouge1,
                "ROUGE-2": rouge2,
                "ROUGE-L": rougeL,
                "SQL Valid": sql_valid,
                "SQL Error": sql_error
            })
        except Exception as e:
            print(f"Error processing row {i}: {e}")

    # Explicit columns keep the frame well-formed even when nothing was scored.
    metrics_df = pd.DataFrame(metrics_data, columns=result_columns)

    if metrics_df.empty:
        averages = {name: float("nan") for name in metric_columns}
        correct_responses_percentage = float("nan")
    else:
        averages = {name: metrics_df[name].mean() for name in metric_columns}
        correct_responses_percentage = (metrics_df['SQL Valid'].sum() / len(metrics_df)) * 100

    # Summary row for the current model
    summary_data = {"Model": model_name}
    for name in metric_columns:
        summary_data[f"Average {name}"] = averages[name]
    summary_data["Correct Responses (%)"] = correct_responses_percentage
    new_row = pd.DataFrame([summary_data])

    # A totally empty frame is treated like None so pandas is not asked to
    # concatenate an empty entry.
    if summary_df is None or summary_df.shape == (0, 0):
        summary_df = new_row
    else:
        summary_df = pd.concat([summary_df, new_row], ignore_index=True)

    return metrics_df, summary_df

# Example usage
OPENROUTER_API_KEY = userdata.get('OPEN_Router')

# Models to evaluate
models = [
    "meta-llama/llama-3.2-1b-instruct",
    "deepseek/deepseek-chat",
    "qwen/qvq-72b-preview",
    "google/gemini-2.0-flash-thinking-exp:free",
    "sao10k/l3.3-euryale-70b"
]

# Start from an empty summary; evaluate_model appends one row per model
summary_df = pd.DataFrame()

# Per-example metrics of every model, keyed by model id. Without this only the
# last model's frame would survive the loop in `metrics_df`.
metrics_by_model = {}

# Evaluate every model and accumulate its summary row
for model in models:
    metrics_df, summary_df = evaluate_model(df, OPENROUTER_API_KEY, model_name=model, summary_df=summary_df)
    metrics_by_model[model] = metrics_df

# Results
summary_df


Unnamed: 0,Model,Average BLEU,Average METEOR,Average ROUGE-1,Average ROUGE-2,Average ROUGE-L,Correct Responses (%)
0,meta-llama/llama-3.2-1b-instruct,0.209649,0.479784,0.738945,0.559403,0.677429,46.666667
1,deepseek/deepseek-chat,0.400316,0.681937,0.809426,0.680252,0.77199,6.666667
2,qwen/qvq-72b-preview,0.005466,0.089474,0.036557,0.027601,0.035239,0.0
3,google/gemini-2.0-flash-thinking-exp:free,0.037817,0.065801,0.077112,0.066963,0.075524,0.0
4,sao10k/l3.3-euryale-70b,0.0,0.0,0.003704,0.0,0.003704,0.0


In [16]:
# Work on a copy: a plain assignment would alias `df`, and the helper column
# added below would then silently appear in the original dataset as well.
df_for_test = df.copy()

# Count the occurrences of 'CREATE TABLE' in each row's schema text
df_for_test['create_table_count'] = df_for_test['input'].str.count('CREATE TABLE')

# Largest number of tables found in a single row
max_tables_in_one_line = df_for_test['create_table_count'].max()

# Index labels of the rows that reach that maximum
max_indices = df_for_test[df_for_test['create_table_count'] == max_tables_in_one_line].index.tolist()

print(f"Максимальное количество таблиц в одной строке: {max_tables_in_one_line}")
print(f"Индексы строк с максимальным значением: {max_indices}")


Максимальное количество таблиц в одной строке: 29
Индексы строк с максимальным значением: [46, 52, 75, 96, 104, 201, 259, 282, 316, 346, 361, 420, 437, 459, 473, 490, 498, 519, 584, 613, 639, 693, 699, 705, 714, 737, 820, 875, 908, 919, 990, 1005, 1014, 1015, 1050, 1051, 1084, 1165, 1231, 1247, 1274, 1284, 1321, 1352, 1373, 1472, 1542, 1611, 1650, 1676, 1686, 1719, 1730, 1736, 1744, 1773, 1802, 1842, 1854, 1858, 1902, 1917, 1961, 1982, 2022, 2031, 2035, 2043, 2057, 2118, 2176, 2212, 2299, 2311, 2334, 2347, 2365, 2386, 2435, 2457, 2466, 2536, 2574, 2584, 2624, 2645, 2659, 2667, 2702, 2706, 2713, 2748, 2757, 2767, 2789, 2796, 2798, 2839, 2852, 2938, 2978, 3021, 3040, 3072, 3112, 3141, 3173, 3179, 3185, 3195, 3218, 3240, 3268, 3296, 3308, 3354, 3361, 3364, 3389, 3402, 3480, 3482, 3484, 3523, 3544, 3587, 3662, 3692, 3699, 3701, 3725, 3727, 3734, 3741, 3745, 3749, 3833, 3863, 3880, 3928, 3931, 3983, 3995, 4011, 4033, 4065, 4082, 4108, 4113, 4165, 4203, 4213, 4226, 4249, 4253, 4255, 4257, 42

In [20]:
# Number of rows that contain the maximum count of CREATE TABLE statements.
len(max_indices)

10050

In [10]:

# Optional: print the rows themselves as a sanity check
if max_indices:
    print("\nСтроки с максимальным количеством таблиц:")
    print(df_for_test.loc[max_indices, ['input', 'create_table_count']])


Строки с максимальным количеством таблиц:
                                                input  create_table_count
46  CREATE TABLE ReviewTaskResults (\n    Id numbe...                  29
52  CREATE TABLE PendingFlags (\n    Id number,\n ...                  29
75  CREATE TABLE SuggestedEditVotes (\n    Id numb...                  29
96  CREATE TABLE ReviewTaskResultTypes (\n    Id n...                  29


In [13]:
# Show the full schema text of row 46, one of the rows with the most tables.
df_for_test.loc[46, 'input']

'CREATE TABLE ReviewTaskResults (\n    Id number,\n    ReviewTaskId number,\n    ReviewTaskResultTypeId number,\n    CreationDate time,\n    RejectionReasonId number,\n    Comment text\n)\n\nCREATE TABLE CloseAsOffTopicReasonTypes (\n    Id number,\n    IsUniversal boolean,\n    InputTitle text,\n    MarkdownInputGuidance text,\n    MarkdownPostOwnerGuidance text,\n    MarkdownPrivilegedUserGuidance text,\n    MarkdownConcensusDescription text,\n    CreationDate time,\n    CreationModeratorId number,\n    ApprovalDate time,\n    ApprovalModeratorId number,\n    DeactivationDate time,\n    DeactivationModeratorId number\n)\n\nCREATE TABLE SuggestedEdits (\n    Id number,\n    PostId number,\n    CreationDate time,\n    ApprovalDate time,\n    RejectionDate time,\n    OwnerUserId number,\n    Comment text,\n    Text text,\n    Title text,\n    Tags text,\n    RevisionGUID other\n)\n\nCREATE TABLE FlagTypes (\n    Id number,\n    Name text,\n    Description text\n)\n\nCREATE TABLE PostFee