In [1]:
# Enable IPython's autoreload extension so that edits to the imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all modules before each cell execution.
%autoreload 2

# Imports

In [2]:
import sqlite3
import os

from src.data.data_loading import load_config
from src.data.database import create_finetuning_data_from_db
from src.finetuning import create_finetuning_data_sample, save_finetuning_data_as_json

# Set working directory

In [3]:
# Set working directory to the root of the project.
# The move is guarded so the cell is idempotent: an unconditional
# `os.chdir("..")` climbs one directory further on every re-run and breaks the
# relative paths used below. We only step up while the project config file is
# not visible from the current directory.
if not os.path.exists("./config/config.yaml"):
    os.chdir("..")
# Last expression: display the resulting working directory.
os.getcwd()

'C:\\Users\\merti\\PycharmProjects\\cadenza-playwright-llm'

# Load data + config

In [4]:
# Project configuration, read from the config file (path relative to the project root).
config = load_config("./config/config.yaml")
# Path of the SQLite database file (relative to the project root).
db_file = './data/raw/playwright_script.db'

# Data Preparation
For finetuning the [LLaVA model](https://github.com/haotian-liu/LLaVA), we need to prepare the data in a specific JSON format. The data should look as follows:

``` json
[
    {
        "id": "unique_id",
        "image": "image_file.jpg",
        "conversations": [
            {

                "from": "human",
                "value": "What is shown in the image?"

            },
            {
                "from": "gpt",
                "value": "formatted_answers"
            }
        ]
    },
    {...}
]

```
Each finetuning data sample consists of:
* id: a unique identifier for the sample
* image: the image file name
* conversations: a list of questions and answers. Each conversation consists of:
    * from: the speaker of the conversation (either "human" or "gpt")
    * value: the text of the conversation (either the question asked by the human or the answer expected to be generated by the model)

In the end we will have a JSON file as finetuning input which consists of a list of finetuning data samples, each being a dictionary with the keys "id", "image" and "conversations".

The functionality to create a sample following this structure is implemented in the `create_finetuning_data_sample` function. This function takes the paths of the context, creates the input prompt following our selected template, and combines it with the expected output. The function returns a dictionary matching the required format described above.
For automatic generation of the whole finetuning data we use the `create_finetuning_data_from_db` function which takes a list of ids and the path to the database and returns a whole list of finetuning data samples. It also makes use of the `create_finetuning_data_sample` function to create each sample.



In [5]:
# Load ids to be used for finetuning from database.
# The connection is closed in a `finally` clause so that a failing query
# (e.g. a missing `tests` table) does not leave the database handle open.
conn = sqlite3.connect(db_file)
try:
    c = conn.cursor()
    c.execute('SELECT id FROM tests')
    items = c.fetchall()
finally:
    conn.close()

# Each fetched row is a tuple holding the single selected column; take element 0.
# Skip the first id since it is not possible to get the previous id.
# NOTE(review): the query has no ORDER BY, so "first" is whatever row order
# SQLite returns -- confirm this matches the intended test order.
ids = [row[0] for row in items][1:]

In [6]:
# Build the finetuning samples for the selected ids from the database, using
# the loaded config. The helper logs its progress for every sample.
finetuning_data = create_finetuning_data_from_db(ids, db_file, config)

2024-07-15 14:39:14 [[34msrc.input_builder:30[0m] [[32mINFO[0m] >>>> Loading context...[0m
2024-07-15 14:39:14 [[34msrc.input_builder:46[0m] [[32mINFO[0m] >>>> Context loaded successfully.[0m
2024-07-15 14:39:14 [[34msrc.input_builder:48[0m] [[32mINFO[0m] >>>> Creating input prompt...[0m
2024-07-15 14:39:14 [[34msrc.input_builder:79[0m] [[32mINFO[0m] >>>> Input prompt created successfully.[0m
2024-07-15 14:39:14 [[34msrc.finetuning:33[0m] [[32mINFO[0m] >>>> Combining input and expected output into json finetuning data format...[0m
2024-07-15 14:39:14 [[34msrc.finetuning:50[0m] [[32mINFO[0m] >>>> Finetuning conversation created successfully.[0m
2024-07-15 14:39:14 [[34msrc.input_builder:30[0m] [[32mINFO[0m] >>>> Loading context...[0m
2024-07-15 14:39:15 [[34msrc.input_builder:46[0m] [[32mINFO[0m] >>>> Context loaded successfully.[0m
2024-07-15 14:39:15 [[34msrc.input_builder:48[0m] [[32mINFO[0m] >>>> Creating input prompt...[0m
2024-07-15 14:

In [7]:
# Sanity check: display the first generated sample to verify it has the
# expected "id" / "image" / "conversations" structure.
finetuning_data[0]

{'id': '01_01',
 'image': '.\\data\\raw\\.\\screenshot\\1_1.png',
 'conversations': [{'from': 'human',
   'value': '### Simplified HTML Content:\nButtons: \n{"id": "navigationTrigger", "class": "button button-icon button-borderless"}\n{"id": "workbook-create", "class": "button workbook-create button-icon"}\nInputs: \n{"class": "select2-search__field", "aria-label": "Suchen nach …", "type": "search", "placeholder": "Suchen nach …"}\nLinks: \n{"text": "Zum Navigatorbaum springen", "id": "skip-to-navigator", "class": "button button-primary"}\n{"text": "Zum Hauptbereich springen", "id": "skip-to-content", "class": "button button-primary"}\n{"text": "Startseite", "id": "home", "class": "button button-icon button-borderless"}\n{"text": "Karte", "class": "button button-icon button-borderless d-topnav--map-button"}\n{"text": "Verzeichnis Tutorial", "id": "d-nav-tree-node_ROOT-Tutorial_firstContent", "class": "d-nav-tree-node--main d-hover-context"}\n{"text": "Verzeichnis Gewässergüte", "id": "

In [8]:
# Persist the generated samples as a JSON file; the output path is chosen by
# the helper and reported in its log message.
save_finetuning_data_as_json(finetuning_data)

2024-07-15 14:39:27 [[34msrc.finetuning:69[0m] [[32mINFO[0m] >>>> Finetuning data saved as JSON file: ./data/finetuning/s77_finetuning_data_20240715-143927.json[0m
