# Project PFE : Dataset parsing with LLM
Compare LLM-based standardization against keyword and embedding baselines.

## 1. Imports & Setup

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os

# Local modules
from baselines import baseline_keyword_match, baseline_embedding_match
from standardize import load_standardized_dataset

from datasets import load_dataset as hf_load_dataset  
from unitxt import load_dataset as unitxt_load_dataset
from unitxt import get_from_catalog
from unitxt import get_from_catalog

import logging
logging.getLogger("unitxt").setLevel(logging.ERROR)
print("Imports done successfully")



Imports done successfully


In [2]:
def get_valid_experiments():
    """
    Parcourt le catalogue Unitxt et retourne une liste de dictionnaires
    pour les tâches de Classification, NLI, SFT et QA.
    """
    from unitxt.catalog import get_local_catalogs_paths
    print("Construction de la liste des expériences...")
    
    catalog_paths = get_local_catalogs_paths()
    all_card_names = []
    
    target_keywords = [
        'classification', 'nli', 'glue', 'xnli',
        'sft', 'instruction',
        'qa', 'question', 'answer', 'squad'
    ]
    
    for path in catalog_paths:
        cards_dir = os.path.join(path, 'cards')
        if os.path.exists(cards_dir):
            for root, _, files in os.walk(cards_dir):
                for file in files:
                    if file.endswith('.json'):
                        rel_path = os.path.relpath(os.path.join(root, file), cards_dir)
                        card_id = rel_path.replace(os.sep, '.').replace('.json', '')
                        if any(kw in card_id.lower() for kw in target_keywords):
                            all_card_names.append(card_id)
    
    valid_experiments = []
    print(f"Inspection de {len(all_card_names)} cartes potentielles...")
    
    for card_id in all_card_names:
        try:
            if "test" in card_id.lower() or "mock" in card_id.lower():
                continue

            card = get_from_catalog(f"cards.{card_id}")
            
            if hasattr(card, 'loader') and hasattr(card.loader, 'path'):
                hf_name = card.loader.path
                hf_config = getattr(card.loader, 'config_name', None)
                
                # FIX: For GLUE-like datasets, use card_id as config fallback
                if hf_config is None and 'glue' in hf_name.lower():
                    hf_config = card_id.split('.')[-1]  # e.g., 'qnli' from 'glue.qnli'
                
                if not hf_name:
                    continue

                valid_experiments.append({
                    "card_id": card_id,
                    "hf_name": hf_name,
                    "hf_config": hf_config
                })
        except Exception:
            continue
            
    print(f"✅ Trouvé {len(valid_experiments)} datasets valides.")
    return valid_experiments

# --- TEST ---
experiments_list = get_valid_experiments()
print("Exemple des 5 premiers :")
for exp in experiments_list[:5]:
    print(exp)

Construction de la liste des expériences...
Inspection de 117 cartes potentielles...
✅ Trouvé 99 datasets valides.
Exemple des 5 premiers :
{'card_id': 'pop_qa_robust', 'hf_name': 'akariasai/PopQA', 'hf_config': None}
{'card_id': 'qnli', 'hf_name': 'nyu-mll/glue', 'hf_config': 'qnli'}
{'card_id': 'mnli', 'hf_name': 'nyu-mll/glue', 'hf_config': 'mnli'}
{'card_id': 'openbook_qa', 'hf_name': 'openbookqa', 'hf_config': None}
{'card_id': 'wnli', 'hf_name': 'nyu-mll/glue', 'hf_config': 'wnli'}


## 2. Baseline models

In [3]:
test_dataset = experiments_list[1]
print(test_dataset)

{'card_id': 'qnli', 'hf_name': 'nyu-mll/glue', 'hf_config': 'qnli'}


In [4]:
def display_experiment_results(dataset_name, card_id, config, methods):
    print(f"EXPERIMENT: {card_id}")
    print(f"Source Data: {dataset_name} (Config: {config})")
    
    if config:
        ds_raw = hf_load_dataset(dataset_name, config, split="train", streaming=True)
    else:
        ds_raw = hf_load_dataset(dataset_name, split="train", streaming=True)
    
    raw_samples = list(ds_raw.take(5))
    df_raw = pd.DataFrame(raw_samples)
    print(f"Raw Data Loaded ({len(df_raw)} rows)")

    for method_name, method_func in methods.items():
        print(f"\n{'='*10} Method: {method_name} {'='*10}")
        result = method_func(dataset_name, config=config)
        mapping = result.get("mapping", {})
        print(f"Detected Mapping: {mapping}")
        
        rename_dict = {v: k for k, v in mapping.items() if k != 'task' and isinstance(v, str)}
        df_pred = df_raw.copy()
        valid_renames = {k: v for k, v in rename_dict.items() if k in df_pred.columns}
        df_pred = df_pred.rename(columns=valid_renames)
        
        target_cols = [k for k in mapping.keys() if k != 'task']
        cols_to_show = [c for c in target_cols if c in df_pred.columns]
        
        if cols_to_show:
            display(df_pred[cols_to_show])
        else:
            print("No target columns found.")
            display(df_pred.head(5))
        

# TEST on 1 sample dataset
methods_to_test = {
    "Keyword Baseline": baseline_keyword_match,
    "Embedding Baseline": baseline_embedding_match,
}


display_experiment_results( test_dataset["hf_name"], test_dataset["card_id"], test_dataset["hf_config"], methods_to_test)

EXPERIMENT: qnli
Source Data: nyu-mll/glue (Config: qnli)
Raw Data Loaded (5 rows)

Detected Mapping: {'text': 'sentence', 'label': 'label', 'question': 'question', 'task': 'qa'}


Unnamed: 0,text,label,question
0,Unlike the two seasons before it and most of t...,1,When did the third Digimon series begin?
1,"When MANPADS is operated by specialists, batte...",1,Which missile batteries often have individual ...
2,He bases this interpretation on the fact that ...,0,What two things does Popper argue Tarski's the...
3,"On 31 December 1853, the Ottoman forces at Cal...",0,What is the name of the village 9 miles north ...
4,London contains four World Heritage Sites: the...,1,What famous palace is located in London?



Detected Mapping: {'label': 'label', 'question': 'question', 'answer': 'question', 'task': 'qa'}


Unnamed: 0,label,answer
0,1,When did the third Digimon series begin?
1,1,Which missile batteries often have individual ...
2,0,What two things does Popper argue Tarski's the...
3,0,What is the name of the village 9 miles north ...
4,1,What famous palace is located in London?


In [5]:
# TO DO : experiment the evaluation function on thoses methods

## 3. LLM-based model
