In [1]:
!pip install -q transformers==4.41.2 datasets==2.19.1 torch==2.3.0 accelerate==0.30.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m67.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m779.2/779.2 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m151.7 kB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.6/410.6 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m

In [2]:
import shutil

src = "/kaggle/input/test-set/test_set_without_xt_2"
dst = "/kaggle/working/test_set_without_xt_2"

shutil.copytree(src, dst, dirs_exist_ok=True)
print(f"Copiato dataset da {src} a {dst}")


Copiato dataset da /kaggle/input/test-set/test_set_without_xt_2 a /kaggle/working/test_set_without_xt_2


In [3]:
#!/usr/bin/env python
import os
import torch
from datasets import load_from_disk
from transformers import AutoTokenizer, AutoModelForCausalLM

# Modello in FP16 (senza quantizzazione)
MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"

torch.backends.cuda.matmul.allow_tf32 = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Carica modello/tokenizer (FP16) e manda su GPU
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True
).to(device).eval()

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

@torch.inference_mode()
def generate_xt_text(prompt: str) -> str:
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=256
    ).to(device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=128,  # tienilo più basso per evitare OOM
        do_sample=True,
        top_k=50,
        temperature=0.8,
        eos_token_id=tokenizer.eos_token_id
    )
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return text.replace(prompt, "").strip()

def build_prompt(sample: dict) -> str:
    objects_description = sample['objects_description']
    color_description = sample['color_description']
    brightness_description = sample['brightness_description']
    Sharpness_analysis = sample['Sharpness_analysis']
    light_analysis = sample['light_analysis']
    depth_analysis = sample['depth_analysis']
    incongruence_analysis = sample['incongruence_analysis']
    texture_analysis = sample['texture_analysis']
    return f"""You are a visual analysis expert tasked with examining an image based on both descriptive and technical cues.

            Below is the information available:
            
            Descriptive details (natural language):
            - Objects: {objects_description}
            - Colors: {color_description}
            - Brightness: {brightness_description}
            
            Technical visual indicators (numerical/structured):
            - Sharpness: {Sharpness_analysis}
            - Texture metrics: {texture_analysis}
            - Light direction & consistency: {light_analysis}
            - Depth discontinuities: {depth_analysis}
            - Semantic coherence: {incongruence_analysis}
            
            Write a fluent and insightful paragraph that combines the above elements into a coherent analysis. You must:
            
            - Start by describing the scene naturally, using the objects, colors, and brightness.
            - Seamlessly integrate the technical indicators, without naming them explicitly.
            - Highlight any potential visual contradictions (e.g., overly sharp edges, mismatched lighting, flat depth in complex scenes, or semantically illogical object placement).
            - Be concise but precise: avoid generic or vague statements.
            - Do **not** list values or scores directly, but **reflect their effects** in your analysis.
            - End with a contextual reflection that hints at whether the scene may contain digital alterations — **without making a definitive judgment**.
            
            Only write the paragraph. Do not preface or follow it with explanations or extra commentary.

            Begin your output here:"""

def main():
    ds = load_from_disk("/kaggle/working/test_set_without_xt_2")

    def add_xt(example):
        xt = generate_xt_text(build_prompt(example))
        return {"x_t": xt}

    ds = ds.map(add_xt, batched=False)
    ds.save_to_disk("/kaggle/working/test_set_with_xt")
    print("Salvato in /kaggle/working/test_set_with_xt")

if __name__ == "__main__":
    print("CUDA available:", torch.cuda.is_available())
    if torch.cuda.is_available():
        print("GPU name:", torch.cuda.get_device_name(0))
        print("Total VRAM (GB):", round(torch.cuda.get_device_properties(0).total_memory / 1024**3, 2))
    main()


config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

CUDA available: True
GPU name: Tesla T4
Total VRAM (GB): 14.74




Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

Saving the dataset (0/1 shards):   0%|          | 0/1001 [00:00<?, ? examples/s]

Salvato in /kaggle/working/test_set_with_xt


In [5]:
!zip -r /kaggle/working/test_set_with_xt.zip /kaggle/working/test_set_with_xt


  adding: kaggle/working/test_set_with_xt/ (stored 0%)
  adding: kaggle/working/test_set_with_xt/dataset_info.json (deflated 83%)
  adding: kaggle/working/test_set_with_xt/state.json (deflated 53%)
  adding: kaggle/working/test_set_with_xt/data-00000-of-00001.arrow

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 83%)


In [6]:
ds_loaded = load_from_disk("/kaggle/working/test_set_with_xt")

In [7]:
print(ds_loaded)

Dataset({
    features: ['img_id', 'image', 'image_k', 'image_ssh', 'label', 'x_t', 'Sharpness_analysis', 'texture_analysis', 'light_analysis', 'depth_analysis', 'incongruence_analysis', 'color_description', 'objects_description', 'brightness_description'],
    num_rows: 1001
})


In [10]:
from datasets import Dataset
def clean_x_t_column(ds: Dataset) -> Dataset:
    """
    Prende un Dataset HF con una colonna 'x_t' contenente
    l'intera risposta (prompt + paragrafo generato) e la sostituisce
    con solo il testo che compare dopo 'Begin your output here:'.
    """
    def extract_output(example):
        full = example.get("x_t", "")
        marker = "Begin your output here:"
        idx = full.find(marker)
        if idx == -1:
            # se il marker non c'è, restituisci tutto senza spazi estremi
            cleaned = full.strip()
        else:
            # altrimenti prendi tutto dopo il marker, togli spazi iniziali
            cleaned = full[idx + len(marker):].lstrip()
        return {"x_t": cleaned}

    # Applica la trasformazione su ogni esempio
    return ds.map(extract_output, batched=False)

In [11]:
ds_loaded = clean_x_t_column(ds_loaded)

Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

In [15]:
print(ds_loaded[1000]['x_t'])

The given scene is an outdoor area, with a predominantly green landscape and a few natural objects scattered around. The colors appear natural and vivid, with moderate contrast that allows for realistic object segmentation. The lighting is balanced, with no significant discontinuities, except for a slight discrepancy in the left corner, which should not affect the scene's overall coherence. While the depth perception is satisfactory in simple scenes, such as the one with the bicycle, it flattens out in more complex environments, such as the background with the buildings. The object placement is mostly logical, although the


In [16]:
ds_loaded.save_to_disk("/kaggle/working/new_dataset")

Saving the dataset (0/1 shards):   0%|          | 0/1001 [00:00<?, ? examples/s]

In [17]:
!zip -r /kaggle/working/new_dataset.zip /kaggle/working/new_dataset


  adding: kaggle/working/new_dataset/ (stored 0%)
  adding: kaggle/working/new_dataset/dataset_info.json (deflated 83%)
  adding: kaggle/working/new_dataset/state.json (deflated 53%)
  adding: kaggle/working/new_dataset/data-00000-of-00001.arrow (deflated 75%)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
