In [8]:
import os
import json
from docx import Document

docx_folder = "data"
output_json_file = "dataset.json"

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    content = "\n".join([para.text for para in doc.paragraphs if para.text.strip()])

    # Separating story and storyboard content (you can adjust this logic based on the document's structure)
    story_start = content.find("ORİJİNAL FIKRA")
    story_end = content.find("Karakter Özellikleri")
    storyboard_start = content.find("Sahne")

    # Extracting the story and storyboard parts
    story = content[story_start:story_end].strip()  # Original Fable
    storyboard = content[storyboard_start:].strip()  # Storyboard Content

    return story, storyboard

def load_all_documents(folder_path):
    dataset = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".docx"):  # Process only .docx files
            file_path = os.path.join(folder_path, filename)
            story, storyboard = extract_text_from_docx(file_path)

            # Create a structured entry with both story and storyboard content
            entry = {
                "id": filename.replace(".docx", ""),  # Use filename as ID
                "conversations": [
                    {"from": "user", "value": story},
                    {"from": "assistant", "value": storyboard}
                ]
            }
            dataset.append(entry)

    return dataset

In [9]:
# Load all documents and convert to structured format
dataset = load_all_documents(docx_folder)

# Save dataset to JSON
with open(output_json_file, "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

print(f"Dataset saved to {output_json_file}, with {len(dataset)} documents.")

# Save dataset as JSONL (newline-delimited JSON)
with open("dataset.jsonl", "w") as f:
    for entry in dataset:
        f.write(json.dumps(entry) + "\n")

print("Dataset saved to dataset.jsonl")

Dataset saved to dataset.json, with 3 documents.
Dataset saved to dataset.jsonl


In [19]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.cuda.empty_cache()  # This frees up unused memory
model_name = "Qwen/Qwen2.5-7B-Instruct"
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, local_files_only=True)

# Load model
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, local_files_only=True, torch_dtype="auto")


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [20]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="dataset.json")
from peft import get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"]
)

model = get_peft_model(model, peft_config)

Generating train split: 0 examples [00:00, ? examples/s]

In [21]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./qwen_storyboard_model",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=3,
    learning_rate=2e-5,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    eval_strategy="epoch",
    eval_steps=500,
    fp16=True,
    gradient_accumulation_steps=8,
)

print("TrainingArguments successfully initialized!")

TrainingArguments successfully initialized!


In [22]:
from datasets import load_dataset

dataset = load_dataset("json", data_files="dataset.json")

dataset = dataset["train"].train_test_split(test_size=0.2)

print(dataset)  # Should now show both 'train' and 'test'
from transformers import Trainer, AutoTokenizer
from datasets import Dataset, DatasetDict

if len(dataset['train']) > 1:  # Ensure there's more than one sample to split

    def tokenize_function(examples):
        batch_conversations = [
            " ".join([msg["value"] for msg in conv]) for conv in examples["conversations"]
        ]
        return tokenizer(batch_conversations, padding="max_length", truncation=True)


    dataset = dataset['train'].train_test_split(test_size=0.2)

    train_dataset = dataset['train'].map(tokenize_function, batched=True)
    eval_dataset = dataset['test'].map(tokenize_function, batched=True)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

else:
    print("Not enough samples to train. Skipping the training process.")


DatasetDict({
    train: Dataset({
        features: ['id', 'conversations'],
        num_rows: 2
    })
    test: Dataset({
        features: ['id', 'conversations'],
        num_rows: 1
    })
})


Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [23]:
model.save_pretrained("./qwen_storyboard_model")
tokenizer.save_pretrained("./qwen_storyboard_model")
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "./qwen_storyboard_model"

model = AutoModelForCausalLM.from_pretrained(model_name)  # Remove local_files_only=True if needed
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
input_text = "Create an storyboard in Turkish with given text after this and do not use images, instead explain these images as much as you can, tell what every character does, do not change original story just explain things happening scene by scene.Raw Story:\n Nasreddin Hoca bir gün yolda giderken bir adamla karşılaşmış. Adamla sohbet etmeye başlamışlar. Bir saat havadan sudan konuştuktan sonra Hoca:– Kusura bakma arkadaş. Ben seni tanıyamadım, adın neydi?, diye sormuş.Adamcağız çok şaşırmış:– Madem beni tanımadın, neden benimle bir saattir sohbet ediyorsun?, demiş.Nasreddin Hoca:– Kıyafetlerin benimkine çok benziyordu. Ben de seni ben sandım, demiş. StoryBoard:\n"
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=4096, do_sample=True)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(generated_text)