In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the later cells
# read the raw scripts from, and write the datasets to, paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re
import json
import os



# Folder (on the mounted Google Drive) that holds the raw screenplay .txt files.
# The structured JSON produced at the end of this cell is written to the same folder.
script_folder_path = '/content/drive/My Drive/movie_script_project/raw_txt/'

# File names of the scripts to parse, relative to script_folder_path.
# NOTE: 'reservior_dogs.txt' is misspelled, but it has to match the name of the file
# on disk (the recorded run output shows it loading under this spelling), so do not
# "correct" it here without renaming the file as well.
script_filenames = [
    'pulp_fiction.txt',
    'reservior_dogs.txt',
    'jb.txt',
    'ing_bastards.txt',
    'nbk.txt'
]

# Full paths of every script, in the same order as script_filenames.
script_full_paths = [os.path.join(script_folder_path, fname) for fname in script_filenames]


def load_script_text(filepath: str) -> "str | None":
    """Load a screenplay from a UTF-8 encoded .txt file.

    Args:
        filepath: Path of the text file to read.

    Returns:
        The full file contents as a string, or ``None`` when the file does not
        exist (a "File not found" message is printed in that case).

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            script_text = file.read()
    except FileNotFoundError:
        print(f"File not found at {filepath}")
        return None
    # Report success only after the read has actually succeeded, so a decode
    # error is never preceded by a misleading "Loaded ..." line.
    print(f"Loaded {os.path.basename(filepath)}")
    return script_text

def split_into_scenes(full_script_text: str) -> list:
    """Split a whole screenplay into a list of scene strings.

    A scene starts at a slug line: a line that begins (after optional
    indentation and an optional scene number) with ``INT`` or ``EXT`` as a
    whole word, e.g. ``INT. COFFEE SHOP - MORNING`` or ``12 EXT. STREET``.
    Everything before the first slug line (title page, etc.) is discarded.

    Args:
        full_script_text: The complete script text.

    Returns:
        A list of stripped scene strings, each beginning with its slug line.
        Empty when the text contains no slug line.
    """
    # Anchor to the start of a line and require a word boundary after INT/EXT.
    # Without both, words such as NEXT, TEXT, PRINT or EXTREME anywhere in a
    # line were treated as scene headings and split scenes mid-sentence.
    pattern = r"^[ \t]*((?:\d+[ \t]+)?(?:INT|EXT)\b.*)"

    # With one capture group re.split returns
    # [preamble, heading_1, body_1, heading_2, body_2, ...], so headings sit at
    # the odd indexes and each is always followed by its body.
    split_text = re.split(pattern, full_script_text, flags=re.MULTILINE)
    scenes = [
        (split_text[i] + split_text[i + 1]).strip()
        for i in range(1, len(split_text), 2)
    ]

    print(f"   - Split into {len(scenes)} scenes.")
    return scenes

def parse_scene(scene_text: str) -> dict:
    """Turn one scene string into a dict of heading, action lines and dialogue.

    The first line of the (stripped) scene is the heading. Every later line is
    classified by its leading-space indentation:

    * an upper-case line indented by more than 10 spaces and shorter than 30
      characters is a character cue;
    * the non-blank lines directly after a cue that are indented by more than
      5 spaces are that character's dialogue (joined with single spaces);
    * any other non-blank line is an action line.

    A cue that is not followed by any dialogue line is dropped.

    Returns:
        ``{"scene_heading": str, "action_lines": [str, ...],
        "dialogue_blocks": [{"character": str, "dialogue": str}, ...]}``
    """
    def leading_spaces(raw_line):
        # Only literal spaces count as indentation (tabs do not).
        return len(raw_line) - len(raw_line.lstrip(' '))

    rows = scene_text.strip().split('\n')
    heading = rows[0].strip()
    actions = []
    blocks = []

    total = len(rows)
    pos = 1
    while pos < total:
        raw = rows[pos]
        text = raw.strip()
        looks_like_cue = (
            text.isupper()
            and leading_spaces(raw) > 10
            and 0 < len(text) < 30
        )

        if not looks_like_cue:
            if text:
                actions.append(text)
            pos += 1
            continue

        # Consume the cue, then gather the indented lines that follow it.
        pos += 1
        spoken = []
        while pos < total:
            candidate = rows[pos].strip()
            if not candidate or leading_spaces(rows[pos]) <= 5:
                # Leave this line for the outer loop to classify.
                break
            spoken.append(candidate)
            pos += 1

        if spoken:
            blocks.append({"character": text, "dialogue": " ".join(spoken)})

    return {"scene_heading": heading, "action_lines": actions, "dialogue_blocks": blocks}


# Parse every script into structured scenes and collect them in one flat list.
all_parsed_scenes = []
for filepath in script_full_paths:
    full_script_text = load_script_text(filepath)
    # load_script_text returns None for a missing file; empty files are skipped too.
    if full_script_text:
        scenes = split_into_scenes(full_script_text)
        for scene_text in scenes:
            if scene_text:
                parsed_scene = parse_scene(scene_text)
                # Record which script file the scene came from.
                parsed_scene['source_movie'] = os.path.basename(filepath)
                all_parsed_scenes.append(parsed_scene)
    # Visual separator between scripts in the cell output.
    print("-" * 20)



# Persist the structured scenes as JSON next to the raw scripts so that later
# cells can reload them without re-parsing.
output_filename = 'tarantino_dataset_structured.json'
output_path = os.path.join(script_folder_path, output_filename)

with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(all_parsed_scenes, f, indent=2)

print(f"All scripts processed.")
print(f"Created '{output_filename}' with a total of {len(all_parsed_scenes)} scenes.")


Loaded pulp_fiction.txt
   - Split into 97 scenes.
--------------------
Loaded reservior_dogs.txt
   - Split into 48 scenes.
--------------------
Loaded jb.txt
   - Split into 109 scenes.
--------------------
Loaded ing_bastards.txt
   - Split into 58 scenes.
--------------------
Loaded nbk.txt
   - Split into 104 scenes.
--------------------
All scripts processed.
Created 'tarantino_dataset_structured.json' with a total of 416 scenes.


In [None]:
import json
import os
import re

# Location of the structured dataset (a JSON list of parsed scene dicts).
script_folder_path = '/content/drive/My Drive/movie_script_project/raw_txt/'
structured_dataset_path = os.path.join(script_folder_path, 'tarantino_dataset_structured.json')

# sample_scene stays None when the dataset is missing or empty; the demo at the
# bottom of this cell only runs when a scene was actually selected.
sample_scene = None
try:
    with open(structured_dataset_path, 'r', encoding='utf-8') as f:
        all_parsed_scenes = json.load(f)
        if all_parsed_scenes:

            # Use the first scene of the dataset as the test case.
            sample_scene = all_parsed_scenes[0]
            print("Successfully loaded dataset and selected the first scene for testing.")
        else:
            print("ERROR: The dataset file is empty.")
except FileNotFoundError:
    print(f"ERROR: Could not find the file at {structured_dataset_path}")

def generate_simple_instruction(parsed_scene: dict) -> str:
    """Generate a simple, natural-language instruction for a parsed scene.

    The location is taken from the scene heading with its slug prefix removed
    (optional scene number, then ``INT``/``EXT`` or a combined ``INT./EXT.``),
    whitespace runs collapsed and the text lower-cased. The speaking
    characters are the unique ``character`` values of the dialogue blocks, in
    alphabetical order.

    Args:
        parsed_scene: Dict with a ``scene_heading`` string and a
            ``dialogue_blocks`` list of ``{"character": ..., ...}`` dicts.

    Returns:
        A one-sentence instruction when the scene has no dialogue, otherwise a
        two-sentence instruction that also names the speaking characters.
    """
    heading = parsed_scene['scene_heading']
    # INT/EXT must be whole words: the old prefix pattern turned
    # "INTERIOR WAREHOUSE" into "erior warehouse". A leading scene number and
    # the combined "INT./EXT." form are stripped as well.
    location = re.sub(
        r'^(?:\d+\s+)?(?:(?:INT|EXT)\b\.?\s*/?\s*)+', '', heading
    ).strip()
    location = re.sub(r'\s{2,}', ' ', location).lower()

    characters = sorted({block['character'] for block in parsed_scene['dialogue_blocks']})
    character_str = ", ".join(characters)

    if not character_str:
        return f"Write a script scene describing an event at {location}."
    return f"Write a movie scene that takes place at {location}. It should feature dialogue between {character_str}."

def format_scene_as_response(parsed_scene: dict) -> str:
    """Reconstruct script-like text from a parsed scene dictionary.

    The output is the scene heading, then all action lines (one per line),
    then every dialogue block as an indented character name followed by the
    indented dialogue. The parsed structure does not record how action and
    dialogue were interleaved, so all action lines come before all dialogue.

    Runs of three or more newlines are collapsed to a single blank line, so
    this preview is formatted the same way as the text written to the
    fine-tuning dataset later in the notebook.

    Args:
        parsed_scene: Dict with ``scene_heading``, ``action_lines`` and
            ``dialogue_blocks`` entries.

    Returns:
        The reconstructed scene text.
    """
    response_parts = [parsed_scene['scene_heading']]

    if parsed_scene['action_lines']:
        response_parts.append("\n" + "\n".join(parsed_scene['action_lines']))

    for block in parsed_scene['dialogue_blocks'] or ():
        response_parts.append(f"\n\n    {block['character']}\n    {block['dialogue']}")

    full_text = "\n".join(response_parts)
    # Joining with "\n" puts three newlines in front of every dialogue block;
    # normalise them to one blank line.
    return re.sub(r'\n{3,}', '\n\n', full_text)

# Demo: show the structured scene ("before") next to the instruction/response
# training example built from it ("after"). Skipped when no sample scene was loaded.
if sample_scene:
    print("\n" + "="*50)
    print(" 'BEFORE' - THE STRUCTURED JSON SNIPPET")
    print("="*50)
    print(json.dumps(sample_scene, indent=2))

    instruction = generate_simple_instruction(sample_scene)
    response = format_scene_as_response(sample_scene)

    # One training example pairs the generated instruction with the scene text.
    final_example = {
        "instruction": instruction,
        "response": response
    }


    print("\n" + "="*50)
    print(" 'AFTER' - THE FINAL TRAINING EXAMPLE")
    print("="*50)
    print(json.dumps(final_example, indent=2))

✅ Successfully loaded dataset and selected the first scene for testing.

 'BEFORE' - THE STRUCTURED JSON SNIPPET
{
  "scene_heading": "INT. COFFEE SHOP   MORNING",
  "action_lines": [
    "A normal Denny's, Spires-like coffee shop in Los Angeles.",
    "It's about 9:00 in the morning. While the place isn't jammed,",
    "there's a healthy number of people drinking coffee, munching",
    "on bacon and eating eggs.",
    "Two of these people are a YOUNG MAN and a YOUNG WOMAN. The",
    "Young Man has a slight working-class English accent and,",
    "like his fellow countryman, smokes cigarettes like they're",
    "going out of style.",
    "It is impossible to tell where the Young Woman is from or",
    "how old she is; everything she does contradicts something",
    "she did. The boy and girl sit in a booth. Their dialogue is",
    "to be said in a rapid pace \"HIS GIRL FRIDAY\" fashion.",
    "The boy and girl laugh, their laughter putting a pause in",
    "there, back and forth.",
   

In [None]:
!pip install -q groq

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/131.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m122.9/131.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.1/131.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Imports for the Groq-based instruction generation (the groq package is installed in the previous cell)


import os
import json
import re
import time
from groq import Groq
from tqdm.auto import tqdm

# SECURITY: API keys must never be stored in the notebook. A literal key was
# previously embedded at this point; treat it as compromised and revoke it in
# the Groq console. The key is now read from the GROQ_API_KEY environment
# variable, with an interactive (non-echoing) prompt as a fallback.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    from getpass import getpass
    groq_api_key = getpass("Enter your Groq API key: ")
client = Groq(api_key=groq_api_key)


# Input: structured scenes produced earlier in the notebook.
# Output: instruction/response pairs for fine-tuning, written to the same folder.
script_folder_path = '/content/drive/My Drive/movie_script_project/raw_txt/'
structured_dataset_path = os.path.join(script_folder_path, 'tarantino_dataset_structured.json')
output_filename = 'tarantino_dataset_for_finetuning.json'
output_path = os.path.join(script_folder_path, output_filename)

try:
    with open(structured_dataset_path, 'r', encoding='utf-8') as f:
        all_parsed_scenes = json.load(f)
    print(f"Loaded {len(all_parsed_scenes)} scenes from Google Drive.")
except Exception as e:
    # Deliberately best-effort: report the problem and continue with an empty
    # list so the rest of the cell becomes a no-op instead of raising.
    print(f"ERROR loading dataset: {e}")
    all_parsed_scenes = []


def format_scene_as_response(parsed_scene: dict) -> str:
    """Render a parsed scene dict back into script-like text.

    Layout: the scene heading, then the action lines (one per line), then each
    dialogue block as an indented character name over its indented dialogue.
    Any run of three or more newlines is collapsed to exactly two.
    """
    pieces = [parsed_scene['scene_heading']]

    actions = parsed_scene['action_lines']
    if actions:
        pieces.append("\n" + "\n".join(actions))

    pieces.extend(
        f"\n\n    {block['character']}\n    {block['dialogue']}"
        for block in (parsed_scene['dialogue_blocks'] or ())
    )

    return re.sub(r'\n{3,}', '\n\n', "\n".join(pieces))

def generate_instruction_with_groq(scene_text: str, retries=3, delay=5) -> str:
    """Ask the Groq chat API for a creative instruction that matches a scene.

    The scene text is embedded in a meta-prompt and sent as a single user
    message through the module-level ``client``. Failed calls are retried with
    exponential back-off (the wait doubles after every failure).

    Args:
        scene_text: The formatted scene the instruction should describe.
        retries: Total number of attempts before giving up.
        delay: Seconds to wait before the second attempt.

    Returns:
        The stripped text of the first completion choice, or the sentinel
        string ``"Error: Exceeded max retries."`` when every attempt failed.
        Callers use the ``"Error:"`` prefix to detect failure.
    """

    meta_prompt = f"""
    You are an expert screenwriter and data labeling assistant... [The rest of your v3 meta-prompt]
    ...
    Here is the scene to analyze:
    ---
    {scene_text}
    ---
    Now, provide only the single, high-quality, Tarantino-esque creative prompt:
    """

    for attempt in range(1, retries + 1):
        try:
            chat_completion = client.chat.completions.create(
                messages=[{"role": "user", "content": meta_prompt}],
                model="llama3-8b-8192",
            )
            return chat_completion.choices[0].message.content.strip()
        except Exception as e:
            # Any failure (rate limit, network, API error) is treated as retryable.
            if attempt == retries:
                # Nothing left to retry: do not waste a full back-off sleep.
                print(f"   - Attempt {attempt}/{retries} failed: {e}. Giving up.")
                break
            print(f"   - Attempt {attempt}/{retries} failed: {e}. Retrying in {delay} seconds...")
            time.sleep(delay)
            delay *= 2
    return "Error: Exceeded max retries."


def _save_training_data(examples: list, path: str) -> None:
    """Write the accumulated training examples to ``path`` as indented JSON."""
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(examples, f, indent=2)


# Load the examples written by a previous (possibly interrupted) run, if any.
try:
    with open(output_path, 'r', encoding='utf-8') as f:
        final_training_data = json.load(f)
    print(f"Resuming from a previous run. Found {len(final_training_data)} existing examples.")
except FileNotFoundError:
    final_training_data = []
    print("Starting a fresh run.")

# Resume by content, not by count. Scenes whose instruction could not be
# generated are never appended, so len(final_training_data) is NOT the number of
# scenes already visited: resuming at that index re-processed (duplicated) some
# scenes. Instead, every scene whose formatted text is already saved is skipped,
# and previously failed scenes get another attempt. A Counter (multiset) is used
# so scenes with identical text are each kept exactly once.
# NOTE(review): this assumes the saved "response" strings were produced by the
# current format_scene_as_response - confirm before resuming an older file.
from collections import Counter

already_saved = Counter(example["response"] for example in final_training_data)
scenes_to_process = []
for scene in all_parsed_scenes:
    saved_key = format_scene_as_response(scene)
    if already_saved[saved_key] > 0:
        already_saved[saved_key] -= 1
    else:
        scenes_to_process.append(scene)

# Number of scenes skipped because they are already present in the output file.
start_index = len(all_parsed_scenes) - len(scenes_to_process)

if scenes_to_process:
    print(f"\nSkipping {start_index} already-saved scenes; {len(scenes_to_process)} left to process...")

    for i, scene in enumerate(tqdm(scenes_to_process)):
        response_text = format_scene_as_response(scene)
        instruction_text = generate_instruction_with_groq(response_text)

        # The generator signals failure with a string that starts with "Error:".
        # Checking only the prefix keeps genuine instructions that merely
        # mention "Error:" somewhere in their text.
        if not instruction_text.startswith("Error:"):
            final_training_data.append({"instruction": instruction_text, "response": response_text})

        # Checkpoint every 10 scenes so an interrupted run loses little work.
        if (i + 1) % 10 == 0:
            _save_training_data(final_training_data, output_path)
            tqdm.write(f"Progress saved! {len(final_training_data)} examples complete.")

_save_training_data(final_training_data, output_path)

print(f"\nSuccess! Created '{output_filename}' --{len(final_training_data)} examples.")
print(f"dataset is now saved in your Google Drive at: {output_path}")

✅ Loaded 416 scenes from Google Drive.
✅ Resuming from a previous run. Found 29 existing examples.

Starting processing from scene 30...


  0%|          | 0/387 [00:00<?, ?it/s]

   - ✅ Progress saved! 39 examples complete.
   - ✅ Progress saved! 49 examples complete.
   - ✅ Progress saved! 59 examples complete.
   - ✅ Progress saved! 69 examples complete.
   - ✅ Progress saved! 79 examples complete.
   - ✅ Progress saved! 89 examples complete.
   - ✅ Progress saved! 99 examples complete.
   - ✅ Progress saved! 109 examples complete.
   - ✅ Progress saved! 119 examples complete.
   - ✅ Progress saved! 129 examples complete.
   - ✅ Progress saved! 139 examples complete.
   - ✅ Progress saved! 149 examples complete.
   - ✅ Progress saved! 159 examples complete.
   - ✅ Progress saved! 169 examples complete.
   - ✅ Progress saved! 179 examples complete.
   - ✅ Progress saved! 189 examples complete.
   - ✅ Progress saved! 199 examples complete.
   - ✅ Progress saved! 209 examples complete.
   - ✅ Progress saved! 219 examples complete.
   - ✅ Progress saved! 229 examples complete.
   - ✅ Progress saved! 239 examples complete.
   - ✅ Progress saved! 249 examples compl

In [None]:
import json
import os

# Sanity check: reload the fine-tuning dataset and print its first few examples.
script_folder_path = '/content/drive/My Drive/movie_script_project/raw_txt/'
final_dataset_path = os.path.join(script_folder_path, 'tarantino_dataset_for_finetuning.json')

try:
    with open(final_dataset_path, 'r', encoding='utf-8') as f:
        final_data = json.load(f)
    print(f"Successfully loaded {len(final_data)} examples.")
except Exception as e:
    # Best-effort: report the problem and fall back to an empty list so the
    # preview below is simply skipped.
    print(f"ERROR loading dataset: {e}")
    final_data = []


if final_data:
    # Show only the first three examples to keep the cell output short.
    for i, example in enumerate(final_data[:3]):
        print("\n" + "="*50)
        print(f"EXAMPLE #{i+1}")
        print("="*50)

        print("\n--- INSTRUCTION ---")
        print(example['instruction'])

        print("\n--- RESPONSE ---")
        print(example['response'])

✅ Successfully loaded 416 examples.

EXAMPLE #1

--- INSTRUCTION ---
Craft a scene where two low-rent, Bonnie and Clyde-esque lovers, Pumpkin and Honey Bunny, meticulously plan a diner robbery, their rapid-fire, witty dialogue a chaotic dance between cynicism, pragmatism, and burgeoning psychopathy.  Focus on the escalation of their plan from a hesitant, almost reluctant agreement to a gleeful embrace of chaotic violence, highlighting the absurd logic behind their target selection and the darkly comedic contrast between their meticulous planning and their inherent instability.  The scene should end with the eruption of their robbery, showcasing Pumpkin's controlled aggression and Honey Bunny's unhinged mania.  Infuse the scene with Tarantino's trademark sharp wit, unexpected violence, and nonlinear storytelling techniques.

--- RESPONSE ---
INT. COFFEE SHOP   MORNING

A normal Denny's, Spires-like coffee shop in Los Angeles.
It's about 9:00 in the morning. While the place isn't jammed,