In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset

# Load the VidProM dataset from the Hugging Face Hub
dataset = load_dataset("WenhaoWang/VidProM")

# Convert the train split to a pandas DataFrame. `Dataset.to_pandas()` is the
# conversion provided by the `datasets` library; passing the Dataset to
# `pd.DataFrame(...)` instead builds the frame from individual Python row objects.
df = dataset['train'].to_pandas()  # Ensure the correct split is used
# df = pd.read_csv("VidProM_unique.csv")

In [None]:
# Preview the first ten rows to see which columns exist and what the values look like
df.head(10)



In [None]:
# List the column names available in the loaded dataframe
df.columns



In [5]:
# NOTE(review): this repeats the column listing from the previous cell; one of
# the two cells can be removed.
df.columns



In [6]:
# Build a narrower dataframe holding only the prompt text and the video id (uuid).
# `dd` is the table that the lookup helpers further down search by uuid.
dd = df[['prompt', 'uuid']]
dd.head()



In [19]:
# Count distinct prompt texts. len() of the column reports the number of rows,
# which only equals the number of unique prompts if no prompt is repeated.
print(f"There are {dd['prompt'].nunique()} unique prompts")
# print(f"We wil start by sampling {np.floor(.0001 * len(dd['prompt']))} prompts")



In [22]:
# Start small: take the first 100 prompts. The explicit copy makes `first` an
# independent frame, so adding columns to it later does not write into a slice
# of `dd` (which is what triggers pandas' SettingWithCopyWarning).
first = dd.head(100).copy()
display(first)
# Print every prompt in full as well, since the rendered table may truncate long text
for prompt in first['prompt']:
    print(f"Prompt: {prompt}")





## Notes
- the prompts come in different formats
    - some specify the style of the video, some specify the screen size
- prompt length varies a LOT
- capitalization is inconsistent, and some prompts contain symbols -- e.g. leftover HTML entities such as `&quot;`

In [31]:
import matplotlib.pyplot as plt
import seaborn as sns

# Add the number of whitespace-separated words in each prompt as a new column.
# `assign` returns a new frame instead of writing into a slice of `dd`, which
# avoids pandas' SettingWithCopyWarning and keeps this cell safe to re-run.
first = first.assign(prompt_length=first['prompt'].str.split().str.len())
display(first['prompt_length'].describe())

# Check the distribution of prompt lengths: a histogram shows how many prompts
# fall into each length range (a bar per row would only show the raw values).
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(first['prompt_length'], bins=20)
ax.set_title('Distribution of Prompt Lengths')
ax.set_xlabel('Prompt length (words)')
ax.set_ylabel('Number of prompts')
plt.show()







## Watching the videos

I was struggling to find the video from the uuid, so instead I found the uuid/prompt from the video -- after downloading them.

In [59]:
video_id = '0a0a5078-b2b1-56da-92ca-a0fb07bc33c0'  # example uuid used for a first lookup
def prompt_from_id(video_id, frame=None):
    """
    Look up the prompt that belongs to a video uuid.

    Args:
        video_id (str): The uuid to search for in the ``uuid`` column.
        frame (pandas.DataFrame, optional): Table with ``uuid`` and ``prompt``
            columns to search. Defaults to the notebook-level ``dd`` dataframe.

    Returns:
        tuple | None: ``(prompt, index)`` for the first matching row, where
        ``index`` is that row's index label converted to ``int``; ``None``
        when no row has the given uuid.
    """
    table = dd if frame is None else frame
    matches = table.loc[table['uuid'] == video_id, 'prompt']
    if matches.empty:
        return None
    # Only the first matching row is reported, even if the uuid appears more than once
    return matches.iloc[0], int(matches.index[0])

In [60]:
# Look up the example uuid defined above; the result is a (prompt, row index)
# tuple, or None when the uuid is not present in `dd`.
result = prompt_from_id(video_id)
print(result)



In [84]:
def process_video_files(video_files, file_beginnings, prompt_lookup=None,
                        output_csv='processed_video_data.csv'):
    """
    Match video file names to their prompts and collect the results.

    Each file name is expected to look like ``<prefix><uuid>.<extension>``. The
    first prefix in ``file_beginnings`` that the name starts with is stripped,
    everything up to the first ``.`` of the remainder is taken as the uuid, and
    the uuid is looked up to find its prompt. Files that start with none of the
    prefixes are skipped.

    Args:
        video_files (list): A list of video file names.
        file_beginnings (list): A list of prefixes for the video files.
        prompt_lookup (callable, optional): A function that takes a uuid and
            returns a tuple (prompt, index), or None if no prompt is found.
            Defaults to the notebook-level ``prompt_from_id``.
        output_csv (str, optional): Path the collected records are written to
            as CSV. Pass None to skip writing, e.g. when inspecting a single
            video without overwriting an earlier full run.

    Returns:
        list: A list of dictionaries, where each dictionary contains the UUID,
        prefix, and prompt (None when no prompt was found).
    """
    # Resolve the default at call time so the notebook-level helper is only
    # required when no explicit lookup is supplied.
    lookup = prompt_from_id if prompt_lookup is None else prompt_lookup

    data = []  # One record per file that matched a known prefix

    for file in video_files:
        # Use the first prefix that matches; files with no known prefix are skipped
        prefix = next((p for p in file_beginnings if file.startswith(p)), None)
        if prefix is None:
            continue

        uuid = file[len(prefix):].split('.')[0]  # Remove prefix and file extension

        prompt_info = lookup(uuid)
        if not prompt_info:
            prompt = None
            print(f"No prompt found for uuid: {uuid}")
        else:
            prompt, _ = prompt_info  # the row index is not needed here

        data.append({
            "uuid": uuid,
            "prefix": prefix,
            "prompt": prompt
        })

    # Save the records for future use unless the caller opted out
    if output_csv is not None:
        pd.DataFrame(data).to_csv(output_csv, index=False)
    return data

In [85]:
import os

# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
video_folder = '/home/bia/Documents/genvid/sample_videos' # This is where the sample videos are stored

# list the video files found in a folder
def list_video_files(folder):
    """
    List all video files in the given folder.

    Args:
        folder (str): Path of the directory to scan (not recursive).

    Returns:
        list: Names (not full paths) of the entries ending in .mp4, .avi, .mov
        or .mkv, sorted alphabetically so repeated runs give the same order.
        An empty list is returned, after printing a message, when ``folder``
        is not an existing directory.
    """
    # isdir rather than exists: a path that points at a regular file would pass
    # an existence check and then make os.listdir raise.
    if not os.path.isdir(folder):
        print(f"Folder {folder} does not exist.")
        return []

    video_extensions = ('.mp4', '.avi', '.mov', '.mkv')
    # os.listdir returns entries in arbitrary order, so sort for reproducibility
    return sorted(name for name in os.listdir(folder) if name.endswith(video_extensions))

In [86]:
# Filename prefixes used by the sample videos; whatever follows the prefix
# (up to the first '.') is treated as the video's uuid.
file_beginnings = ['t2vz-', 'pika-', 'vc-'] # these are the prefixes of the video files in the sample folder
video_files = list_video_files(video_folder)

# Match every sample video to its prompt. Besides returning the records, this
# call also writes them to processed_video_data.csv.
result = process_video_files(video_files, file_beginnings)

### Notes

Looking at the videos in the sample, we can see that:
- some videos are watermarked
- the videos from t2vz are not the most realistic -- for example, the t2vz video with uuid 0a64622a-f379-5f6d-a7f5-4d8e9d561f37 had the prompt "Scene 7: Masked Dance  Brief snippets of hands and elegant clothing, showcasing the character dancing with different masked partners." but the result looks like this:

<img src="t2vz_mask_example.png" width="200px"/>

In [None]:
# Look up the prompt for a single t2vz video.
# NOTE(review): process_video_files writes processed_video_data.csv on every
# call, so this one-file run replaces the CSV produced for the whole sample folder.
baby_example = process_video_files(['t2vz-0a1e22f6-5863-5c21-8514-78baee8d8a92.mp4'], file_beginnings)  # Call the function to process video files
print(baby_example)



This video is also pretty unrealistic -- a lot of the t2vz videos are fairly choppy.

In [82]:
# Look up the prompt for another single t2vz video.
# NOTE(review): as with every process_video_files call, this also rewrites
# processed_video_data.csv with just this one record.
video = ['t2vz-0abf1bfb-7300-5912-886a-b7b785cb7712.mp4']
print(process_video_files(video, file_beginnings))  # Call the function to process video files and get structured data




## Looking at the prompts
- using the prompts generated from the videos we can watch, so we can see them all