In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset

# Load the VidProM dataset from Hugging Face
dataset = load_dataset("WenhaoWang/VidProM")

# Convert the 'train' split to a pandas DataFrame.
# Dataset.to_pandas() is the library's own conversion; it replaces
# pd.DataFrame(dataset['train']), which builds the frame by iterating the dataset.
df = dataset['train'].to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# Preview the first 10 rows of the full DataFrame to see what each column holds
df.head(10)



In [7]:
# List the column names available in the DataFrame
df.columns



In [10]:
# Keep only the prompt text and the video id (uuid) for the rest of the analysis
PROMPT_COLUMNS = ['prompt', 'uuid']
dd = df[PROMPT_COLUMNS]
dd.head()



In [11]:
# Report the total row count and the number of distinct prompts separately:
# len() counts every row, so it cannot be reported as the number of *unique* prompts.
print(f"There are {len(dd)} prompts, {dd['prompt'].nunique()} of them unique")
# print(f"We will start by sampling {np.floor(.0001 * len(dd['prompt']))} prompts")



In [12]:
# Take the first 100 prompts as a small working sample.
# .copy() makes `first` an independent DataFrame, so adding columns to it later
# does not write into a slice of `dd` (SettingWithCopyWarning).
first = dd.head(100).copy()
display(first)

# Print every prompt on its own line so the whole text of each one can be read
for prompt in first['prompt']:
    print(f"Prompt: {prompt}")





## Notes
- We can see that prompts have different formats
    - some specify styles of videos, some specify screen size
- Prompt length varies a LOT
- Capitalization is not consistent, and some prompts contain stray symbols -- e.g. `&quot`

In [13]:
# Add the number of whitespace-separated words in each prompt as a new column.
# .assign() returns a new DataFrame, so nothing is written into a slice of `dd`
# (no SettingWithCopyWarning) and re-running the cell gives the same result.
first = first.assign(prompt_length=first['prompt'].str.split().str.len())
display(first['prompt_length'].describe())

# Plot the distribution of prompt lengths as a histogram
# (a bar per row shows each prompt's length, not how the lengths are distributed)
import matplotlib.pyplot as plt
import seaborn as sns
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(first['prompt_length'], bins=20)
ax.set_title('Distribution of Prompt Lengths')
ax.set_xlabel('Prompt length (words)')
ax.set_ylabel('Number of prompts')
plt.show()







## Watching the videos

I was struggling to find the video from the uuid, so instead I found the uuid/prompt from the video -- after downloading them.

In [2]:
from prompt_extractor import PromptExtractor

In [5]:
# Filename prefixes of the video files in the sample folder
file_beginnings = ['t2vz-', 'pika-', 'vc-', 'ms-']
# Folder where the sample videos are stored
video_folder = '/home/bia/Documents/genvid/sample_videos'

# Set up the extractor with the full prompt DataFrame and the known prefixes
pe = PromptExtractor(df, file_beginnings)

# Load the video names from the sample folder
videos = pe.list_video_files(video_folder)
print(videos)

# Process the videos to extract their prompts, then preview the result
extracted_prompts = pe.process_video_files(videos)
display(extracted_prompts.head())

# Check one specific uuid to see whether the extraction worked
uuid_matches = extracted_prompts['uuid'] == '0aa8113f-ee4a-5086-a7a6-e1c6d8b245aa'
extracted_prompts[uuid_matches]







### Notes

looking at the videos in the sample, we can see that 
- some videos are watermarked 
- the videos from t2vz are not the most realistic -- for example, a video uuid: 0a64622a-f379-5f6d-a7f5-4d8e9d561f37 from t2vz had prompt: Scene 7: Masked Dance  Brief snippets of hands and elegant clothing, showcasing the character dancing with different masked partners.  
looks like
<img src = "t2vz_mask_example.png" width = "200px"/>

### Was trying to process all the tar 1 videos in my format -- quit early due to time constraints (had to process 80194 videos; 1/4 took 25 mins)

In [3]:
# Full pass over every downloaded video folder.
# NOTE: this is slow -- the notes above record 80194 videos, with a quarter taking about 25 minutes.
file_beginnings = ['t2vz-', 'pika-', 'vc-', 'ms-'] # these are the prefixes of the video files
pe1 = PromptExtractor(df, file_beginnings)

video_folders = [
    '/home/bia/Documents/genvid/t2vz_videos_all',
    '/home/bia/Documents/genvid/pika_videos_all',
    '/home/bia/Documents/genvid/vc2_videos_all',
    '/home/bia/Documents/genvid/ms_videos_all',
]

# Results are stored under names distinct from the sample-folder cell's `videos` and
# `extracted_prompts`, so running the notebook top to bottom does not replace the small
# sample that the prompt analysis further down works on.
all_videos = pe1.list_video_files_multi(video_folders)
print(f"there are {len(all_videos)} videos in the {len(video_folders)} folders")

# Process the videos to extract prompts; csv_path is passed through to the extractor
all_extracted_prompts = pe1.process_video_files(all_videos, csv_path='tar1.csv')

# Check one specific uuid to see whether the extraction worked
all_extracted_prompts[all_extracted_prompts['uuid'] == '0aa8113f-ee4a-5086-a7a6-e1c6d8b245aa']

Listing video files in folder: /home/bia/Documents/genvid/t2vz_videos_all
Listing video files in folder: /home/bia/Documents/genvid/pika_videos_all
Listing video files in folder: /home/bia/Documents/genvid/vc2_videos_all
Listing video files in folder: /home/bia/Documents/genvid/ms_videos_all
there are 80194 videos in the 4 folders
Processing file: {file}
9f033a44-0947-559d-948c-743d59bb8e78
Processing file: {file}
23ff54a9-f1a7-53af-ae82-1d51f7b4f4ff
Processing file: {file}
b5ac58b3-d600-5cc3-be82-546fbf2c59f7
Processing file: {file}
744c6c17-8909-5744-9011-d45a172274b4
Processing file: {file}
3b86c12c-0fc3-552e-88f3-f104eb9951fb
Processing file: {file}
0c10326c-57c4-5ae4-b7e7-53098b7690ee
Processing file: {file}
9b640051-c22d-55b6-abe8-3781608fdfc3
Processing file: {file}
427cd9ff-f2fb-5b0f-b9ff-b040ce211498
Processing file: {file}
f0970578-276c-5201-abb9-b545b2b7d70e
Processing file: {file}
2a61bf27-5927-5e30-9ff7-91e9c99601d9
Processing file: {file}
0aa8113f-ee4a-5086-a7a6-e1c6d8b24

KeyboardInterrupt: 

## Looking at the prompts
- using the prompts generated from the videos we can watch, so we can see them all

In [None]:
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer, util
import seaborn as sns
import matplotlib.pyplot as plt

# Semantic similarity check: embed every extracted prompt with a pre-trained
# sentence-transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
prompt_texts = extracted_prompts['prompt'].tolist()
embeddings = model.encode(prompt_texts, convert_to_tensor=True)

# Cosine similarity of every prompt against every other prompt
cosine_similarities = util.pytorch_cos_sim(embeddings, embeddings)
similarity_matrix = cosine_similarities.cpu().numpy()

# Show the similarity matrix as a heatmap
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(similarity_matrix, cmap='viridis', annot=False, ax=ax)
ax.set_title('Cosine Similarity Matrix of Extracted Prompts')
ax.set_xlabel('Prompt Index')
ax.set_ylabel('Prompt Index')
plt.show()




In [None]:
# Pairwise cosine similarities between all prompt embeddings, as a NumPy array
cosine_similarities = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()

# Upper triangle without the diagonal: every pair of distinct prompts appears once
upper_triangle_indices = np.triu_indices(len(extracted_prompts), k=1)
similarities = cosine_similarities[upper_triangle_indices]
indices = np.argsort(similarities)[::-1]  # most similar pair first

top_n = 10  # Number of top similar prompts to display
best_pairs = indices[:top_n]
pair_rows, pair_cols = upper_triangle_indices

print(f"Top {top_n} Most Similar Prompts:\n")
for prompt_index1, prompt_index2, similarity_score in zip(
    pair_rows[best_pairs], pair_cols[best_pairs], similarities[best_pairs]
):
    print(f"Similarity: {similarity_score:.4f}")
    print(f"  Prompt 1: {extracted_prompts['prompt'].iloc[prompt_index1]}")
    print(f"  Prompt 2: {extracted_prompts['prompt'].iloc[prompt_index2]}\n")



In [54]:
import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sentence_transformers import SentenceTransformer

def _plot_indexed_clusters(ax, points, labels, title):
    """Scatter 2-D `points` on `ax`, coloured by `labels`, and annotate each point with its row position."""
    ax.scatter(points[:, 0], points[:, 1], c=labels, cmap='viridis')
    ax.set_title(title)
    for position, (x, y) in enumerate(points):
        ax.annotate(str(position), xy=(x, y), fontsize=8)


def cluster_prompts_with_index(extracted_prompts, num_clusters=5):
    """Clusters prompts and plots prompt indices with a separate key.

    The prompts are embedded with the 'all-MiniLM-L6-v2' sentence-transformer model,
    clustered with both K-means and agglomerative (hierarchical) clustering, and
    projected to 2-D with t-SNE. The two clusterings are drawn side by side, each
    point annotated with the prompt's row position, and a key mapping each position
    to its prompt text is printed afterwards.

    Args:
        extracted_prompts: DataFrame with a 'prompt' column of strings.
        num_clusters: Number of clusters requested from both clustering algorithms.

    Returns:
        None. The function shows a figure and prints the key.
    """
    prompts = extracted_prompts['prompt'].tolist()

    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(prompts)

    # K-means Clustering
    kmeans_labels = KMeans(n_clusters=num_clusters, random_state=0).fit_predict(embeddings)

    # Hierarchical Clustering
    hierarchical_labels = AgglomerativeClustering(n_clusters=num_clusters).fit_predict(embeddings)

    # Visualize with t-SNE. scikit-learn requires perplexity < n_samples and defaults
    # to 30, so small prompt samples would raise; clamp it while keeping the default
    # for larger inputs.
    perplexity = min(30.0, max(len(prompts) - 1, 1))
    tsne = TSNE(n_components=2, random_state=0, perplexity=perplexity)
    reduced_embeddings = tsne.fit_transform(embeddings)

    # Both panels share the same 2-D projection; only the colouring differs
    fig, (kmeans_ax, hierarchical_ax) = plt.subplots(1, 2, figsize=(14, 6))
    _plot_indexed_clusters(kmeans_ax, reduced_embeddings, kmeans_labels,
                           "K-means Clusters (Prompt Indices)")
    _plot_indexed_clusters(hierarchical_ax, reduced_embeddings, hierarchical_labels,
                           "Hierarchical Clusters (Prompt Indices)")
    fig.tight_layout()
    plt.show()

    # Print Prompt Index Key
    print("\nPrompt Index Key:")
    for i, prompt in enumerate(prompts):
        print(f"{i}: {prompt}")

# Example Usage:
# cluster_prompts_with_index(extracted_prompts, num_clusters=5)

In [None]:
# Cluster the extracted prompts into 5 groups and plot the K-means and hierarchical results
cluster_prompts_with_index(extracted_prompts, num_clusters=5)





In [64]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

# Fetch the NLTK data used below: the stop-word lists read by stopwords.words('english'),
# and 'punkt_tab' (presumably the tokenizer data nltk.word_tokenize needs -- confirm for the installed NLTK version)
nltk.download('punkt_tab')
nltk.download('stopwords')


def name_clusters(extracted_prompts, num_clusters=5):
    """Names clusters using keyword frequency and TF-IDF.

    Prompts are embedded with the 'all-MiniLM-L6-v2' sentence-transformer model and
    grouped with K-means. For each cluster, the five most frequent non-stop-word
    tokens and the five terms with the highest summed TF-IDF score are collected;
    the two rankings are interleaved (best of each first), de-duplicated in that
    order, and the first three keywords are joined into the cluster name.

    Args:
        extracted_prompts: DataFrame with a 'prompt' column of strings.
        num_clusters: Number of K-means clusters.

    Returns:
        A tuple ``(cluster_names, cluster_labels)``: ``cluster_names`` maps each
        cluster id to its space-separated name, and ``cluster_labels`` is the label
        array returned by ``KMeans.fit_predict``, in the same order as the rows of
        ``extracted_prompts``.
    """
    prompts = extracted_prompts['prompt'].tolist()

    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(prompts)

    # K-means Clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=0)
    cluster_labels = kmeans.fit_predict(embeddings)
    print(f"Cluster labels assigned: {np.unique(cluster_labels)}") # Debugging line to check cluster labels

    # Group prompts by their cluster labels
    clusters = [[] for _ in range(num_clusters)]
    for prompt, label in zip(prompts, cluster_labels):
        clusters[label].append(prompt)

    # Built once: the stop-word set does not change between prompts or clusters
    stop_words = set(stopwords.words('english'))

    cluster_names = {}
    for cluster_id, prompt_list in enumerate(clusters):
        # Keyword Frequency Analysis
        word_counts = Counter(
            word
            for prompt in prompt_list
            for word in nltk.word_tokenize(prompt.lower())
            if word.isalnum() and word not in stop_words
        )
        keywords_freq = [word for word, _ in word_counts.most_common(5)]  # Top 5 keywords

        # TF-IDF Analysis
        try:
            vectorizer = TfidfVectorizer()
            tfidf_matrix = vectorizer.fit_transform(prompt_list)
        except ValueError:
            # TfidfVectorizer raises ValueError when the prompts yield an empty
            # vocabulary (e.g. an empty cluster); fall back to frequency keywords only.
            tfidf_keywords = []
        else:
            tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).flatten()
            feature_names = vectorizer.get_feature_names_out()
            top_indices = tfidf_scores.argsort()[-5:][::-1]  # Top 5 TF-IDF keywords
            tfidf_keywords = [feature_names[i] for i in top_indices]

        # Combine keywords and generate name.
        # Interleave the two rankings and de-duplicate while preserving order, so the
        # name is deterministic and built from the highest-ranked keywords
        # (a set() has arbitrary iteration order).
        ranked_keywords = []
        for rank in range(max(len(keywords_freq), len(tfidf_keywords))):
            for keywords in (keywords_freq, tfidf_keywords):
                if rank < len(keywords):
                    ranked_keywords.append(keywords[rank])
        combined_keywords = list(dict.fromkeys(ranked_keywords))
        cluster_names[cluster_id] = " ".join(combined_keywords[:3])  # Use top 3 combined keywords

    return cluster_names, cluster_labels

def print_cluster_names_and_prompts(extracted_prompts, num_clusters=5):
    """Print the generated name of every cluster, then the prompts assigned to each one.

    Cluster names and labels come from ``name_clusters``; prompts are listed per
    cluster in their original row order.
    """
    names_by_cluster, labels = name_clusters(extracted_prompts, num_clusters)

    print("Cluster Names:")
    for cluster_id in names_by_cluster:
        name = names_by_cluster[cluster_id]
        print(f"Cluster {cluster_id}: {name}")

    print("\nPrompts per Cluster:")
    prompt_column = extracted_prompts['prompt']
    members = [[] for _ in range(num_clusters)]
    for position in range(len(labels)):
        members[labels[position]].append(prompt_column.iloc[position])

    for cluster_id in range(num_clusters):
        print(f"\nCluster {cluster_id} Prompts:")
        for prompt in members[cluster_id]:
            print(f"- {prompt}")

# Example usage: name 10 clusters of the extracted prompts and print the prompts in each
print_cluster_names_and_prompts(extracted_prompts, num_clusters=10)



