In [None]:
import pandas as pd
import numpy as np
import random
from ast import literal_eval
import os

# Load your survey data
survey_df = pd.read_csv('dropoffs.csv')

# Load your cleaned IMDB dataset
imdb_df = pd.read_csv('imdb_data.csv')

# Verify the datasets
print("Survey columns:", survey_df.columns.tolist()[:10])  # Show first 10 columns
print("IMDB columns:", imdb_df.columns.tolist())
print(f"Survey shape: {survey_df.shape}")
print(f"IMDB shape: {imdb_df.shape}")

# Fix column name mapping based on actual IMDB data
# The IMDB data uses 'duration' instead of 'runtime', 'imdb_score' instead of 'rating', etc.
genre_mapping = {
    'Action': 'Action',
    'Comedy': 'Comedy', 
    'Drama': 'Drama',
    'Horror': 'Horror',
    'Romance': 'Romance',
    'Science Fiction': 'Sci-Fi',
    'Science Fiction/Sci-Fi': 'Sci-Fi',
    'Documentary': 'Documentary',
    'Contemporary': 'Drama',
    'Historical': 'History',
    'Thriller': 'Thriller',
    'Family': 'Family',
    'Adventure': 'Adventure',
    'Fantasy': 'Fantasy'
}

# Drop reasons from your survey
drop_reasons = [
    'Boring or uninteresting plot',
    'Didn\'t like the characters or acting',
    'Too long or slow pacing',
    'Technical issues',
    'Distractions or interruptions',
    'Not in the right mood'
]

def select_movies(genres, imdb_df, n=5, finished=True, age_group=None):
    """Select movies based on genres and other factors"""
    valid_movies = imdb_df.copy()
    
    # Convert survey genres to IMDB genres
    imdb_genres = []
    for g in genres:
        mapped_genre = genre_mapping.get(g, None)
        if mapped_genre:
            imdb_genres.append(mapped_genre)
    
    # Filter by genre if we have valid genres
    if imdb_genres:
        genre_filter = valid_movies['genres'].str.contains('|'.join(imdb_genres), na=False, case=False)
        valid_movies = valid_movies[genre_filter]
    
    # Remove movies with missing essential data
    valid_movies = valid_movies.dropna(subset=['movie_title', 'title_year', 'imdb_score'])
    
    # Additional filters based on whether movies were finished
    if not finished:
        # For unfinished movies, prefer longer and lower rated
        if 'duration' in valid_movies.columns:
            valid_movies = valid_movies[valid_movies['duration'] > 120]  # longer movies
        valid_movies = valid_movies[valid_movies['imdb_score'] < 7.0]  # lower rated
    else:
        # For finished movies, prefer higher rated
        valid_movies = valid_movies[valid_movies['imdb_score'] > 6.5]  # higher rated
    
    # Age-based filtering
    if age_group and 'title_year' in valid_movies.columns:
        current_year = 2025
        if age_group in ['12-17', '18-24']:
            valid_movies = valid_movies[valid_movies['title_year'] > current_year - 15]
        elif age_group in ['25-34']:
            valid_movies = valid_movies[valid_movies['title_year'] > current_year - 25]
    
    # If not enough movies, relax criteria
    if len(valid_movies) < n:
        valid_movies = imdb_df.copy()
        valid_movies = valid_movies.dropna(subset=['movie_title', 'title_year', 'imdb_score'])
        if imdb_genres:
            genre_filter = valid_movies['genres'].str.contains('|'.join(imdb_genres), na=False, case=False)
            valid_movies = valid_movies[genre_filter]
    
    # If still not enough, use all movies
    if len(valid_movies) < n:
        valid_movies = imdb_df.dropna(subset=['movie_title', 'title_year', 'imdb_score'])
    
    # Randomly select n movies
    if len(valid_movies) >= n:
        selected = valid_movies.sample(n)
    else:
        selected = valid_movies.sample(len(valid_movies)) if len(valid_movies) > 0 else pd.DataFrame()
    
    return selected

def generate_drop_reasons(row, movie):
    """Generate realistic reasons for dropping a movie"""
    reasons = []
    
    # Base reasons on movie attributes
    if 'duration' in movie and pd.notna(movie['duration']) and movie['duration'] > 150:
        reasons.append('Too long or slow pacing')
    if 'imdb_score' in movie and pd.notna(movie['imdb_score']) and movie['imdb_score'] < 5.0:
        reasons.append('Boring or uninteresting plot')
    
    # Add some randomness
    if random.random() > 0.7:
        reasons.append(random.choice(['Distractions or interruptions', 'Not in the right mood']))
    
    # Ensure at least one reason
    if not reasons:
        reasons.append(random.choice(drop_reasons))
    
    return '|'.join(set(reasons))

# Create a copy to avoid modifying original data
enhanced_survey_df = survey_df.copy()

# Add new columns for finished movies
for i in range(1, 6):
    enhanced_survey_df[f'finished_movie_{i}'] = ""
    enhanced_survey_df[f'finished_movie_{i}_year'] = np.nan

# Add new columns for unfinished movies and reasons
for i in range(1, 6):
    enhanced_survey_df[f'unfinished_movie_{i}'] = ""
    enhanced_survey_df[f'unfinished_movie_{i}_year'] = np.nan
    enhanced_survey_df[f'unfinished_movie_{i}_reasons'] = ""

# Find the column that contains preferred genres
genre_column = None
for col in survey_df.columns:
    if 'genre' in col.lower() and 'enjoy' in col.lower():
        genre_column = col
        break

print(f"Using genre column: {genre_column}")

# Generate synthetic data for each respondent
for idx, row in enhanced_survey_df.iterrows():
    try:
        # Get user preferences
        preferred_genres = []
        if genre_column and pd.notna(row[genre_column]):
            # Parse the genres from the survey response
            genre_text = str(row[genre_column])
            # Split by comma and clean up
            raw_genres = [g.strip() for g in genre_text.split(',')]
            preferred_genres = [g for g in raw_genres if g in genre_mapping.keys()]
        
        # If no preferred genres found, use random selection
        if not preferred_genres:
            available_genres = list(genre_mapping.keys())
            preferred_genres = random.sample(available_genres, min(3, len(available_genres)))
        
        # Get age group
        age_group = row.get('What is your age group?', None)
        
        print(f"Processing row {idx}: genres={preferred_genres}, age={age_group}")
        
        # Generate finished movies
        finished_movies = select_movies(preferred_genres, imdb_df, n=5, finished=True, age_group=age_group)
        for i, (_, movie) in enumerate(finished_movies.iterrows(), 1):
            enhanced_survey_df.at[idx, f'finished_movie_{i}'] = movie.get('movie_title', 'Unknown')
            if 'title_year' in movie:
                enhanced_survey_df.at[idx, f'finished_movie_{i}_year'] = movie['title_year']
        
        # Fill remaining finished movie slots with "Unknown"
        for i in range(len(finished_movies)+1, 6):
            enhanced_survey_df.at[idx, f'finished_movie_{i}'] = "Unknown"
        
        # Generate unfinished movies
        unfinished_movies = select_movies(preferred_genres, imdb_df, n=5, finished=False, age_group=age_group)
        for i, (_, movie) in enumerate(unfinished_movies.iterrows(), 1):
            enhanced_survey_df.at[idx, f'unfinished_movie_{i}'] = movie.get('movie_title', 'Unknown')
            if 'title_year' in movie:
                enhanced_survey_df.at[idx, f'unfinished_movie_{i}_year'] = movie['title_year']
            enhanced_survey_df.at[idx, f'unfinished_movie_{i}_reasons'] = generate_drop_reasons(row, movie)
        
        # Fill remaining unfinished movie slots with "Unknown"
        for i in range(len(unfinished_movies)+1, 6):
            enhanced_survey_df.at[idx, f'unfinished_movie_{i}'] = "Unknown"
            enhanced_survey_df.at[idx, f'unfinished_movie_{i}_reasons'] = random.choice(drop_reasons)
    
    except Exception as e:
        print(f"Error processing row {idx}: {str(e)}")
        # Fill with default values on error
        for i in range(1, 6):
            enhanced_survey_df.at[idx, f'finished_movie_{i}'] = "Unknown"
            enhanced_survey_df.at[idx, f'unfinished_movie_{i}'] = "Unknown"
            enhanced_survey_df.at[idx, f'unfinished_movie_{i}_reasons'] = random.choice(drop_reasons)
        continue

# Save the enhanced dataset with a unique filename to avoid permission issues
output_filename = 'dropoffs_seeded.csv'
enhanced_survey_df.to_csv(output_filename, index=False)

print(f"Synthetic movie data generation complete!")
print(f"Added movie data for {len(enhanced_survey_df)} respondents.")
print(f"Output saved to: {output_filename}")

# Display sample results
print("\nSample of generated data:")
sample_cols = ['finished_movie_1', 'finished_movie_2', 'unfinished_movie_1', 'unfinished_movie_1_reasons']
if all(col in enhanced_survey_df.columns for col in sample_cols):
    print(enhanced_survey_df[sample_cols].head())

Survey columns: ['Timestamp', 'What is your age group?', 'What is your gender?', 'What is the highest level of education you’ve completed? ', 'How often do you watch movies?', 'Which genres do you enjoy watching the most?  (Select up to 3)', 'Can you recall a movie you started but did not finish? (Optional: name it)', 'How do you usually discover movies you decide to watch? (Select all that apply)', "What is the movie's genre?", "What is the movie's runtime?"]
IMDB columns: ['color', 'director_name', 'num_critic_for_reviews', 'duration', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name', 'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name', 'movie_title', 'num_voted_users', 'cast_total_facebook_likes', 'actor_3_name', 'facenumber_in_poster', 'plot_keywords', 'movie_imdb_link', 'num_user_for_reviews', 'language', 'country', 'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score', 'aspect_ratio', 'movie_facebook_likes', 'profit', 'roi', '

In [7]:
# Simplified version - just 5 finished movies and 5 unfinished movies with reasons
import pandas as pd
import numpy as np
import random

# Load data
survey_df = pd.read_csv('dropoffs.csv')
imdb_df = pd.read_csv('imdb_data.csv')

# Simplified genre mapping
genre_mapping = {
    'Action': 'Action',
    'Comedy': 'Comedy', 
    'Drama': 'Drama',
    'Horror': 'Horror',
    'Romance': 'Romance',
    'Science Fiction': 'Sci-Fi',
    'Science Fiction/Sci-Fi': 'Sci-Fi',
    'Documentary': 'Documentary',
    'Thriller': 'Thriller',
    'Family': 'Family',
    'Adventure': 'Adventure',
    'Fantasy': 'Fantasy'
}

# Drop reasons (based on your survey data)
drop_reasons = [
    'Boring/uninteresting plot',
    'Poor acting or characters', 
    'Too long/slow pacing',
    'Technical issues (buffering, audio, etc.)',
    'Distractions or interruptions',
    'Not in the right mood'
]

def get_random_movies(imdb_df, n=5, finished=True):
    """Get random movies from IMDB dataset"""
    # Clean data - remove movies with missing titles
    clean_movies = imdb_df.dropna(subset=['movie_title']).copy()
    
    if finished:
        # For finished movies, prefer higher rated ones
        if len(clean_movies[clean_movies['imdb_score'] > 6.5]) >= n:
            clean_movies = clean_movies[clean_movies['imdb_score'] > 6.5]
    else:
        # For unfinished movies, include all but maybe prefer lower rated
        pass
    
    # Sample n movies
    if len(clean_movies) >= n:
        return clean_movies.sample(n)['movie_title'].tolist()
    else:
        return clean_movies['movie_title'].tolist()

# Create enhanced survey dataframe
enhanced_df = survey_df.copy()

# Add columns for finished movies
for i in range(1, 6):
    enhanced_df[f'finished_movie_{i}'] = ""

# Add columns for unfinished movies and their drop reasons
for i in range(1, 6):
    enhanced_df[f'unfinished_movie_{i}'] = ""
    enhanced_df[f'unfinished_movie_{i}_reason'] = ""

print(f"Processing {len(enhanced_df)} survey responses...")

# Generate movies for each respondent
for idx in range(len(enhanced_df)):
    # Generate 5 finished movies
    finished_movies = get_random_movies(imdb_df, n=5, finished=True)
    for i, movie in enumerate(finished_movies, 1):
        enhanced_df.at[idx, f'finished_movie_{i}'] = movie
    
    # Fill remaining slots if less than 5 movies found
    for i in range(len(finished_movies)+1, 6):
        enhanced_df.at[idx, f'finished_movie_{i}'] = "Unknown Movie"
    
    # Generate 5 unfinished movies with reasons
    unfinished_movies = get_random_movies(imdb_df, n=5, finished=False)
    for i, movie in enumerate(unfinished_movies, 1):
        enhanced_df.at[idx, f'unfinished_movie_{i}'] = movie
        enhanced_df.at[idx, f'unfinished_movie_{i}_reason'] = random.choice(drop_reasons)
    
    # Fill remaining slots if less than 5 movies found
    for i in range(len(unfinished_movies)+1, 6):
        enhanced_df.at[idx, f'unfinished_movie_{i}'] = "Unknown Movie"
        enhanced_df.at[idx, f'unfinished_movie_{i}_reason'] = random.choice(drop_reasons)

# Save the results
output_file = 'dropoffs_with_movies_simple.csv'
enhanced_df.to_csv(output_file, index=False)

print(f"✅ Successfully generated synthetic movie data!")
print(f"📁 Saved to: {output_file}")
print(f"👥 Processed {len(enhanced_df)} respondents")

# Show sample of what was generated
print("\n📋 Sample of generated data:")
sample_columns = [
    'finished_movie_1', 'finished_movie_2', 
    'unfinished_movie_1', 'unfinished_movie_1_reason',
    'unfinished_movie_2', 'unfinished_movie_2_reason'
]

if all(col in enhanced_df.columns for col in sample_columns):
    sample_df = enhanced_df[sample_columns].head(3)
    for i, row in sample_df.iterrows():
        print(f"\nRespondent {i+1}:")
        print(f"  Finished: {row['finished_movie_1']}, {row['finished_movie_2']}")
        print(f"  Dropped: {row['unfinished_movie_1']} (Reason: {row['unfinished_movie_1_reason']})")
        print(f"           {row['unfinished_movie_2']} (Reason: {row['unfinished_movie_2_reason']})")

Processing 78 survey responses...
✅ Successfully generated synthetic movie data!
📁 Saved to: dropoffs_with_movies_simple.csv
👥 Processed 78 respondents

📋 Sample of generated data:

Respondent 1:
  Finished: Une Femme Mariée, Galaxy Quest
  Dropped: Payback (Reason: Distractions or interruptions)
           Oceans (Reason: Distractions or interruptions)

Respondent 2:
  Finished: Alive, Zodiac
  Dropped: The Young Victoria (Reason: Poor acting or characters)
           My Dog Skip (Reason: Not in the right mood)

Respondent 3:
  Finished: My Sister's Keeper, Braveheart
  Dropped: The Protector (Reason: Distractions or interruptions)
           Doomsday (Reason: Distractions or interruptions)
✅ Successfully generated synthetic movie data!
📁 Saved to: dropoffs_with_movies_simple.csv
👥 Processed 78 respondents

📋 Sample of generated data:

Respondent 1:
  Finished: Une Femme Mariée, Galaxy Quest
  Dropped: Payback (Reason: Distractions or interruptions)
           Oceans (Reason: Distract

In [8]:
# Verify the structure of the generated data
verification_df = pd.read_csv('dropoffs_with_movies_simple.csv')

print("📊 Dataset Information:")
print(f"   Total rows: {len(verification_df)}")
print(f"   Total columns: {len(verification_df.columns)}")

print("\n📝 New columns added:")
movie_columns = [col for col in verification_df.columns if 'movie' in col.lower()]
for col in movie_columns:
    print(f"   • {col}")

print(f"\n🎬 Sample data structure:")
print("Each respondent now has:")
print("   • 5 finished movies (finished_movie_1 through finished_movie_5)")
print("   • 5 unfinished movies with reasons (unfinished_movie_1 through unfinished_movie_5)")
print("   • Each unfinished movie has a corresponding reason column")

# Show one complete record
print(f"\n📋 Complete example for respondent 1:")
sample_record = verification_df.iloc[0]
print("\nFinished movies:")
for i in range(1, 6):
    movie = sample_record[f'finished_movie_{i}']
    print(f"   {i}. {movie}")

print("\nUnfinished movies with reasons:")
for i in range(1, 6):
    movie = sample_record[f'unfinished_movie_{i}']
    reason = sample_record[f'unfinished_movie_{i}_reason']
    print(f"   {i}. {movie} → {reason}")

📊 Dataset Information:
   Total rows: 78
   Total columns: 67

📝 New columns added:
   • How often do you watch movies?
   • Can you recall a movie you started but did not finish? (Optional: name it)
   • How do you usually discover movies you decide to watch? (Select all that apply)
   • What is the movie's genre?
   • What is the movie's runtime?
   • Where do you usually watch movies? (Select all that apply)
   • Who do you usually watch movies with?
   • What is your typical mood before watching a movie?
   • Why do you usually choose to watch movies? (Select all that apply)
   • Have you ever stopped watching a movie before finishing it?
   • At what point do you typically stop watching movies you drop?
   • How many times did you pause the movie?
   • In general, what are the main reasons you stop watching movies before finishing? (Select all that apply)
   • When you stop watching a movie, how often do you go back to finish it later?
   • Have you ever started watching a movie b

In [9]:
# Enhanced version with multiple reasons per movie (for checkbox questions)
import pandas as pd
import numpy as np
import random

# Load data
survey_df = pd.read_csv('dropoffs.csv')
imdb_df = pd.read_csv('imdb_data.csv')

# Expanded drop reasons for checkbox questions
drop_reasons = [
    'Boring/uninteresting plot',
    'Poor acting or characters', 
    'Too long/slow pacing',
    'Technical issues (buffering, audio, etc.)',
    'Distractions or interruptions',
    'Not in the right mood',
    'Confusing storyline',
    'Poor dialogue',
    'Uncomfortable content',
    'Already seen similar movies',
    'Lost interest in genre',
    'Low production quality'
]

def get_random_movies(imdb_df, n=5, finished=True):
    """Get random movies from IMDB dataset"""
    clean_movies = imdb_df.dropna(subset=['movie_title']).copy()
    
    if finished:
        # For finished movies, prefer higher rated ones
        if len(clean_movies[clean_movies['imdb_score'] > 6.5]) >= n:
            clean_movies = clean_movies[clean_movies['imdb_score'] > 6.5]
    
    if len(clean_movies) >= n:
        return clean_movies.sample(n)['movie_title'].tolist()
    else:
        return clean_movies['movie_title'].tolist()

def generate_multiple_reasons(min_reasons=3, max_reasons=5):
    """Generate 3-5 reasons for dropping a movie (for checkbox questions)"""
    num_reasons = random.randint(min_reasons, max_reasons)
    selected_reasons = random.sample(drop_reasons, min(num_reasons, len(drop_reasons)))
    return '|'.join(selected_reasons)  # Use pipe separator for multiple reasons

# Create enhanced survey dataframe for checkbox version
checkbox_df = survey_df.copy()

# Add columns for finished movies
for i in range(1, 6):
    checkbox_df[f'finished_movie_{i}'] = ""

# Add columns for unfinished movies and their multiple drop reasons
for i in range(1, 6):
    checkbox_df[f'unfinished_movie_{i}'] = ""
    checkbox_df[f'unfinished_movie_{i}_reasons'] = ""  # Note: plural "reasons"

print(f"🎬 Generating movies with multiple reasons for {len(checkbox_df)} respondents...")

# Generate movies for each respondent
for idx in range(len(checkbox_df)):
    # Generate 5 finished movies
    finished_movies = get_random_movies(imdb_df, n=5, finished=True)
    for i, movie in enumerate(finished_movies, 1):
        checkbox_df.at[idx, f'finished_movie_{i}'] = movie
    
    # Fill remaining slots if less than 5 movies found
    for i in range(len(finished_movies)+1, 6):
        checkbox_df.at[idx, f'finished_movie_{i}'] = "Unknown Movie"
    
    # Generate 5 unfinished movies with MULTIPLE reasons each
    unfinished_movies = get_random_movies(imdb_df, n=5, finished=False)
    for i, movie in enumerate(unfinished_movies, 1):
        checkbox_df.at[idx, f'unfinished_movie_{i}'] = movie
        checkbox_df.at[idx, f'unfinished_movie_{i}_reasons'] = generate_multiple_reasons()
    
    # Fill remaining slots if less than 5 movies found
    for i in range(len(unfinished_movies)+1, 6):
        checkbox_df.at[idx, f'unfinished_movie_{i}'] = "Unknown Movie"
        checkbox_df.at[idx, f'unfinished_movie_{i}_reasons'] = generate_multiple_reasons()

# Save the results
output_file = 'dropoffs_with_movies_checkbox.csv'
checkbox_df.to_csv(output_file, index=False)

print(f"✅ Successfully generated movie data with multiple reasons!")
print(f"📁 Saved to: {output_file}")
print(f"👥 Processed {len(checkbox_df)} respondents")

# Show sample of what was generated
print("\n📋 Sample of generated data (checkbox format):")
sample_columns = [
    'finished_movie_1', 'finished_movie_2', 
    'unfinished_movie_1', 'unfinished_movie_1_reasons',
    'unfinished_movie_2', 'unfinished_movie_2_reasons'
]

if all(col in checkbox_df.columns for col in sample_columns):
    sample_df = checkbox_df[sample_columns].head(3)
    for i, row in sample_df.iterrows():
        print(f"\nRespondent {i+1}:")
        print(f"  Finished: {row['finished_movie_1']}, {row['finished_movie_2']}")
        
        # Show multiple reasons for first unfinished movie
        reasons1 = row['unfinished_movie_1_reasons'].split('|')
        print(f"  Dropped: {row['unfinished_movie_1']}")
        print(f"    Reasons: {', '.join(reasons1)}")
        
        # Show multiple reasons for second unfinished movie  
        reasons2 = row['unfinished_movie_2_reasons'].split('|')
        print(f"  Dropped: {row['unfinished_movie_2']}")
        print(f"    Reasons: {', '.join(reasons2)}")

print(f"\n📊 Each unfinished movie now has 3-5 reasons (perfect for checkbox questions!)")
print(f"💡 Reasons are separated by '|' and can be split for individual checkboxes")

🎬 Generating movies with multiple reasons for 78 respondents...
✅ Successfully generated movie data with multiple reasons!
📁 Saved to: dropoffs_with_movies_checkbox.csv
👥 Processed 78 respondents

📋 Sample of generated data (checkbox format):

Respondent 1:
  Finished: Live and Let Die, Richard III
  Dropped: Girls Gone Dead
    Reasons: Poor acting or characters, Lost interest in genre, Not in the right mood, Confusing storyline
  Dropped: Straight A's
    Reasons: Not in the right mood, Poor acting or characters, Lost interest in genre, Already seen similar movies, Low production quality

Respondent 2:
  Finished: Joy Ride, Who Killed the Electric Car?
  Dropped: Titan A.E.
    Reasons: Poor acting or characters, Already seen similar movies, Distractions or interruptions, Confusing storyline
  Dropped: Premonition
    Reasons: Technical issues (buffering, audio, etc.), Not in the right mood, Poor dialogue

Respondent 3:
  Finished: Mississippi Mermaid, Kung Fu Panda 2
  Dropped: Reme

In [None]:
# Verify the checkbox format and show survey structure
checkbox_verification = pd.read_csv('dropoffs_with_movies_checkbox.csv')

print("📋 Survey Question Structure Example:")
print("="*60)

# Show how this would look in a survey for one respondent
sample_respondent = checkbox_verification.iloc[0]

print("PART 1: Movies You Finished Watching")
print("-" * 40)
for i in range(1, 6):
    movie = sample_respondent[f'finished_movie_{i}']
    print(f"  {i}. {movie}")

print("\nPART 2: Movies You Started But Didn't Finish")
print("-" * 50)
for i in range(1, 6):
    movie = sample_respondent[f'unfinished_movie_{i}']
    reasons_string = sample_respondent[f'unfinished_movie_{i}_reasons']
    reasons_list = reasons_string.split('|')
    
    print(f"\n  Movie {i}: {movie}")
    print(f"  Why didn't you finish this movie? (Select all that apply)")
    for j, reason in enumerate(reasons_list, 1):
        print(f"    ☑️ {reason}")

print("\n" + "="*60)
print("📊 Dataset Summary:")
print(f"   • Total respondents: {len(checkbox_verification)}")
print(f"   • Movies per person: 5 finished + 5 unfinished")
print(f"   • Reasons per unfinished movie: 3-5 (perfect for checkboxes)")

print("\n🔧 Technical Details:")
print(f"   • Reasons are stored as pipe-separated strings (|)")
print(f"   • Split on '|' to get individual checkbox options")
print(f"   • Each movie has {3}-{5} realistic drop reasons")

# Show the distribution of number of reasons
print("\n📈 Reason Count Distribution:")
reason_counts = []
for i in range(1, 6):
    col_name = f'unfinished_movie_{i}_reasons'
    counts = checkbox_verification[col_name].apply(lambda x: len(x.split('|')))
    reason_counts.extend(counts.tolist())

from collections import Counter
distribution = Counter(reason_counts)
for num_reasons, count in sorted(distribution.items()):
    print(f"   • {num_reasons} reasons: {count} instances")