In [10]:
import pandas as pd
import numpy as np

# Load the seeded responses data
print("📊 Loading responses_partially_cleaned.csv...")
df = pd.read_csv('responses_partially_cleaned.csv')

print(f"Dataset shape: {df.shape}")
print(f"Total columns: {len(df.columns)}")

# Show basic info about the dataset
print("\n🔍 Dataset Overview:")
print(f"Number of respondents: {len(df)}")

# Display column names with numbering
print("\n📋 All columns in the dataset:")
for i, col in enumerate(df.columns, 1):
    print(f"{i:2d}. {col}")

# Show data types and missing values
print("\n📈 Data Types Summary:")
data_info = pd.DataFrame({
    'Column': df.columns,
    'Data_Type': df.dtypes,
    'Non_Null_Count': df.count(),
    'Null_Count': df.isnull().sum(),
    'Unique_Values': [df[col].nunique() for col in df.columns]
})

print(data_info.head(10).to_string(index=False))

📊 Loading responses_partially_cleaned.csv...
Dataset shape: (77, 40)
Total columns: 40

🔍 Dataset Overview:
Number of respondents: 77

📋 All columns in the dataset:
 1. What is your age group?
 2. What is your gender?
 3. What is the highest level of education you’ve completed? 
 4. How often do you watch movies?
 5. Which genres do you enjoy watching the most?  (Select up to 3)
 6. Have you ever started watching a movie but did not finish it?
 7. How often do you stop watching movies before finishing them?
 8. How often do you return to watching a movie you didn’t finish?
 9. Which genres do you find yourself stopping more often before finishing? (Select all that apply)
10. How do you usually discover movies you decide to watch? (Select all that apply)
11. What device do you usually watch movies on?
12. Thinking about movies you have started but did not finish, at what point do you usually stop watching?
13. How often do you typically pause or stop the movie during viewing?
14. Why do

In [11]:
# Data Preprocessing and Discretization Functions

def discretize_categorical_responses(df):
    """
    Discretize categorical survey responses into numerical codes
    """
    processed_df = df.copy()
    discretized_columns = {}
    
    print("🔄 Starting discretization process...")
    
    # 1. Age Group Discretization
    age_col = None
    for col in df.columns:
        if 'age' in col.lower():
            age_col = col
            break
    
    if age_col and age_col in df.columns:
        age_mapping = {
            '18-24': 1,
            '25-34': 2, 
            '35-44': 3,
            '45-54': 4,
            '55-64': 5,
            '65+': 6
        }
        processed_df[f'{age_col}_discretized'] = df[age_col].map(age_mapping)
        discretized_columns['age'] = age_mapping
        print(f"✅ Discretized {age_col}")
    
    # 2. Gender Discretization
    gender_col = None
    for col in df.columns:
        if 'gender' in col.lower():
            gender_col = col
            break
    
    if gender_col and gender_col in df.columns:
        gender_mapping = {
            'Female': 1,
            'Male': 2,
            'Non-binary': 3,
            'Prefer not to say': 4,
            'Other': 5
        }
        processed_df[f'{gender_col}_discretized'] = df[gender_col].map(gender_mapping)
        discretized_columns['gender'] = gender_mapping
        print(f"✅ Discretized {gender_col}")
    
    # 3. Education Level Discretization
    education_col = None
    for col in df.columns:
        if 'education' in col.lower():
            education_col = col
            break
    
    if education_col and education_col in df.columns:
        education_mapping = {
            'High school or below': 1,
            'Some college': 2,
            'Undergraduate': 3,
            "Bachelor's Degree": 4,
            'Graduate Degree': 5,
            'PhD or higher': 6
        }
        processed_df[f'{education_col}_discretized'] = df[education_col].map(education_mapping)
        discretized_columns['education'] = education_mapping
        print(f"✅ Discretized {education_col}")
    
    # 4. Movie Watching Frequency Discretization
    frequency_col = None
    for col in df.columns:
        if 'often' in col.lower() and 'watch' in col.lower():
            frequency_col = col
            break
    
    if frequency_col and frequency_col in df.columns:
        frequency_mapping = {
            'Rarely (about once per month)': 1,
            'A few times a month (2-3 times per month)': 2,
            'Once a week': 3,
            'Several times a week (3–6 times a week)': 4,
            'Daily or almost daily': 5
        }
        processed_df[f'{frequency_col}_discretized'] = df[frequency_col].map(frequency_mapping)
        discretized_columns['frequency'] = frequency_mapping
        print(f"✅ Discretized {frequency_col}")
    
    return processed_df, discretized_columns

def process_genre_preferences(df):
    """
    Process genre preferences into binary features
    """
    processed_df = df.copy()
    
    # Find the genre preference column
    genre_col = None
    for col in df.columns:
        if 'genre' in col.lower() and 'enjoy' in col.lower():
            genre_col = col
            break
    
    if genre_col and genre_col in df.columns:
        print(f"🎬 Processing genre preferences from: {genre_col}")
        
        # Define standard genres
        genres = [
            'Action', 'Comedy', 'Drama', 'Horror', 'Romance', 
            'Science Fiction/Sci-Fi', 'Documentary', 'Thriller', 
            'Family', 'Adventure', 'Fantasy', 'Historical'
        ]
        
        # Create binary columns for each genre
        for genre in genres:
            genre_clean = genre.replace('/', '_').replace(' ', '_').lower()
            processed_df[f'likes_{genre_clean}'] = 0
        
        # Fill binary features based on responses
        for idx, response in df[genre_col].items():
            if pd.notna(response):
                response_str = str(response)
                for genre in genres:
                    if genre.lower() in response_str.lower():
                        genre_clean = genre.replace('/', '_').replace(' ', '_').lower()
                        processed_df.at[idx, f'likes_{genre_clean}'] = 1
        
        print(f"✅ Created {len(genres)} genre binary features")
    
    return processed_df

def process_movie_completion_behavior(df):
    """
    Process movie completion and dropout behavior
    """
    processed_df = df.copy()
    
    # Find completion behavior columns
    completion_cols = [col for col in df.columns if 'stop' in col.lower() or 'finish' in col.lower()]
    
    for col in completion_cols:
        if 'yes' in str(df[col].iloc[0]).lower() or 'no' in str(df[col].iloc[0]).lower():
            # Binary yes/no responses
            binary_mapping = {'Yes': 1, 'No': 0}
            processed_df[f'{col}_binary'] = df[col].map(binary_mapping)
            print(f"✅ Converted {col} to binary")
    
    # Process frequency of stopping
    stop_frequency_col = None
    for col in df.columns:
        if 'often' in col.lower() and 'stop' in col.lower():
            stop_frequency_col = col
            break
    
    if stop_frequency_col and stop_frequency_col in df.columns:
        frequency_mapping = {
            'Never': 0,
            'Rarely': 1,
            'Sometimes': 2,
            'Often': 3,
            'Always': 4
        }
        processed_df[f'{stop_frequency_col}_discretized'] = df[stop_frequency_col].map(frequency_mapping)
        print(f"✅ Discretized {stop_frequency_col}")
    
    return processed_df

def process_drop_reasons(df):
    """
    Process reasons for dropping movies into binary features
    """
    processed_df = df.copy()
    
    # Find drop reason columns
    reason_cols = [col for col in df.columns if 'reason' in col.lower()]
    
    # Standard drop reasons
    drop_reasons = [
        'Boring/uninteresting plot',
        'Poor acting or characters',
        'Too long/slow pacing', 
        'Technical issues',
        'Distractions or interruptions',
        'Not in the right mood'
    ]
    
    # Create binary columns for each reason
    for reason in drop_reasons:
        reason_clean = reason.replace('/', '_').replace(' ', '_').lower()
        processed_df[f'drops_due_to_{reason_clean}'] = 0
    
    # Process each reason column
    for col in reason_cols:
        if col in df.columns:
            for idx, response in df[col].items():
                if pd.notna(response):
                    response_str = str(response)
                    for reason in drop_reasons:
                        if any(keyword in response_str.lower() for keyword in reason.lower().split()):
                            reason_clean = reason.replace('/', '_').replace(' ', '_').lower()
                            processed_df.at[idx, f'drops_due_to_{reason_clean}'] = 1
    
    print(f"✅ Created {len(drop_reasons)} drop reason binary features")
    
    return processed_df

# Apply all preprocessing functions
print("🚀 Starting comprehensive data preprocessing...")
processed_df, discretization_mappings = discretize_categorical_responses(df)
processed_df = process_genre_preferences(processed_df)
processed_df = process_movie_completion_behavior(processed_df)
processed_df = process_drop_reasons(processed_df)

print(f"\n✅ Preprocessing completed!")
print(f"Original shape: {df.shape}")
print(f"Processed shape: {processed_df.shape}")
print(f"New features added: {processed_df.shape[1] - df.shape[1]}")

🚀 Starting comprehensive data preprocessing...
🔄 Starting discretization process...
✅ Discretized What is your age group?
✅ Discretized What is your gender?
✅ Discretized What is the highest level of education you’ve completed? 
✅ Discretized How often do you watch movies?
🎬 Processing genre preferences from: Which genres do you enjoy watching the most?  (Select up to 3)
✅ Created 12 genre binary features
✅ Converted Have you ever started watching a movie but did not finish it? to binary
✅ Converted unfinished_movie_2_reasons to binary
✅ Converted unfinished_movie_4_reasons to binary
✅ Discretized How often do you stop watching movies before finishing them?
✅ Created 6 drop reason binary features

✅ Preprocessing completed!
Original shape: (77, 40)
Processed shape: (77, 66)
New features added: 26


In [12]:
# Feature Engineering and Dataset Preparation

def create_aggregate_features(processed_df):
    """
    Create aggregate features from the processed data
    """
    df_with_features = processed_df.copy()
    
    # 1. Genre preference count
    genre_cols = [col for col in processed_df.columns if 'likes_' in col]
    df_with_features['total_liked_genres'] = processed_df[genre_cols].sum(axis=1)
    
    # 2. Drop reason count
    reason_cols = [col for col in processed_df.columns if 'drops_due_to_' in col]
    df_with_features['total_drop_reasons'] = processed_df[reason_cols].sum(axis=1)
    
    # 3. Create user engagement score
    discretized_cols = [col for col in processed_df.columns if '_discretized' in col]
    frequency_col = None
    for col in discretized_cols:
        if 'often' in col.lower() and 'watch' in col.lower():
            frequency_col = col
            break
    
    if frequency_col:
        df_with_features['engagement_score'] = (
            df_with_features[frequency_col] * df_with_features['total_liked_genres']
        )
    else:
        df_with_features['engagement_score'] = df_with_features['total_liked_genres']
    
    # 4. Create dropout tendency score
    if 'total_drop_reasons' in df_with_features.columns:
        df_with_features['dropout_tendency'] = df_with_features['total_drop_reasons'] / df_with_features['total_liked_genres'].replace(0, 1)
    
    print(f"✅ Created aggregate features:")
    print(f"   • total_liked_genres")
    print(f"   • total_drop_reasons")
    print(f"   • engagement_score")
    print(f"   • dropout_tendency")
    
    return df_with_features

def prepare_ml_features(processed_df):
    """
    Prepare final feature set for machine learning
    """
    # Identify different types of features
    discretized_features = [col for col in processed_df.columns if '_discretized' in col]
    binary_features = [col for col in processed_df.columns if '_binary' in col]
    genre_features = [col for col in processed_df.columns if 'likes_' in col]
    reason_features = [col for col in processed_df.columns if 'drops_due_to_' in col]
    aggregate_features = ['total_liked_genres', 'total_drop_reasons', 'engagement_score', 'dropout_tendency']
    
    # Combine all ML-ready features
    ml_features = discretized_features + binary_features + genre_features + reason_features + aggregate_features
    ml_features = [col for col in ml_features if col in processed_df.columns]
    
    print(f"📊 ML Feature Summary:")
    print(f"   Discretized features: {len(discretized_features)}")
    print(f"   Binary features: {len(binary_features)}")
    print(f"   Genre features: {len(genre_features)}")
    print(f"   Drop reason features: {len(reason_features)}")
    print(f"   Aggregate features: {len([f for f in aggregate_features if f in processed_df.columns])}")
    print(f"   Total ML features: {len(ml_features)}")
    
    return ml_features

def create_target_variables(processed_df):
    """
    Create target variables for different prediction tasks
    """
    df_with_targets = processed_df.copy()
    
    # Target 1: High engagement user (watches frequently and likes many genres)
    engagement_threshold = df_with_targets['engagement_score'].median()
    df_with_targets['high_engagement_user'] = (df_with_targets['engagement_score'] > engagement_threshold).astype(int)
    
    # Target 2: High dropout tendency
    if 'dropout_tendency' in df_with_targets.columns:
        dropout_threshold = df_with_targets['dropout_tendency'].median()
        df_with_targets['high_dropout_user'] = (df_with_targets['dropout_tendency'] > dropout_threshold).astype(int)
    
    # Target 3: Movie completion likelihood (based on stopping behavior)
    stop_cols = [col for col in processed_df.columns if 'stop' in col.lower() and '_binary' in col]
    if stop_cols:
        # If user frequently stops movies, they're likely to drop
        df_with_targets['likely_to_complete'] = (1 - df_with_targets[stop_cols[0]]).astype(int)
    
    target_variables = ['high_engagement_user', 'high_dropout_user', 'likely_to_complete']
    target_variables = [col for col in target_variables if col in df_with_targets.columns]
    
    print(f"🎯 Created target variables: {target_variables}")
    
    return df_with_targets, target_variables

# Apply feature engineering
print("🔧 Creating aggregate features...")
processed_df = create_aggregate_features(processed_df)

print("\n🎯 Creating target variables...")
final_df, target_variables = create_target_variables(processed_df)

print("\n📋 Preparing ML features...")
ml_feature_list = prepare_ml_features(final_df)

# Show sample of final processed data
print("\n📊 Sample of processed features:")
sample_features = ml_feature_list[:10] if len(ml_feature_list) >= 10 else ml_feature_list
display(final_df[sample_features].head())

🔧 Creating aggregate features...
✅ Created aggregate features:
   • total_liked_genres
   • total_drop_reasons
   • engagement_score
   • dropout_tendency

🎯 Creating target variables...
🎯 Created target variables: ['high_engagement_user', 'high_dropout_user']

📋 Preparing ML features...
📊 ML Feature Summary:
   Discretized features: 5
   Binary features: 3
   Genre features: 12
   Drop reason features: 6
   Aggregate features: 4
   Total ML features: 30

📊 Sample of processed features:


Unnamed: 0,What is your age group?_discretized,What is your gender?_discretized,What is the highest level of education you’ve completed? _discretized,How often do you watch movies?_discretized,How often do you stop watching movies before finishing them?_discretized,Have you ever started watching a movie but did not finish it?_binary,unfinished_movie_2_reasons_binary,unfinished_movie_4_reasons_binary,likes_action,likes_comedy
0,2,1.0,4,2.0,2,1,,,1,1
1,1,2.0,1,2.0,3,1,,,0,0
2,1,2.0,3,2.0,1,1,,,0,0
3,1,1.0,3,1.0,3,1,,,0,1
4,1,2.0,3,1.0,2,1,,,1,1


In [13]:
# Data Quality Checks and Validation

def perform_data_quality_checks(final_df, ml_features, target_variables):
    """
    Perform comprehensive data quality checks
    """
    print("🔍 Performing Data Quality Checks...")
    
    # 1. Check for missing values
    print(f"\n📈 Missing Values Analysis:")
    missing_counts = final_df[ml_features].isnull().sum()
    if missing_counts.sum() > 0:
        print("Features with missing values:")
        for feature, count in missing_counts[missing_counts > 0].items():
            print(f"   • {feature}: {count} missing ({count/len(final_df)*100:.1f}%)")
    else:
        print("✅ No missing values in ML features")
    
    # 2. Check target variable distribution
    print(f"\n🎯 Target Variable Distributions:")
    for target in target_variables:
        if target in final_df.columns:
            value_counts = final_df[target].value_counts()
            print(f"{target}:")
            for value, count in value_counts.items():
                print(f"   {value}: {count} ({count/len(final_df)*100:.1f}%)")
    
    # 3. Check feature distributions
    print(f"\n📊 Feature Distribution Summary:")
    print(f"Total features: {len(ml_features)}")
    
    # Count feature types
    binary_features = [f for f in ml_features if final_df[f].nunique() <= 2]
    categorical_features = [f for f in ml_features if 3 <= final_df[f].nunique() <= 10]
    continuous_features = [f for f in ml_features if final_df[f].nunique() > 10]
    
    print(f"Binary features: {len(binary_features)}")
    print(f"Categorical features: {len(categorical_features)}")
    print(f"Continuous features: {len(continuous_features)}")
    
    # 4. Check for potential issues
    print(f"\n⚠️  Potential Issues Check:")
    
    # Check for features with zero variance
    zero_variance = [f for f in ml_features if final_df[f].var() == 0]
    if zero_variance:
        print(f"Features with zero variance: {zero_variance}")
    else:
        print("✅ No zero variance features")
    
    # Check for highly correlated features
    correlation_threshold = 0.95
    numeric_features = [f for f in ml_features if final_df[f].dtype in ['int64', 'float64']]
    if len(numeric_features) > 1:
        corr_matrix = final_df[numeric_features].corr()
        high_corr = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                if abs(corr_matrix.iloc[i, j]) > correlation_threshold:
                    high_corr.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j]))
        
        if high_corr:
            print(f"Highly correlated feature pairs (>{correlation_threshold}):")
            for feat1, feat2, corr in high_corr:
                print(f"   • {feat1} <-> {feat2}: {corr:.3f}")
        else:
            print("✅ No highly correlated features")
    
    return {
        'missing_values': missing_counts,
        'target_distributions': {target: final_df[target].value_counts() for target in target_variables if target in final_df.columns},
        'feature_types': {
            'binary': binary_features,
            'categorical': categorical_features, 
            'continuous': continuous_features
        }
    }

# Perform quality checks
quality_report = perform_data_quality_checks(final_df, ml_feature_list, target_variables)

# Handle missing values if any
if quality_report['missing_values'].sum() > 0:
    print("\n🔧 Handling missing values...")
    final_df[ml_feature_list] = final_df[ml_feature_list].fillna(0)
    print("✅ Missing values filled with 0")

print(f"\n📋 Final Dataset Summary:")
print(f"Total samples: {len(final_df)}")
print(f"Total features for ML: {len(ml_feature_list)}")
print(f"Target variables: {len(target_variables)}")
print(f"Memory usage: {final_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

🔍 Performing Data Quality Checks...

📈 Missing Values Analysis:
Features with missing values:
   • What is your gender?_discretized: 1 missing (1.3%)
   • How often do you watch movies?_discretized: 9 missing (11.7%)
   • unfinished_movie_2_reasons_binary: 77 missing (100.0%)
   • unfinished_movie_4_reasons_binary: 77 missing (100.0%)
   • engagement_score: 9 missing (11.7%)

🎯 Target Variable Distributions:
high_engagement_user:
   0: 43 (55.8%)
   1: 34 (44.2%)
high_dropout_user:
   0: 74 (96.1%)
   1: 3 (3.9%)

📊 Feature Distribution Summary:
Total features: 30
Binary features: 21
Categorical features: 7
Continuous features: 2

⚠️  Potential Issues Check:
Features with zero variance: ['likes_adventure', 'drops_due_to_poor_acting_or_characters', 'drops_due_to_distractions_or_interruptions', 'drops_due_to_not_in_the_right_mood']
✅ No highly correlated features

🔧 Handling missing values...
✅ Missing values filled with 0

📋 Final Dataset Summary:
Total samples: 77
Total features for ML

In [14]:
# Save Processed Data and Prepare for Machine Learning

# Create final ML-ready dataset
ml_dataset = final_df[ml_feature_list + target_variables].copy()

# Save the processed datasets
print("💾 Saving processed datasets...")

# 1. Save complete processed dataset
final_df.to_csv('seededresponses_fully_processed.csv', index=False)
print("✅ Saved complete processed dataset: seededresponses_fully_processed.csv")

# 2. Save ML-ready dataset
ml_dataset.to_csv('ml_ready_dataset.csv', index=False)
print("✅ Saved ML-ready dataset: ml_ready_dataset.csv")

# 3. Save feature and target lists for easy reference
feature_info = {
    'ml_features': ml_feature_list,
    'target_variables': target_variables,
    'discretization_mappings': discretization_mappings
}

import json
with open('feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=2)
print("✅ Saved feature information: feature_info.json")

# Display final summary
print(f"\n🎉 DATA PREPROCESSING COMPLETED SUCCESSFULLY!")
print(f"=" * 60)

print(f"\n📊 FINAL DATASET SUMMARY:")
print(f"   • Original samples: {len(df)}")
print(f"   • Processed samples: {len(final_df)}")
print(f"   • Original features: {len(df.columns)}")
print(f"   • Total processed features: {len(final_df.columns)}")
print(f"   • ML-ready features: {len(ml_feature_list)}")
print(f"   • Target variables: {len(target_variables)}")

print(f"\n🎯 TARGET VARIABLES FOR PREDICTION:")
for i, target in enumerate(target_variables, 1):
    print(f"   {i}. {target}")

print(f"\n🔧 FEATURE CATEGORIES:")
feature_types = quality_report['feature_types']
print(f"   • Binary features: {len(feature_types['binary'])}")
print(f"   • Categorical features: {len(feature_types['categorical'])}")
print(f"   • Continuous features: {len(feature_types['continuous'])}")

print(f"\n📁 OUTPUT FILES:")
print(f"   • seededresponses_fully_processed.csv - Complete processed dataset")
print(f"   • ml_ready_dataset.csv - Features + targets only")
print(f"   • feature_info.json - Feature lists and mappings")

print(f"\n🚀 NEXT STEPS - READY FOR MACHINE LEARNING:")
print(f"   1. Decision Tree Based Methods")
print(f"   2. Neural Networks")
print(f"   3. Naive Bayes Classifier") 
print(f"   4. Nearest Neighbor Classification")

print(f"\n💡 SAMPLE CODE FOR NEXT STEPS:")
print(f"   # Load ML-ready data")
print(f"   ml_data = pd.read_csv('ml_ready_dataset.csv')")
print(f"   features = ml_data[{ml_feature_list[:3]}...]  # All features")
print(f"   targets = ml_data[{target_variables}]  # All targets")

# Show sample of final ML dataset
print(f"\n📋 SAMPLE OF FINAL ML DATASET:")
display(ml_dataset.head())

print(f"\n✅ Data preprocessing pipeline complete! Ready for algorithm experimentation.")

# Create Two Versions of Processed Data

def create_human_readable_dataset(final_df, original_df):
    """
    Create a clean, human-readable version with actual responses visible
    """
    print("📋 Creating human-readable dataset...")
    
    # Start with key demographic and preference columns
    readable_df = pd.DataFrame()
    
    # 1. Include key demographic info (original responses)
    demographic_cols = []
    for col in original_df.columns:
        if any(keyword in col.lower() for keyword in ['age', 'gender', 'education']):
            demographic_cols.append(col)
            readable_df[col] = original_df[col]
    
    # 2. Include movie watching preferences (original responses)
    preference_cols = []
    for col in original_df.columns:
        if any(keyword in col.lower() for keyword in ['genre', 'often', 'watch', 'discover']):
            preference_cols.append(col)
            readable_df[col] = original_df[col]
    
    # 3. Include completion behavior (original responses)
    behavior_cols = []
    for col in original_df.columns:
        if any(keyword in col.lower() for keyword in ['stop', 'finish', 'pause', 'skip']):
            behavior_cols.append(col)
            readable_df[col] = original_df[col]
    
    # 4. Include drop reasons (original responses) 
    reason_cols = []
    for col in original_df.columns:
        if 'reason' in col.lower():
            reason_cols.append(col)
            readable_df[col] = original_df[col]
    
    # 5. Add segregated genre preferences (from binary features)
    print("   Creating segregated genre preferences...")
    genre_binary_cols = [col for col in final_df.columns if 'likes_' in col]
    
    # Create a readable genre list for each user
    genre_preferences = []
    for idx in range(len(final_df)):
        user_genres = []
        for col in genre_binary_cols:
            if final_df.iloc[idx][col] == 1:
                genre_name = col.replace('likes_', '').replace('_', ' ').title()
                user_genres.append(genre_name)
        genre_preferences.append(', '.join(user_genres) if user_genres else 'None specified')
    
    readable_df['Segregated_Genre_Preferences'] = genre_preferences
    
    # 6. Add segregated drop reasons (from binary features)
    print("   Creating segregated drop reasons...")
    reason_binary_cols = [col for col in final_df.columns if 'drops_due_to_' in col]
    
    drop_reason_lists = []
    for idx in range(len(final_df)):
        user_reasons = []
        for col in reason_binary_cols:
            if final_df.iloc[idx][col] == 1:
                reason_name = col.replace('drops_due_to_', '').replace('_', ' ').title()
                user_reasons.append(reason_name)
        drop_reason_lists.append(', '.join(user_reasons) if user_reasons else 'None specified')
    
    readable_df['Segregated_Drop_Reasons'] = drop_reason_lists
    
    # 7. Add summary metrics (human-readable)
    readable_df['Total_Liked_Genres'] = final_df['total_liked_genres']
    readable_df['Total_Drop_Reasons'] = final_df['total_drop_reasons']
    readable_df['Engagement_Level'] = pd.cut(final_df['engagement_score'], 
                                           bins=3, 
                                           labels=['Low', 'Medium', 'High'])
    
    print(f"   ✅ Created readable dataset with {len(readable_df.columns)} columns")
    return readable_df

def create_ml_discretized_dataset(final_df, ml_feature_list, target_variables):
    """
    Create discretized dataset for machine learning
    """
    print("🤖 Creating ML discretized dataset...")
    
    # Include only ML-ready features and targets
    ml_dataset = final_df[ml_feature_list + target_variables].copy()
    
    # Add respondent ID for tracking
    ml_dataset.insert(0, 'Respondent_ID', range(1, len(ml_dataset) + 1))
    
    print(f"   ✅ Created ML dataset with {len(ml_dataset.columns)} columns")
    return ml_dataset

# Create both versions
print("🚀 Creating two dataset versions...")
print("=" * 50)

# Version 1: Human-readable with actual responses
readable_dataset = create_human_readable_dataset(final_df, df)

# Version 2: Discretized for ML
ml_dataset = create_ml_discretized_dataset(final_df, ml_feature_list, target_variables)

# Save both versions
print("\n💾 Saving both dataset versions...")

# 1. Save human-readable version
readable_dataset.to_csv('responses_cleaned_human_readable.csv', index=False)
print("✅ Saved human-readable version: responses_cleaned_human_readable.csv")

# 2. Save ML discretized version  
ml_dataset.to_csv('responses_discretized_for_ml.csv', index=False)
print("✅ Saved ML discretized version: responses_discretized_for_ml.csv")

# 3. Save complete processed dataset (for reference)
final_df.to_csv('responses_complete_processed.csv', index=False)
print("✅ Saved complete processed version: responses_complete_processed.csv")

# 4. Save feature and target lists for easy reference
feature_info = {
    'ml_features': ml_feature_list,
    'target_variables': target_variables,
    'discretization_mappings': discretization_mappings,
    'readable_columns': list(readable_dataset.columns),
    'ml_columns': list(ml_dataset.columns)
}

import json
with open('feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=2)
print("✅ Saved feature information: feature_info.json")

# Display summaries of both versions
print(f"\n🎉 DATA PREPROCESSING COMPLETED - TWO VERSIONS CREATED!")
print(f"=" * 60)

print(f"\n📋 VERSION 1 - HUMAN READABLE DATASET:")
print(f"   File: responses_cleaned_human_readable.csv")
print(f"   Purpose: Visual inspection, reporting, presentations")
print(f"   Samples: {len(readable_dataset)}")
print(f"   Columns: {len(readable_dataset.columns)}")
print(f"   Content: Original responses + segregated preferences/reasons")

print(f"\n🤖 VERSION 2 - ML DISCRETIZED DATASET:")
print(f"   File: responses_discretized_for_ml.csv") 
print(f"   Purpose: Machine learning algorithms")
print(f"   Samples: {len(ml_dataset)}")
print(f"   Columns: {len(ml_dataset.columns)}")
print(f"   Content: Numerical features + target variables")

print(f"\n🎯 TARGET VARIABLES FOR ML:")
for i, target in enumerate(target_variables, 1):
    print(f"   {i}. {target}")

print(f"\n📁 ALL OUTPUT FILES:")
print(f"   • responses_cleaned_human_readable.csv - For humans")
print(f"   • responses_discretized_for_ml.csv - For algorithms")
print(f"   • responses_complete_processed.csv - Complete version")
print(f"   • feature_info.json - Metadata and mappings")

# Show samples of both versions
print(f"\n📊 SAMPLE OF HUMAN-READABLE VERSION:")
display(readable_dataset[['Segregated_Genre_Preferences', 'Segregated_Drop_Reasons', 'Engagement_Level']].head(3))

print(f"\n📊 SAMPLE OF ML DISCRETIZED VERSION:")
sample_ml_cols = [col for col in ml_dataset.columns if col != 'Respondent_ID'][:8]
display(ml_dataset[['Respondent_ID'] + sample_ml_cols].head(3))

print(f"\n🚀 READY FOR NEXT STEPS:")
print(f"   • Use human-readable version for analysis and reporting")
print(f"   • Use discretized version for 4 ML algorithms:")
print(f"     1. Decision Tree Based Methods")
print(f"     2. Neural Networks")
print(f"     3. Naive Bayes Classifier")
print(f"     4. Nearest Neighbor Classification")

print(f"\n✅ Both datasets ready for use!")

💾 Saving processed datasets...
✅ Saved complete processed dataset: seededresponses_fully_processed.csv
✅ Saved ML-ready dataset: ml_ready_dataset.csv
✅ Saved feature information: feature_info.json

🎉 DATA PREPROCESSING COMPLETED SUCCESSFULLY!

📊 FINAL DATASET SUMMARY:
   • Original samples: 77
   • Processed samples: 77
   • Original features: 40
   • Total processed features: 72
   • ML-ready features: 30
   • Target variables: 2

🎯 TARGET VARIABLES FOR PREDICTION:
   1. high_engagement_user
   2. high_dropout_user

🔧 FEATURE CATEGORIES:
   • Binary features: 21
   • Categorical features: 7
   • Continuous features: 2

📁 OUTPUT FILES:
   • seededresponses_fully_processed.csv - Complete processed dataset
   • ml_ready_dataset.csv - Features + targets only
   • feature_info.json - Feature lists and mappings

🚀 NEXT STEPS - READY FOR MACHINE LEARNING:
   1. Decision Tree Based Methods
   2. Neural Networks
   3. Naive Bayes Classifier
   4. Nearest Neighbor Classification

💡 SAMPLE CODE 

Unnamed: 0,What is your age group?_discretized,What is your gender?_discretized,What is the highest level of education you’ve completed? _discretized,How often do you watch movies?_discretized,How often do you stop watching movies before finishing them?_discretized,Have you ever started watching a movie but did not finish it?_binary,unfinished_movie_2_reasons_binary,unfinished_movie_4_reasons_binary,likes_action,likes_comedy,...,drops_due_to_too_long_slow_pacing,drops_due_to_technical_issues,drops_due_to_distractions_or_interruptions,drops_due_to_not_in_the_right_mood,total_liked_genres,total_drop_reasons,engagement_score,dropout_tendency,high_engagement_user,high_dropout_user
0,2,1.0,4,2.0,2,1,0.0,0.0,1,1,...,1,1,1,1,3,6,6.0,2.0,0,0
1,1,2.0,1,2.0,3,1,0.0,0.0,0,0,...,1,1,1,1,3,6,6.0,2.0,0,0
2,1,2.0,3,2.0,1,1,0.0,0.0,0,0,...,1,0,1,1,3,5,6.0,1.666667,0,0
3,1,1.0,3,1.0,3,1,0.0,0.0,0,1,...,1,1,1,1,3,6,3.0,2.0,0,0
4,1,2.0,3,1.0,2,1,0.0,0.0,1,1,...,1,1,1,1,3,6,3.0,2.0,0,0



✅ Data preprocessing pipeline complete! Ready for algorithm experimentation.
🚀 Creating two dataset versions...
📋 Creating human-readable dataset...
   Creating segregated genre preferences...
   Creating segregated drop reasons...
   ✅ Created readable dataset with 44 columns
🤖 Creating ML discretized dataset...
   ✅ Created ML dataset with 33 columns

💾 Saving both dataset versions...
✅ Saved human-readable version: responses_cleaned_human_readable.csv
✅ Saved ML discretized version: responses_discretized_for_ml.csv
✅ Saved complete processed version: responses_complete_processed.csv
✅ Saved feature information: feature_info.json

🎉 DATA PREPROCESSING COMPLETED - TWO VERSIONS CREATED!

📋 VERSION 1 - HUMAN READABLE DATASET:
   File: responses_cleaned_human_readable.csv
   Purpose: Visual inspection, reporting, presentations
   Samples: 77
   Columns: 44
   Content: Original responses + segregated preferences/reasons

🤖 VERSION 2 - ML DISCRETIZED DATASET:
   File: responses_discretize

Unnamed: 0,Segregated_Genre_Preferences,Segregated_Drop_Reasons,Engagement_Level
0,"Action, Comedy, Historical","Boring Uninteresting Plot, Poor Acting Or Char...",Low
1,"Horror, Science Fiction Sci-Fi, Documentary","Boring Uninteresting Plot, Poor Acting Or Char...",Low
2,"Drama, Romance, Historical","Boring Uninteresting Plot, Poor Acting Or Char...",Low



📊 SAMPLE OF ML DISCRETIZED VERSION:


Unnamed: 0,Respondent_ID,What is your age group?_discretized,What is your gender?_discretized,What is the highest level of education you’ve completed? _discretized,How often do you watch movies?_discretized,How often do you stop watching movies before finishing them?_discretized,Have you ever started watching a movie but did not finish it?_binary,unfinished_movie_2_reasons_binary,unfinished_movie_4_reasons_binary
0,1,2,1.0,4,2.0,2,1,0.0,0.0
1,2,1,2.0,1,2.0,3,1,0.0,0.0
2,3,1,2.0,3,2.0,1,1,0.0,0.0



🚀 READY FOR NEXT STEPS:
   • Use human-readable version for analysis and reporting
   • Use discretized version for 4 ML algorithms:
     1. Decision Tree Based Methods
     2. Neural Networks
     3. Naive Bayes Classifier
     4. Nearest Neighbor Classification

✅ Both datasets ready for use!


In [15]:
# Analysis: Checkbox Response Handling in Survey Data

def analyze_checkbox_responses(df):
    """
    Analyze how checkbox responses are handled and demonstrate the need for separation
    """
    print("🔍 ANALYZING CHECKBOX RESPONSE HANDLING")
    print("=" * 50)
    
    # Find checkbox-style columns (Select all that apply)
    checkbox_columns = []
    for col in df.columns:
        if 'select all' in col.lower() or 'apply' in col.lower():
            checkbox_columns.append(col)
    
    if checkbox_columns:
        print(f"📋 Found {len(checkbox_columns)} checkbox-style columns:")
        for i, col in enumerate(checkbox_columns, 1):
            print(f"   {i}. {col}")
        
        # Analyze the pause movie column specifically
        pause_col = None
        for col in checkbox_columns:
            if 'pause' in col.lower():
                pause_col = col
                break
        
        if pause_col:
            print(f"\n🎯 DETAILED ANALYSIS: '{pause_col}'")
            print(f"Raw responses (first 5):")
            for i, response in enumerate(df[pause_col].head().values):
                print(f"   {i+1}. {response}")
            
            print(f"\n📊 PROBLEMS WITH COMBINED CHECKBOX DATA:")
            print(f"   1. ANALYSIS DIFFICULTY:")
            print(f"      • Raw: 'Lost focus or distracted, Feeling bored'")
            print(f"      • Hard to count how many people selected 'Lost focus'")
            print(f"      • Can't easily calculate percentages for each reason")
            
            print(f"\n   2. MACHINE LEARNING ISSUES:")
            print(f"      • Text data can't be used directly in algorithms")
            print(f"      • Different combinations create unique categories")
            print(f"      • No way to measure similarity between responses")
            
            print(f"\n   3. STATISTICAL ANALYSIS PROBLEMS:")
            print(f"      • Can't correlate individual reasons with other variables")
            print(f"      • Can't build predictive models for specific behaviors")
            
            print(f"\n✅ SOLUTION: BINARY SEPARATION")
            print(f"   Original column: '{pause_col}'")
            print(f"   Becomes multiple binary columns:")
            
            # Show the binary columns we created
            pause_binary_cols = [col for col in final_df.columns if 'pause' in col.lower() and 'binary' in col.lower()]
            if not pause_binary_cols:
                # Look for general reason binary columns
                reason_binary_cols = [col for col in final_df.columns if 'drops_due_to_' in col or 'reason' in col.lower()]
                pause_binary_cols = reason_binary_cols[:6]  # Show first 6 as examples
            
            for i, col in enumerate(pause_binary_cols[:6], 1):
                print(f"      {i}. {col}")
            
            print(f"\n📈 BENEFITS OF SEPARATION:")
            print(f"   ✓ Each reason becomes a 0/1 feature")
            print(f"   ✓ Easy to calculate percentages: sum(column)/total")
            print(f"   ✓ ML algorithms can use binary features directly")
            print(f"   ✓ Can correlate each reason with demographics")
            print(f"   ✓ Can predict likelihood of each specific reason")
            
            # Demonstrate the separation with examples
            print(f"\n🔢 EXAMPLE TRANSFORMATION:")
            if len(df) > 0:
                sample_response = str(df[pause_col].iloc[0]) if pd.notna(df[pause_col].iloc[0]) else "Lost focus or distracted, Bathroom break"
                print(f"   Original: '{sample_response}'")
                print(f"   Becomes:")
                
                # Simulate the binary breakdown
                reasons = ["Lost focus", "Bathroom break", "Technical issues", "Bored", "Distracted"]
                for reason in reasons:
                    has_reason = 1 if any(keyword in sample_response.lower() for keyword in reason.lower().split()) else 0
                    print(f"      {reason.replace(' ', '_').lower()}: {has_reason}")
        
        return True
    else:
        print("No checkbox-style columns found in the dataset")
        return False

def demonstrate_analysis_capabilities(final_df):
    """
    Demonstrate what analyses become possible with separated checkbox data
    """
    print(f"\n📊 WHAT YOU CAN NOW DO WITH SEPARATED DATA:")
    print("=" * 50)
    
    # Find binary reason columns
    reason_cols = [col for col in final_df.columns if 'drops_due_to_' in col]
    
    if reason_cols:
        print(f"1. PERCENTAGE ANALYSIS:")
        for col in reason_cols[:3]:  # Show first 3
            if col in final_df.columns:
                percentage = (final_df[col].sum() / len(final_df)) * 100
                reason_name = col.replace('drops_due_to_', '').replace('_', ' ').title()
                print(f"   • {percentage:.1f}% of users drop movies due to: {reason_name}")
        
        print(f"\n2. CORRELATION ANALYSIS:")
        # Show correlation with engagement
        if 'engagement_score' in final_df.columns:
            for col in reason_cols[:3]:
                if col in final_df.columns:
                    correlation = final_df[col].corr(final_df['engagement_score'])
                    reason_name = col.replace('drops_due_to_', '').replace('_', ' ').title()
                    print(f"   • {reason_name} correlation with engagement: {correlation:.3f}")
        
        print(f"\n3. DEMOGRAPHIC BREAKDOWN:")
        age_col = None
        for col in final_df.columns:
            if 'age' in col.lower() and 'discretized' in col:
                age_col = col
                break
        
        if age_col and reason_cols:
            print(f"   • Can analyze which age groups drop movies for specific reasons")
            print(f"   • Can predict dropout reasons based on user demographics")
        
        print(f"\n4. MACHINE LEARNING TARGETS:")
        print(f"   • Each reason becomes a separate prediction target")
        print(f"   • Can build models to predict: 'Will this user drop due to boredom?'")
        print(f"   • Can recommend shorter movies to users who drop due to length")

# Run the analysis
print("🎯 CHECKBOX RESPONSE SEPARATION ANALYSIS")
print("This explains why we separate checkbox responses into binary columns\n")

has_checkbox = analyze_checkbox_responses(df)
if has_checkbox:
    demonstrate_analysis_capabilities(final_df)

print(f"\n💡 CONCLUSION:")
print(f"Separating checkbox responses is ESSENTIAL because:")
print(f"1. Enables quantitative analysis of individual choices")
print(f"2. Makes data compatible with machine learning algorithms") 
print(f"3. Allows correlation analysis between specific reasons and user traits")
print(f"4. Enables building targeted prediction models for each behavior")
print(f"5. Supports statistical testing and hypothesis validation")

print(f"\nWithout separation, checkbox data remains as text strings that are")
print(f"difficult to analyze statistically or use in predictive models.")

🎯 CHECKBOX RESPONSE SEPARATION ANALYSIS
This explains why we separate checkbox responses into binary columns

🔍 ANALYZING CHECKBOX RESPONSE HANDLING
📋 Found 8 checkbox-style columns:
   1. Which genres do you find yourself stopping more often before finishing? (Select all that apply)
   2. How do you usually discover movies you decide to watch? (Select all that apply)
   3. Why do you usually pause the movie? (Select all that apply)
   4. Do you usually do other things while watching movies? (Select all that apply)
   5. In general, what are the main reasons you stop watching movies before finishing? (Select all that apply)
   6. Where do you usually watch movies? (Select all that apply)
   7. Why do you usually choose to watch movies? (Select all that apply)
   8. What was the main reason you stopped watching these movies? Select all that apply.

🎯 DETAILED ANALYSIS: 'Why do you usually pause the movie? (Select all that apply)'
Raw responses (first 5):
   1. Lost focus or distracted, 

  c /= stddev[:, None]
  c /= stddev[None, :]
