In [13]:
import pandas as pd
import numpy as np
import json

# Load survey response data
df = pd.read_csv('responses_partially_cleaned.csv')

print(f"Dataset shape: {df.shape}")
print(f"Number of respondents: {len(df)}")
print(f"Total columns: {len(df.columns)}")

# Display basic dataset information
data_info = pd.DataFrame({
    'Column': df.columns,
    'Data_Type': df.dtypes,
    'Non_Null_Count': df.count(),
    'Null_Count': df.isnull().sum(),
    'Unique_Values': [df[col].nunique() for col in df.columns]
})

print("\nDataset Overview:")
print(data_info.head(10).to_string(index=False))

Dataset shape: (77, 40)
Number of respondents: 77
Total columns: 40

Dataset Overview:
                                                                                         Column Data_Type  Non_Null_Count  Null_Count  Unique_Values
                                                                        What is your age group?    object              77           0              3
                                                                           What is your gender?    object              77           0              4
                                      What is the highest level of education you’ve completed?     object              77           0              5
                                                                 How often do you watch movies?    object              77           0              5
                                 Which genres do you enjoy watching the most?  (Select up to 3)    object              77           0             73
                   

In [14]:
# Data Preprocessing and Discretization Functions

def discretize_categorical_responses(df):
    """
    Discretize categorical survey responses into numerical codes
    """
    processed_df = df.copy()
    discretized_columns = {}
    
    # Age Group Discretization
    age_col = None
    for col in df.columns:
        if 'age' in col.lower():
            age_col = col
            break
    
    if age_col and age_col in df.columns:
        age_mapping = {
            '18-24': 1, '25-34': 2, '35-44': 3,
            '45-54': 4, '55-64': 5, '65+': 6
        }
        processed_df[f'{age_col}_discretized'] = df[age_col].map(age_mapping)
        discretized_columns['age'] = age_mapping
    
    # Gender Discretization
    gender_col = None
    for col in df.columns:
        if 'gender' in col.lower():
            gender_col = col
            break
    
    if gender_col and gender_col in df.columns:
        gender_mapping = {
            'Female': 1, 'Male': 2, 'Non-binary': 3,
            'Prefer not to say': 4, 'Other': 5
        }
        processed_df[f'{gender_col}_discretized'] = df[gender_col].map(gender_mapping)
        discretized_columns['gender'] = gender_mapping
    
    # Education Level Discretization
    education_col = None
    for col in df.columns:
        if 'education' in col.lower():
            education_col = col
            break
    
    if education_col and education_col in df.columns:
        education_mapping = {
            'High school or below': 1, 'Some college': 2,
            'Undergraduate': 3, "Bachelor's Degree": 4,
            'Graduate Degree': 5, 'PhD or higher': 6
        }
        processed_df[f'{education_col}_discretized'] = df[education_col].map(education_mapping)
        discretized_columns['education'] = education_mapping
    
    # Movie Watching Frequency Discretization
    frequency_col = None
    for col in df.columns:
        if 'often' in col.lower() and 'watch' in col.lower():
            frequency_col = col
            break
    
    if frequency_col and frequency_col in df.columns:
        frequency_mapping = {
            'Rarely (about once per month)': 1,
            'A few times a month (2-3 times per month)': 2,
            'Once a week': 3,
            'Several times a week (3–6 times a week)': 4,
            'Daily or almost daily': 5
        }
        processed_df[f'{frequency_col}_discretized'] = df[frequency_col].map(frequency_mapping)
        discretized_columns['frequency'] = frequency_mapping
    
    return processed_df, discretized_columns

def process_genre_preferences(df):
    """
    Process genre preferences into binary features
    """
    processed_df = df.copy()
    
    # Find the genre preference column
    genre_col = None
    for col in df.columns:
        if 'genre' in col.lower() and 'enjoy' in col.lower():
            genre_col = col
            break
    
    if genre_col and genre_col in df.columns:
        # Define standard genres
        genres = [
            'Action', 'Comedy', 'Drama', 'Horror', 'Romance', 
            'Science Fiction/Sci-Fi', 'Documentary', 'Thriller', 
            'Family', 'Adventure', 'Fantasy', 'Historical'
        ]
        
        # Create binary columns for each genre
        for genre in genres:
            genre_clean = genre.replace('/', '_').replace(' ', '_').lower()
            processed_df[f'likes_{genre_clean}'] = 0
        
        # Fill binary features based on responses
        for idx, response in df[genre_col].items():
            if pd.notna(response):
                response_str = str(response)
                for genre in genres:
                    if genre.lower() in response_str.lower():
                        genre_clean = genre.replace('/', '_').replace(' ', '_').lower()
                        processed_df.at[idx, f'likes_{genre_clean}'] = 1
    
    return processed_df

def process_movie_completion_behavior(df):
    """
    Process movie completion and dropout behavior
    """
    processed_df = df.copy()
    
    # Find completion behavior columns
    completion_cols = [col for col in df.columns if 'stop' in col.lower() or 'finish' in col.lower()]
    
    for col in completion_cols:
        if 'yes' in str(df[col].iloc[0]).lower() or 'no' in str(df[col].iloc[0]).lower():
            binary_mapping = {'Yes': 1, 'No': 0}
            processed_df[f'{col}_binary'] = df[col].map(binary_mapping)
    
    # Process frequency of stopping
    stop_frequency_col = None
    for col in df.columns:
        if 'often' in col.lower() and 'stop' in col.lower():
            stop_frequency_col = col
            break
    
    if stop_frequency_col and stop_frequency_col in df.columns:
        frequency_mapping = {
            'Never': 0, 'Rarely': 1, 'Sometimes': 2,
            'Often': 3, 'Always': 4
        }
        processed_df[f'{stop_frequency_col}_discretized'] = df[stop_frequency_col].map(frequency_mapping)
    
    return processed_df

def process_drop_reasons(df):
    """
    Process reasons for dropping movies into binary features
    """
    processed_df = df.copy()
    
    # Find drop reason columns
    reason_cols = [col for col in df.columns if 'reason' in col.lower()]
    
    # Standard drop reasons
    drop_reasons = [
        'Boring/uninteresting plot', 'Poor acting or characters',
        'Too long/slow pacing', 'Technical issues',
        'Distractions or interruptions', 'Not in the right mood'
    ]
    
    # Create binary columns for each reason
    for reason in drop_reasons:
        reason_clean = reason.replace('/', '_').replace(' ', '_').lower()
        processed_df[f'drops_due_to_{reason_clean}'] = 0
    
    # Process each reason column
    for col in reason_cols:
        if col in df.columns:
            for idx, response in df[col].items():
                if pd.notna(response):
                    response_str = str(response)
                    for reason in drop_reasons:
                        if any(keyword in response_str.lower() for keyword in reason.lower().split()):
                            reason_clean = reason.replace('/', '_').replace(' ', '_').lower()
                            processed_df.at[idx, f'drops_due_to_{reason_clean}'] = 1
    
    return processed_df

# Apply all preprocessing functions
processed_df, discretization_mappings = discretize_categorical_responses(df)
processed_df = process_genre_preferences(processed_df)
processed_df = process_movie_completion_behavior(processed_df)
processed_df = process_drop_reasons(processed_df)

print(f"Preprocessing completed")
print(f"Original shape: {df.shape}")
print(f"Processed shape: {processed_df.shape}")
print(f"New features added: {processed_df.shape[1] - df.shape[1]}")

Preprocessing completed
Original shape: (77, 40)
Processed shape: (77, 66)
New features added: 26


In [15]:
# Feature Engineering and Dataset Preparation

def create_aggregate_features(processed_df):
    """
    Create aggregate features from the processed data
    """
    df_with_features = processed_df.copy()
    
    # Genre preference count
    genre_cols = [col for col in processed_df.columns if 'likes_' in col]
    df_with_features['total_liked_genres'] = processed_df[genre_cols].sum(axis=1)
    
    # Drop reason count
    reason_cols = [col for col in processed_df.columns if 'drops_due_to_' in col]
    df_with_features['total_drop_reasons'] = processed_df[reason_cols].sum(axis=1)
    
    # Create user engagement score
    discretized_cols = [col for col in processed_df.columns if '_discretized' in col]
    frequency_col = None
    for col in discretized_cols:
        if 'often' in col.lower() and 'watch' in col.lower():
            frequency_col = col
            break
    
    if frequency_col:
        df_with_features['engagement_score'] = (
            df_with_features[frequency_col] * df_with_features['total_liked_genres']
        )
    else:
        df_with_features['engagement_score'] = df_with_features['total_liked_genres']
    
    # Create dropout tendency score
    if 'total_drop_reasons' in df_with_features.columns:
        df_with_features['dropout_tendency'] = df_with_features['total_drop_reasons'] / df_with_features['total_liked_genres'].replace(0, 1)
    
    return df_with_features

def prepare_ml_features(processed_df):
    """
    Prepare final feature set for machine learning
    """
    discretized_features = [col for col in processed_df.columns if '_discretized' in col]
    binary_features = [col for col in processed_df.columns if '_binary' in col]
    genre_features = [col for col in processed_df.columns if 'likes_' in col]
    reason_features = [col for col in processed_df.columns if 'drops_due_to_' in col]
    aggregate_features = ['total_liked_genres', 'total_drop_reasons', 'engagement_score', 'dropout_tendency']
    
    # Combine all ML-ready features
    ml_features = discretized_features + binary_features + genre_features + reason_features + aggregate_features
    ml_features = [col for col in ml_features if col in processed_df.columns]
    
    print(f"ML Feature Summary:")
    print(f"  Discretized features: {len(discretized_features)}")
    print(f"  Binary features: {len(binary_features)}")
    print(f"  Genre features: {len(genre_features)}")
    print(f"  Drop reason features: {len(reason_features)}")
    print(f"  Aggregate features: {len([f for f in aggregate_features if f in processed_df.columns])}")
    print(f"  Total ML features: {len(ml_features)}")
    
    return ml_features

def create_target_variables(processed_df):
    """
    Create target variables for different prediction tasks
    """
    df_with_targets = processed_df.copy()
    
    # Target 1: High engagement user
    engagement_threshold = df_with_targets['engagement_score'].median()
    df_with_targets['high_engagement_user'] = (df_with_targets['engagement_score'] > engagement_threshold).astype(int)
    
    # Target 2: High dropout tendency
    if 'dropout_tendency' in df_with_targets.columns:
        dropout_threshold = df_with_targets['dropout_tendency'].median()
        df_with_targets['high_dropout_user'] = (df_with_targets['dropout_tendency'] > dropout_threshold).astype(int)
    
    # Target 3: Movie completion likelihood
    stop_cols = [col for col in processed_df.columns if 'stop' in col.lower() and '_binary' in col]
    if stop_cols:
        df_with_targets['likely_to_complete'] = (1 - df_with_targets[stop_cols[0]]).astype(int)
    
    target_variables = ['high_engagement_user', 'high_dropout_user', 'likely_to_complete']
    target_variables = [col for col in target_variables if col in df_with_targets.columns]
    
    return df_with_targets, target_variables

# Apply feature engineering
processed_df = create_aggregate_features(processed_df)
final_df, target_variables = create_target_variables(processed_df)
ml_feature_list = prepare_ml_features(final_df)

# Show sample of final processed data
print("\nSample of processed features:")
sample_features = ml_feature_list[:10] if len(ml_feature_list) >= 10 else ml_feature_list
display(final_df[sample_features].head())

ML Feature Summary:
  Discretized features: 5
  Binary features: 3
  Genre features: 12
  Drop reason features: 6
  Aggregate features: 4
  Total ML features: 30

Sample of processed features:


Unnamed: 0,What is your age group?_discretized,What is your gender?_discretized,What is the highest level of education you’ve completed? _discretized,How often do you watch movies?_discretized,How often do you stop watching movies before finishing them?_discretized,Have you ever started watching a movie but did not finish it?_binary,unfinished_movie_2_reasons_binary,unfinished_movie_4_reasons_binary,likes_action,likes_comedy
0,2,1.0,4,2.0,2,1,,,1,1
1,1,2.0,1,2.0,3,1,,,0,0
2,1,2.0,3,2.0,1,1,,,0,0
3,1,1.0,3,1.0,3,1,,,0,1
4,1,2.0,3,1.0,2,1,,,1,1


In [16]:
# Data Quality Checks and Validation

def perform_data_quality_checks(final_df, ml_features, target_variables):
    """
    Perform comprehensive data quality checks
    """
    print("Performing Data Quality Checks...")
    
    # Check for missing values
    print(f"\nMissing Values Analysis:")
    missing_counts = final_df[ml_features].isnull().sum()
    if missing_counts.sum() > 0:
        print("Features with missing values:")
        for feature, count in missing_counts[missing_counts > 0].items():
            print(f"  {feature}: {count} missing ({count/len(final_df)*100:.1f}%)")
    else:
        print("No missing values in ML features")
    
    # Check target variable distribution
    print(f"\nTarget Variable Distributions:")
    for target in target_variables:
        if target in final_df.columns:
            value_counts = final_df[target].value_counts()
            print(f"{target}:")
            for value, count in value_counts.items():
                print(f"  {value}: {count} ({count/len(final_df)*100:.1f}%)")
    
    # Check feature distributions
    print(f"\nFeature Distribution Summary:")
    print(f"Total features: {len(ml_features)}")
    
    binary_features = [f for f in ml_features if final_df[f].nunique() <= 2]
    categorical_features = [f for f in ml_features if 3 <= final_df[f].nunique() <= 10]
    continuous_features = [f for f in ml_features if final_df[f].nunique() > 10]
    
    print(f"Binary features: {len(binary_features)}")
    print(f"Categorical features: {len(categorical_features)}")
    print(f"Continuous features: {len(continuous_features)}")
    
    # Check for potential issues
    print(f"\nPotential Issues Check:")
    
    # Check for features with zero variance
    zero_variance = [f for f in ml_features if final_df[f].var() == 0]
    if zero_variance:
        print(f"Features with zero variance: {zero_variance}")
    else:
        print("No zero variance features")
    
    # Check for highly correlated features
    correlation_threshold = 0.95
    numeric_features = [f for f in ml_features if final_df[f].dtype in ['int64', 'float64']]
    if len(numeric_features) > 1:
        corr_matrix = final_df[numeric_features].corr()
        high_corr = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i+1, len(corr_matrix.columns)):
                if abs(corr_matrix.iloc[i, j]) > correlation_threshold:
                    high_corr.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j]))
        
        if high_corr:
            print(f"Highly correlated feature pairs (>{correlation_threshold}):")
            for feat1, feat2, corr in high_corr:
                print(f"  {feat1} <-> {feat2}: {corr:.3f}")
        else:
            print("No highly correlated features")
    
    return {
        'missing_values': missing_counts,
        'target_distributions': {target: final_df[target].value_counts() for target in target_variables if target in final_df.columns},
        'feature_types': {
            'binary': binary_features,
            'categorical': categorical_features, 
            'continuous': continuous_features
        }
    }

# Perform quality checks
quality_report = perform_data_quality_checks(final_df, ml_feature_list, target_variables)

# Handle missing values if any
if quality_report['missing_values'].sum() > 0:
    print("\nHandling missing values...")
    final_df[ml_feature_list] = final_df[ml_feature_list].fillna(0)
    print("Missing values filled with 0")

print(f"\nFinal Dataset Summary:")
print(f"Total samples: {len(final_df)}")
print(f"Total features for ML: {len(ml_feature_list)}")
print(f"Target variables: {len(target_variables)}")
print(f"Memory usage: {final_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

Performing Data Quality Checks...

Missing Values Analysis:
Features with missing values:
  What is your gender?_discretized: 1 missing (1.3%)
  How often do you watch movies?_discretized: 9 missing (11.7%)
  unfinished_movie_2_reasons_binary: 77 missing (100.0%)
  unfinished_movie_4_reasons_binary: 77 missing (100.0%)
  engagement_score: 9 missing (11.7%)

Target Variable Distributions:
high_engagement_user:
  0: 43 (55.8%)
  1: 34 (44.2%)
high_dropout_user:
  0: 74 (96.1%)
  1: 3 (3.9%)

Feature Distribution Summary:
Total features: 30
Binary features: 21
Categorical features: 7
Continuous features: 2

Potential Issues Check:
Features with zero variance: ['likes_adventure', 'drops_due_to_poor_acting_or_characters', 'drops_due_to_distractions_or_interruptions', 'drops_due_to_not_in_the_right_mood']
No highly correlated features

Handling missing values...
Missing values filled with 0

Final Dataset Summary:
Total samples: 77
Total features for ML: 30
Target variables: 2
Memory usage: 

In [19]:
# Save Processed Data and Prepare for Machine Learning

# Create final ML-ready dataset
ml_dataset = final_df[ml_feature_list + target_variables].copy()

# save complete processed dataset
final_df.to_csv('seededresponses_fully_processed.csv', index=False)

# save ML-ready dataset
ml_dataset.to_csv('ml_ready_dataset.csv', index=False)

# save feature and target lists for easy reference
feature_info = {
    'ml_features': ml_feature_list,
    'target_variables': target_variables,
    'discretization_mappings': discretization_mappings
}

import json
with open('feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=2)

print(f"\nFINAL DATASET SUMMARY:")
print(f"   • Original samples: {len(df)}")
print(f"   • Processed samples: {len(final_df)}")
print(f"   • Original features: {len(df.columns)}")
print(f"   • Total processed features: {len(final_df.columns)}")
print(f"   • ML-ready features: {len(ml_feature_list)}")
print(f"   • Target variables: {len(target_variables)}")

print(f"\nTARGET VARIABLES FOR PREDICTION:")
for i, target in enumerate(target_variables, 1):
    print(f"   {i}. {target}")

print(f"\nFEATURE CATEGORIES:")
feature_types = quality_report['feature_types']
print(f"   • Binary features: {len(feature_types['binary'])}")
print(f"   • Categorical features: {len(feature_types['categorical'])}")
print(f"   • Continuous features: {len(feature_types['continuous'])}")

print(f"\nOUTPUT FILES:")
print(f"   • seededresponses_fully_processed.csv - Complete processed dataset")
print(f"   • ml_ready_dataset.csv - Features + targets only")
print(f"   • feature_info.json - Feature lists and mappings")

print(f"\nNEXT STEPS - READY FOR MACHINE LEARNING:")
print(f"   1. Decision Tree Based Methods")
print(f"   2. Neural Networks")
print(f"   3. Naive Bayes Classifier") 
print(f"   4. Nearest Neighbor Classification")

print(f"\nSAMPLE CODE FOR NEXT STEPS:")
print(f"   # Load ML-ready data")
print(f"   ml_data = pd.read_csv('ml_ready_dataset.csv')")
print(f"   features = ml_data[{ml_feature_list[:3]}...]  # All features")
print(f"   targets = ml_data[{target_variables}]  # All targets")

# Show sample of final ML dataset
print(f"\nSAMPLE OF FINAL ML DATASET:")
display(ml_dataset.head())

print(f"\nData preprocessing pipeline complete! Ready for algorithm experimentation.")

# Dataset Creation and Export

def create_human_readable_dataset(final_df, original_df):
    """
    Create a clean, human-readable version with actual responses visible
    """
    readable_df = pd.DataFrame()
    
    # Include key demographic info (original responses)
    demographic_cols = []
    for col in original_df.columns:
        if any(keyword in col.lower() for keyword in ['age', 'gender', 'education']):
            demographic_cols.append(col)
            readable_df[col] = original_df[col]
    
    # Include movie watching preferences (original responses)
    preference_cols = []
    for col in original_df.columns:
        if any(keyword in col.lower() for keyword in ['genre', 'often', 'watch', 'discover']):
            preference_cols.append(col)
            readable_df[col] = original_df[col]
    
    # Include completion behavior (original responses)
    behavior_cols = []
    for col in original_df.columns:
        if any(keyword in col.lower() for keyword in ['stop', 'finish', 'pause', 'skip']):
            behavior_cols.append(col)
            readable_df[col] = original_df[col]
    
    # Include drop reasons (original responses) 
    reason_cols = []
    for col in original_df.columns:
        if 'reason' in col.lower():
            reason_cols.append(col)
            readable_df[col] = original_df[col]
    
    # Add segregated genre preferences
    genre_binary_cols = [col for col in final_df.columns if 'likes_' in col]
    genre_preferences = []
    for idx in range(len(final_df)):
        user_genres = []
        for col in genre_binary_cols:
            if final_df.iloc[idx][col] == 1:
                genre_name = col.replace('likes_', '').replace('_', ' ').title()
                user_genres.append(genre_name)
        genre_preferences.append(', '.join(user_genres) if user_genres else 'None specified')
    
    readable_df['Segregated_Genre_Preferences'] = genre_preferences
    
    # Add segregated drop reasons
    reason_binary_cols = [col for col in final_df.columns if 'drops_due_to_' in col]
    drop_reason_lists = []
    for idx in range(len(final_df)):
        user_reasons = []
        for col in reason_binary_cols:
            if final_df.iloc[idx][col] == 1:
                reason_name = col.replace('drops_due_to_', '').replace('_', ' ').title()
                user_reasons.append(reason_name)
        drop_reason_lists.append(', '.join(user_reasons) if user_reasons else 'None specified')
    
    readable_df['Segregated_Drop_Reasons'] = drop_reason_lists
    
    # Add summary metrics
    readable_df['Total_Liked_Genres'] = final_df['total_liked_genres']
    readable_df['Total_Drop_Reasons'] = final_df['total_drop_reasons']
    readable_df['Engagement_Level'] = pd.cut(final_df['engagement_score'], 
                                           bins=3, 
                                           labels=['Low', 'Medium', 'High'])
    
    return readable_df

def create_ml_discretized_dataset(final_df, ml_feature_list, target_variables):
    """
    Create discretized dataset for machine learning
    """
    ml_dataset = final_df[ml_feature_list + target_variables].copy()
    ml_dataset.insert(0, 'Respondent_ID', range(1, len(ml_dataset) + 1))
    return ml_dataset

# Create both dataset versions
print("Creating dataset versions...")

# Version 1: Human-readable with actual responses
readable_dataset = create_human_readable_dataset(final_df, df)

# Version 2: Discretized for ML
ml_dataset = create_ml_discretized_dataset(final_df, ml_feature_list, target_variables)

# Save datasets
print("\nSaving datasets...")

readable_dataset.to_csv('responses_cleaned_human_readable.csv', index=False)
print("Saved human-readable version: responses_cleaned_human_readable.csv")

ml_dataset.to_csv('responses_discretized_for_ml.csv', index=False)
print("Saved ML discretized version: responses_discretized_for_ml.csv")

final_df.to_csv('responses_complete_processed.csv', index=False)
print("Saved complete processed version: responses_complete_processed.csv")

# Save feature metadata
feature_info = {
    'ml_features': ml_feature_list,
    'target_variables': target_variables,
    'discretization_mappings': discretization_mappings,
    'readable_columns': list(readable_dataset.columns),
    'ml_columns': list(ml_dataset.columns)
}

with open('feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=2)
print("Saved feature information: feature_info.json")

# Display summaries
print(f"\nData preprocessing completed - two versions created")
print(f"\nHuman Readable Dataset:")
print(f"  File: responses_cleaned_human_readable.csv")
print(f"  Samples: {len(readable_dataset)}")
print(f"  Columns: {len(readable_dataset.columns)}")

print(f"\nML Discretized Dataset:")
print(f"  File: responses_discretized_for_ml.csv") 
print(f"  Samples: {len(ml_dataset)}")
print(f"  Columns: {len(ml_dataset.columns)}")

print(f"\nTarget Variables:")
for i, target in enumerate(target_variables, 1):
    print(f"  {i}. {target}")

# Show samples
print(f"\nSample of Human-Readable Version:")
display(readable_dataset[['Segregated_Genre_Preferences', 'Segregated_Drop_Reasons', 'Engagement_Level']].head(3))

print(f"\nSample of ML Discretized Version:")
sample_ml_cols = [col for col in ml_dataset.columns if col != 'Respondent_ID'][:8]
display(ml_dataset[['Respondent_ID'] + sample_ml_cols].head(3))


FINAL DATASET SUMMARY:
   • Original samples: 77
   • Processed samples: 77
   • Original features: 40
   • Total processed features: 72
   • ML-ready features: 30
   • Target variables: 2

TARGET VARIABLES FOR PREDICTION:
   1. high_engagement_user
   2. high_dropout_user

FEATURE CATEGORIES:
   • Binary features: 21
   • Categorical features: 7
   • Continuous features: 2

OUTPUT FILES:
   • seededresponses_fully_processed.csv - Complete processed dataset
   • ml_ready_dataset.csv - Features + targets only
   • feature_info.json - Feature lists and mappings

NEXT STEPS - READY FOR MACHINE LEARNING:
   1. Decision Tree Based Methods
   2. Neural Networks
   3. Naive Bayes Classifier
   4. Nearest Neighbor Classification

SAMPLE CODE FOR NEXT STEPS:
   # Load ML-ready data
   ml_data = pd.read_csv('ml_ready_dataset.csv')
   features = ml_data[['What is your age group?_discretized', 'What is your gender?_discretized', 'What is the highest level of education you’ve completed? _discretiz

Unnamed: 0,What is your age group?_discretized,What is your gender?_discretized,What is the highest level of education you’ve completed? _discretized,How often do you watch movies?_discretized,How often do you stop watching movies before finishing them?_discretized,Have you ever started watching a movie but did not finish it?_binary,unfinished_movie_2_reasons_binary,unfinished_movie_4_reasons_binary,likes_action,likes_comedy,...,drops_due_to_too_long_slow_pacing,drops_due_to_technical_issues,drops_due_to_distractions_or_interruptions,drops_due_to_not_in_the_right_mood,total_liked_genres,total_drop_reasons,engagement_score,dropout_tendency,high_engagement_user,high_dropout_user
0,2,1.0,4,2.0,2,1,0.0,0.0,1,1,...,1,1,1,1,3,6,6.0,2.0,0,0
1,1,2.0,1,2.0,3,1,0.0,0.0,0,0,...,1,1,1,1,3,6,6.0,2.0,0,0
2,1,2.0,3,2.0,1,1,0.0,0.0,0,0,...,1,0,1,1,3,5,6.0,1.666667,0,0
3,1,1.0,3,1.0,3,1,0.0,0.0,0,1,...,1,1,1,1,3,6,3.0,2.0,0,0
4,1,2.0,3,1.0,2,1,0.0,0.0,1,1,...,1,1,1,1,3,6,3.0,2.0,0,0



Data preprocessing pipeline complete! Ready for algorithm experimentation.
Creating dataset versions...

Saving datasets...
Saved human-readable version: responses_cleaned_human_readable.csv
Saved ML discretized version: responses_discretized_for_ml.csv
Saved complete processed version: responses_complete_processed.csv
Saved feature information: feature_info.json

Data preprocessing completed - two versions created

Human Readable Dataset:
  File: responses_cleaned_human_readable.csv
  Samples: 77
  Columns: 44

ML Discretized Dataset:
  File: responses_discretized_for_ml.csv
  Samples: 77
  Columns: 33

Target Variables:
  1. high_engagement_user
  2. high_dropout_user

Sample of Human-Readable Version:


Unnamed: 0,Segregated_Genre_Preferences,Segregated_Drop_Reasons,Engagement_Level
0,"Action, Comedy, Historical","Boring Uninteresting Plot, Poor Acting Or Char...",Low
1,"Horror, Science Fiction Sci-Fi, Documentary","Boring Uninteresting Plot, Poor Acting Or Char...",Low
2,"Drama, Romance, Historical","Boring Uninteresting Plot, Poor Acting Or Char...",Low



Sample of ML Discretized Version:


Unnamed: 0,Respondent_ID,What is your age group?_discretized,What is your gender?_discretized,What is the highest level of education you’ve completed? _discretized,How often do you watch movies?_discretized,How often do you stop watching movies before finishing them?_discretized,Have you ever started watching a movie but did not finish it?_binary,unfinished_movie_2_reasons_binary,unfinished_movie_4_reasons_binary
0,1,2,1.0,4,2.0,2,1,0.0,0.0
1,2,1,2.0,1,2.0,3,1,0.0,0.0
2,3,1,2.0,3,2.0,1,1,0.0,0.0


In [18]:
# Checkbox Response Handling Analysis

def analyze_checkbox_responses(df):
    """
    Analyze how checkbox responses are handled and demonstrate the need for separation
    """
    print("Checkbox Response Handling Analysis")
    print("=" * 50)
    
    # Find checkbox-style columns
    checkbox_columns = []
    for col in df.columns:
        if 'select all' in col.lower() or 'apply' in col.lower():
            checkbox_columns.append(col)
    
    if checkbox_columns:
        print(f"Found {len(checkbox_columns)} checkbox-style columns:")
        for i, col in enumerate(checkbox_columns, 1):
            print(f"  {i}. {col}")
        
        # Analyze a checkbox column
        example_col = checkbox_columns[0] if checkbox_columns else None
        
        if example_col:
            print(f"\nExample responses from '{example_col}':")
            for i, response in enumerate(df[example_col].head(3).values):
                print(f"  {i+1}. {response}")
            
            print(f"\nProblems with combined checkbox data:")
            print(f"  1. Analysis difficulty - hard to count individual selections")
            print(f"  2. Machine learning incompatibility - text cannot be used directly")
            print(f"  3. Statistical analysis limitations - cannot correlate individual choices")
            
            print(f"\nSolution: Binary separation")
            print(f"  Original column becomes multiple binary (0/1) columns")
            print(f"  Each option becomes a separate feature")
            print(f"  Enables quantitative analysis and machine learning")
            
            print(f"\nBenefits of separation:")
            print(f"  - Easy percentage calculations")
            print(f"  - ML algorithm compatibility")
            print(f"  - Individual choice correlation analysis")
            print(f"  - Specific behavior prediction models")
            
            return True
    else:
        print("No checkbox-style columns found in the dataset")
        return False

def demonstrate_separation_benefits(final_df):
    """
    Demonstrate the analytical capabilities enabled by checkbox separation
    """
    print(f"\nAnalytical capabilities with separated data:")
    print("=" * 50)
    
    # Find binary reason columns
    reason_cols = [col for col in final_df.columns if 'drops_due_to_' in col]
    
    if reason_cols:
        print(f"1. Percentage analysis:")
        for col in reason_cols[:3]:
            if col in final_df.columns:
                percentage = (final_df[col].sum() / len(final_df)) * 100
                reason_name = col.replace('drops_due_to_', '').replace('_', ' ').title()
                print(f"  {percentage:.1f}% of users drop movies due to: {reason_name}")
        
        print(f"\n2. Correlation analysis:")
        if 'engagement_score' in final_df.columns:
            for col in reason_cols[:3]:
                if col in final_df.columns:
                    correlation = final_df[col].corr(final_df['engagement_score'])
                    reason_name = col.replace('drops_due_to_', '').replace('_', ' ').title()
                    print(f"  {reason_name} correlation with engagement: {correlation:.3f}")
        
        print(f"\n3. Machine learning targets:")
        print(f"  Each reason becomes a separate prediction target")
        print(f"  Enables building specific behavioral models")

# Run the analysis
print("Checkbox Response Separation Analysis")
print("Explains why checkbox responses are separated into binary columns\n")

has_checkbox = analyze_checkbox_responses(df)
if has_checkbox:
    demonstrate_separation_benefits(final_df)

print(f"\nConclusion:")
print(f"Separating checkbox responses is essential for:")
print(f"1. Quantitative analysis of individual choices")
print(f"2. Machine learning algorithm compatibility") 
print(f"3. Correlation analysis between specific choices and user traits")
print(f"4. Building targeted prediction models for specific behaviors")
print(f"5. Statistical testing and hypothesis validation")

print(f"\nWithout separation, checkbox data remains as text strings")
print(f"that are difficult to analyze statistically or use predictively.")

Checkbox Response Separation Analysis
Explains why checkbox responses are separated into binary columns

Checkbox Response Handling Analysis
Found 8 checkbox-style columns:
  1. Which genres do you find yourself stopping more often before finishing? (Select all that apply)
  2. How do you usually discover movies you decide to watch? (Select all that apply)
  3. Why do you usually pause the movie? (Select all that apply)
  4. Do you usually do other things while watching movies? (Select all that apply)
  5. In general, what are the main reasons you stop watching movies before finishing? (Select all that apply)
  6. Where do you usually watch movies? (Select all that apply)
  7. Why do you usually choose to watch movies? (Select all that apply)
  8. What was the main reason you stopped watching these movies? Select all that apply.

Example responses from 'Which genres do you find yourself stopping more often before finishing? (Select all that apply)':
  1. Romance, Horror, Drama
  2. Rom

  c /= stddev[:, None]
  c /= stddev[None, :]
