## Step 1: Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Read the raw IMDB export into the notebook-wide DataFrame `df`, then
# echo its dimensions and column names as a quick schema sanity check.
df = pd.read_csv('imdb_data.csv')
print(
    "Dataset loaded successfully!",
    f"Shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)

## Step 2: Initial Exploration

In [None]:
# First look at the raw frame: schema, a sample of rows, missing-value
# counts, duplicate counts and summary statistics.
print("=== DATASET INFO ===")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line after the report.
df.info()
print("\n=== FIRST 5 ROWS ===")
print(df.head())
print("\n=== MISSING VALUES ===")
print(df.isnull().sum())
print("\n=== DUPLICATE ROWS ===")
print(f"Total duplicate rows: {df.duplicated().sum()}")
print("\n=== DUPLICATE MOVIE TITLES ===")
# Counts every repeat of a title after its first occurrence.
print(f"Duplicate movie titles: {df.duplicated(subset=['movie_title']).sum()}")
print("\n=== BASIC STATISTICS ===")
print(df.describe())

## Step 3: Data Cleaning

#### Handle Missing Values

In [None]:
# Report how much of each column is missing, then impute (or drop) so the
# following cells can rely on the core columns being populated.
print("=== MISSING VALUES PERCENTAGE ===")
missing_percent = (df.isnull().sum() / len(df)) * 100
print(missing_percent.sort_values(ascending=False))

# Handle missing values based on column type and importance
print("\n=== HANDLING MISSING VALUES ===")

# For numerical columns - fill with median
numerical_cols = ['imdb_score', 'title_year', 'duration']
for col in numerical_cols:
    # Coerce before computing the median: if the CSV column was read as
    # text, median() would fail. Unparseable entries become NaN and are
    # imputed together with the genuinely missing ones.
    df[col] = pd.to_numeric(df[col], errors='coerce')
    if df[col].isnull().any():
        median_val = df[col].median()
        if col == 'title_year' and pd.notna(median_val):
            # With an even number of rows the median can end in .5. A year
            # must be whole, and a fractional value would make the later
            # astype('Int64') conversion of title_year raise.
            median_val = float(round(median_val))
        df[col] = df[col].fillna(median_val)
        print(f"Filled {col} missing values with median: {median_val}")

# For categorical columns - fill with mode or 'Unknown'.
# 'genres' is included because the feature-engineering cell splits it with
# string methods and iterates the result, which fails on NaN entries.
categorical_cols = ['content_rating', 'main_genre', 'director_name', 'star_cast', 'genres']
mode_filled_cols = {'content_rating', 'main_genre'}
for col in categorical_cols:
    if not df[col].isnull().any():
        continue
    if col in mode_filled_cols:
        # mode() returns an empty Series when every value is missing.
        modes = df[col].mode()
        mode_val = modes.iloc[0] if not modes.empty else 'Unknown'
        df[col] = df[col].fillna(mode_val)
        print(f"Filled {col} missing values with mode: {mode_val}")
    else:
        df[col] = df[col].fillna('Unknown')
        print(f"Filled {col} missing values with 'Unknown'")

# For movie_title - drop rows with missing titles (essential field)
if df['movie_title'].isnull().any():
    initial_count = len(df)
    df = df.dropna(subset=['movie_title'])
    print(f"Dropped {initial_count - len(df)} rows with missing movie titles")

print(f"\nMissing values after cleaning: {df.isnull().sum().sum()}")

#### Clean Specific Columns

In [None]:
# Normalise the free-text columns: titles, content ratings, people and genres.
print("=== CLEANING MOVIE TITLES ===")
# Remove characters outside a readable whitelist. Apostrophes, ampersands
# and slashes are kept because they are part of many real titles
# ("Schindler's List", "Fast & Furious", "Face/Off"); deleting them mangles
# the title. Whitespace is stripped AFTER the removal so that deleting a
# leading/trailing symbol cannot leave stray spaces at the edges.
df['movie_title'] = (
    df['movie_title']
    .str.replace(r"[^\w\s\-:.,!?()'&/]", '', regex=True)
    .str.strip()
)
print("Cleaned movie titles")

# Clean and standardize content_rating
print("\n=== CLEANING CONTENT RATING ===")
print("Original content ratings:")
print(df['content_rating'].value_counts())
# Upper-case and trim so variants such as "pg-13 " and "PG-13" collapse
# into a single category.
df['content_rating'] = df['content_rating'].str.upper().str.strip()
print("\nStandardized content ratings:")
print(df['content_rating'].value_counts())

# Clean director_name and star_cast
print("\n=== CLEANING DIRECTOR AND CAST ===")
df['director_name'] = df['director_name'].str.strip()
df['star_cast'] = df['star_cast'].str.strip()

# Clean genres column - it is split into individual genre columns later
print("\n=== CLEANING GENRES ===")
df['genres'] = df['genres'].str.strip()
print(f"Sample genres: {df['genres'].head()}")

print("\nColumn cleaning completed!")

#### Handle Duplicates

In [None]:
# Remove duplicate records: exact row copies first, then repeated titles.
print("=== HANDLING DUPLICATES ===")
print(f"Total rows before duplicate removal: {len(df)}")
print(f"Duplicate movie titles: {df.duplicated(subset=['movie_title']).sum()}")

# Show some examples of duplicate titles (every member of each duplicate
# group is listed, including the row that will be kept).
duplicates = df[df.duplicated(subset=['movie_title'], keep=False)].sort_values('movie_title')
if len(duplicates) > 0:
    print("\nSample duplicate titles:")
    print(duplicates[['movie_title', 'title_year', 'imdb_score']].head(10))

# Exact duplicates must be checked BEFORE the title-based pass: once titles
# are unique no two rows can be fully identical, so checking afterwards
# would never find anything.
complete_duplicates = df.duplicated().sum()
if complete_duplicates > 0:
    print(f"\nComplete duplicate rows found: {complete_duplicates}")
    df = df.drop_duplicates()
    print(f"Rows after removing complete duplicates: {len(df)}")
else:
    print("\nNo complete duplicate rows found")

# Drop repeated titles, keeping the first occurrence.
# NOTE(review): de-duplicating on title alone also drops distinct films that
# share a name (e.g. remakes released in different years) - confirm this is
# intended, otherwise add 'title_year' to the subset.
rows_before_title_dedup = len(df)
df = df.drop_duplicates(subset=['movie_title'], keep='first')
print(f"\nRows after duplicate removal: {len(df)}")
# Report the rows actually dropped; len(duplicates) would overstate this
# because it also counts the first occurrence of every title that was kept.
print(f"Removed {rows_before_title_dedup - len(df)} duplicate rows")

#### Convert Data Types

In [None]:
# Force the three numeric columns to proper numeric dtypes and re-impute
# any values that could not be parsed.
print("=== CONVERTING DATA TYPES ===")

# Ensure imdb_score is numeric
df['imdb_score'] = pd.to_numeric(df['imdb_score'], errors='coerce')
print(f"imdb_score converted to numeric, null values: {df['imdb_score'].isnull().sum()}")

# Ensure title_year is integer.
# astype('Int64') refuses non-integral floats (e.g. a 2005.5 median filled in
# earlier), so go through float64 and round to whole years before casting.
df['title_year'] = (
    pd.to_numeric(df['title_year'], errors='coerce')
    .astype('float64')
    .round()
    .astype('Int64')
)
print(f"title_year converted to integer, null values: {df['title_year'].isnull().sum()}")

# Ensure duration is numeric
df['duration'] = pd.to_numeric(df['duration'], errors='coerce')
print(f"duration converted to numeric, null values: {df['duration'].isnull().sum()}")

# Check data types after conversion
print("\n=== DATA TYPES AFTER CONVERSION ===")
print(df.dtypes)

# Check for any remaining null values created during conversion
print("\n=== NULL VALUES AFTER TYPE CONVERSION ===")
print(df.isnull().sum())

# Fill any new null values created during conversion
for col in ('imdb_score', 'duration'):
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())

if df['title_year'].isnull().any():
    year_median = df['title_year'].median()
    # The median of an integer column can be fractional, and a nullable
    # integer column rejects such a fill value, so round it to a whole year.
    # Skip the fill when every year is missing (the median is then NA).
    if pd.notna(year_median):
        df['title_year'] = df['title_year'].fillna(int(round(year_median)))

print("Data type conversion completed!")

#### Create New Features

In [None]:
# Derive analysis-friendly columns: per-position genre columns, a genre
# count, the release decade, and binned score / duration categories.
print("=== CREATING NEW FEATURES ===")

# Separate genres into individual columns
print("\nSeparating genres into individual columns...")
# Split on commas and trim each token. A missing genres value survives
# str.split() as NaN, which is truthy and not iterable, so test for a list
# explicitly; empty tokens (e.g. from a trailing comma) are discarded.
genre_split = df['genres'].str.split(',').apply(
    lambda parts: [g.strip() for g in parts if g.strip()] if isinstance(parts, list) else []
)

# Find the maximum number of genres any movie has (0 for an empty frame,
# where a bare max() would raise ValueError).
max_genres = max((len(genres) for genres in genre_split), default=0)
print(f"Maximum number of genres for any movie: {max_genres}")

# Create individual genre columns; positions a movie does not have are None.
genre_cols = []
for i in range(max_genres):
    col_name = f"genre_{i+1}"
    df[col_name] = genre_split.apply(lambda x, i=i: x[i] if i < len(x) else None)
    genre_cols.append(col_name)
    print(f"Created {col_name}")

# Create a genre count feature
df['genre_count'] = genre_split.apply(len)
print("Created genre_count feature")

# Create decade feature from title_year (e.g. 1994 -> 1990)
df['decade'] = (df['title_year'] // 10) * 10
print("Created decade feature")

# Create IMDB score categories. Bins are right-inclusive: (0,5], (5,7],
# (7,8], (8,10]; include_lowest keeps a score of exactly 0 in 'Poor'
# instead of turning it into a missing category.
df['imdb_category'] = pd.cut(df['imdb_score'],
                           bins=[0, 5, 7, 8, 10],
                           labels=['Poor', 'Average', 'Good', 'Excellent'],
                           include_lowest=True)
print("Created imdb_category feature")

# Create duration categories; boundaries are 90, 120 and 180 (right-inclusive).
df['duration_category'] = pd.cut(df['duration'],
                                bins=[0, 90, 120, 180, float('inf')],
                                labels=['Short', 'Medium', 'Long', 'Very Long'])
print("Created duration_category feature")

# Show the first few rows with new features. Only genre columns that were
# actually created are listed, so datasets with fewer than three genres per
# movie do not raise KeyError.
print("\n=== SAMPLE OF NEW FEATURES ===")
new_feature_cols = genre_cols[:3] + ['genre_count', 'decade', 'imdb_category', 'duration_category']
print(df[['movie_title'] + new_feature_cols].head())

# Show genre distribution
print("\n=== GENRE DISTRIBUTION ===")
print("Top 10 first genres:")
if 'genre_1' in df.columns:
    print(df['genre_1'].value_counts().head(10))

print("\nFeature creation completed!")

## Step 4: Outlier Detection and Handling

In [None]:
# Inspect the numeric columns for outliers (histograms, boxplots, IQR
# fences) and cap only the duration column at its fences.
print("=== OUTLIER DETECTION AND HANDLING ===")

numerical_cols = ['imdb_score', 'title_year', 'duration']

# One histogram per numeric column, side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, col in zip(axes, numerical_cols):
    df[col].hist(ax=ax, bins=30, alpha=0.7)
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Matching row of boxplots to make the outliers visible.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, col in zip(axes, numerical_cols):
    df.boxplot(column=col, ax=ax)
    ax.set_title(f'Boxplot of {col}')

plt.tight_layout()
plt.show()

# IQR rule: anything further than 1.5 * IQR outside the quartiles is
# counted as an outlier. The fences are stored per column for later use.
print("\n=== OUTLIER STATISTICS ===")
outlier_info = {}
for col in numerical_cols:
    values = df[col]
    q_low = values.quantile(0.25)
    q_high = values.quantile(0.75)
    spread = q_high - q_low
    lower_bound = q_low - 1.5 * spread
    upper_bound = q_high + 1.5 * spread

    n_outliers = int(((values < lower_bound) | (values > upper_bound)).sum())
    outlier_info[col] = {
        'count': n_outliers,
        'percentage': (n_outliers / len(df)) * 100,
        'lower_bound': lower_bound,
        'upper_bound': upper_bound
    }

    print(f"{col}:")
    print(f"  Outliers: {n_outliers} ({(n_outliers/len(df)*100):.2f}%)")
    print(f"  Lower bound: {lower_bound:.2f}")
    print(f"  Upper bound: {upper_bound:.2f}")
    print(f"  Min value: {values.min():.2f}")
    print(f"  Max value: {values.max():.2f}")
    print()

# Only duration is capped; IMDB score and year are deliberately left as-is.
print("=== HANDLING OUTLIERS ===")
duration_low = outlier_info['duration']['lower_bound']
duration_high = outlier_info['duration']['upper_bound']
duration_outliers_before = int(
    ((df['duration'] < duration_low) | (df['duration'] > duration_high)).sum()
)

# Clamp durations to the [lower fence, upper fence] interval in one pass.
df['duration'] = np.where(
    df['duration'] < duration_low,
    duration_low,
    np.where(df['duration'] > duration_high, duration_high, df['duration']),
)

print(f"Capped {duration_outliers_before} duration outliers")
print("Note: IMDB score and year outliers were kept as they represent valid extreme values")

print("\nOutlier handling completed!")

## Step 5: Final Checks and Save Clean Data

In [None]:
# Final data quality checks and summary, then persist the cleaned frame.
print("=== FINAL DATA QUALITY CHECKS ===")

# Check cleaned data info
print("Dataset info after cleaning:")
# DataFrame.info() prints its own report and returns None; wrapping it in
# print() would add a stray "None" line.
df.info()

# Check for any remaining missing values (computed once and reused).
null_counts = df.isnull().sum()
total_missing = null_counts.sum()
print(f"\nMissing values after cleaning: {total_missing}")
if total_missing > 0:
    print("Missing values by column:")
    print(null_counts[null_counts > 0])

# Summary statistics
print("\n=== SUMMARY STATISTICS ===")
print(df.describe())

# Show sample of cleaned data
print("\n=== SAMPLE OF CLEANED DATA ===")
print(df.head())

# Data quality metrics
print("\n=== DATA QUALITY METRICS ===")
print(f"Total rows: {len(df)}")
print(f"Total columns: {len(df.columns)}")
print(f"Duplicate rows: {df.duplicated().sum()}")
print("Movies per decade:")
print(df['decade'].value_counts().sort_index())

print("\nContent rating distribution:")
print(df['content_rating'].value_counts())

print("\nTop 10 genres:")
print(df['genre_1'].value_counts().head(10))

print("\nIMDB score distribution:")
print(df['imdb_category'].value_counts())

# Save cleaned data (index=False keeps the row index out of the CSV).
output_filename = 'cleaned_imdb_data.csv'
df.to_csv(output_filename, index=False)
print("\n=== DATA SAVED ===")
print(f"Cleaned data saved to: {output_filename}")
print(f"Shape of saved data: {df.shape}")

# Show final column list, numbered from 1
print("\nFinal columns in cleaned dataset:")
for i, col in enumerate(df.columns, 1):
    print(f"{i:2d}. {col}")

print("\n✅ Data cleaning completed successfully!")

In [None]:
# Print the final frame's size, a human-readable data dictionary for its
# columns, and a closing checklist of the completed cleaning steps.
print("=== FINAL DATASET STRUCTURE ===")
print(f"Dataset shape: {df.shape}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

print("\n=== DATA DICTIONARY ===")
data_dict = {
    'movie_title': 'Movie title (cleaned)',
    'imdb_score': 'IMDB rating score (0-10)',
    'title_year': 'Year the movie was released',
    'content_rating': 'Content rating (G, PG, PG-13, R, etc.)',
    'main_genre': 'Primary genre of the movie',
    'director_name': 'Director name',
    'star_cast': 'Main cast members',
    'genres': 'Original genres string (comma-separated)',
    'duration': 'Movie duration in minutes',
    'genre_1': 'First genre',
    'genre_2': 'Second genre (if applicable)',
    'genre_3': 'Third genre (if applicable)',
    'genre_count': 'Number of genres assigned to the movie',
    'decade': 'Decade the movie was released',
    'imdb_category': 'IMDB score category (Poor/Average/Good/Excellent)',
    'duration_category': 'Duration category (Short/Medium/Long/Very Long)'
}

for col in df.columns:
    description = data_dict.get(col)
    # Dynamically created genre columns beyond the documented ones get a
    # label derived from the column name; any other undocumented column is
    # not listed.
    if description is None and col.startswith('genre_') and col != 'genre_count':
        description = col.replace('_', ' ').title()
    if description is not None:
        print(f"{col:18} - {description}")

print("\n=== READY FOR NEXT STEPS ===")
completed_steps = (
    "✅ Data preprocessing completed",
    "✅ Duplicates removed",
    "✅ Missing values handled",
    "✅ Genres separated into individual columns",
    "✅ New features created",
    "✅ Outliers handled",
    "✅ Data types standardized",
)
for step_message in completed_steps:
    print(step_message)
print("\nThe cleaned dataset is now ready for feature engineering and modeling!")