In [20]:
import pandas as pd

# Load the datasets
rating_csv = "/workspaces/codespaces-jupyter/data/rating.csv"
rating_df = pd.read_csv(rating_csv).sample(1000, random_state=955)

movie_csv = "/workspaces/codespaces-jupyter/data/movie.csv"
movie_df = pd.read_csv(movie_csv)

# Merge the datasets
final_df = pd.merge(movie_df, rating_df, on='movieId', how='inner')

pd.set_option('display.max_columns', None)
df = final_df.drop(columns=['title', 'timestamp'])

from sklearn.feature_extraction.text import TfidfVectorizer

# Convert pipe-separated genres into space-separated strings
df['genres'] = df['genres'].str.replace('|', ' ', regex=False)

# TF-IDF transformation
tfidf = TfidfVectorizer()
tfidf_genres = tfidf.fit_transform(df['genres'])
tfidf_genres_df = pd.DataFrame(
    tfidf_genres.toarray(), 
    columns=[f"tfidf_{genre}" for genre in tfidf.get_feature_names_out()],
    index=df.index
)

# Drop and replace
df = df.drop(columns=['genres'])
df = pd.concat([df, tfidf_genres_df], axis=1)

# Target encoding: Replace userId/movieId with their average rating
user_means = df.groupby('userId')['rating'].mean()
movie_means = df.groupby('movieId')['rating'].mean()

df['userId_encoded'] = df['userId'].map(user_means)
df['movieId_encoded'] = df['movieId'].map(movie_means)

# Drop the original IDs
df = df.drop(columns=['userId', 'movieId'])

import random
import numpy as np

# Store the target variable separately
y_full = df['rating']

# Create an imputation dataframe by dropping the dependent variable
imputation_df = df.drop('rating', axis=1)

# Set a random seed for reproducibility
random.seed(29)
np.random.seed(29)

# Calculate the total number of values in the dataframe
total_values = imputation_df.size

# Function to create a dataframe with a specified percentage of missing values
def create_missing_df(base_df, percent_missing):
    df_missing = base_df.copy()
    num_nulls = int(total_values * percent_missing)
    indices = [(row, col) for row in range(df_missing.shape[0]) for col in range(df_missing.shape[1])]
    random_indices = random.sample(indices, num_nulls)
    for row, col in random_indices:
        df_missing.iat[row, col] = np.nan
    return df_missing

# Create dataframes with different levels of missingness
df_5 = create_missing_df(imputation_df, 0.05)
df_10 = create_missing_df(imputation_df, 0.10)
df_15 = create_missing_df(imputation_df, 0.15)
df_20 = create_missing_df(imputation_df, 0.20)

from sklearn.impute import SimpleImputer, KNNImputer
from fancyimpute import IterativeImputer  # for multivariate regression

# Put all missing dataframes into a dictionary for easier looping
missing_dfs = {
    '5%': df_5,
    '10%': df_10,
    '15%': df_15,
    '20%': df_20
}

# Store results
imputed_dfs = {}

for key, df in missing_dfs.items():
    print(f"\nProcessing {key} missing data...")

    # 1. Case-wise deletion
    imputed_dfs[f'{key}_casewise'] = df.dropna()

    # 2. Fill with 0
    imputed_dfs[f'{key}_zero'] = df.fillna(0)

    # 3. Fill with mean
    mean_imputer = SimpleImputer(strategy='mean')
    imputed_dfs[f'{key}_mean'] = pd.DataFrame(mean_imputer.fit_transform(df), columns=df.columns)

    # 4. KNN imputation (using 5 neighbors)
    knn_imputer = KNNImputer(n_neighbors=5)
    imputed_dfs[f'{key}_knn'] = pd.DataFrame(knn_imputer.fit_transform(df), columns=df.columns)

    # 5. Multivariate regression imputation (IterativeImputer is MICE-based)
    mice_imputer = IterativeImputer(random_state=42)
    imputed_dfs[f'{key}_regression'] = pd.DataFrame(mice_imputer.fit_transform(df), columns=df.columns)

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Store results
regression_results = {}

# Loop through each imputed DataFrame
for name, imputed_df in imputed_dfs.items():
    # Copy the features
    X = imputed_df.copy()

    # Reindex the target variable to match X's index
    y = y_full.reindex(X.index)

    # Drop any rows where y is NaN (can happen after case-wise deletion)
    valid_idx = y.notnull()
    X = X[valid_idx].dropna()
    y = y.loc[X.index]

    # Define cross-validator
    kf = KFold(n_splits=5, shuffle=True, random_state=7)

    # Fit and predict with cross-validation
    model = RandomForestRegressor(n_estimators=100, random_state=72)
    y_pred = cross_val_predict(model, X, y, cv=kf)

    # Evaluate
    mae = mean_absolute_error(y, y_pred)
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)

    # Store metrics
    regression_results[name] = {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse
    }

# Display results
results_5 = {method: results for method, results in regression_results.items() if method.startswith('5%')}
results_10 = {method: results for method, results in regression_results.items() if method.startswith('10%')}
results_15 = {method: results for method, results in regression_results.items() if method.startswith('15%')}
results_20 = {method: results for method, results in regression_results.items() if method.startswith('20%')}

# Convert the dictionaries into DataFrames
df_5_results = pd.DataFrame(results_5).T
df_10_results = pd.DataFrame(results_10).T
df_15_results = pd.DataFrame(results_15).T
df_20_results = pd.DataFrame(results_20).T

# Display the results
print("\nRegression Results for 5% Missing Data:")
print(df_5_results[['MAE', 'MSE', 'RMSE']].round(4))

print("\nRegression Results for 10% Missing Data:")
print(df_10_results[['MAE', 'MSE', 'RMSE']].round(4))

print("\nRegression Results for 15% Missing Data:")
print(df_15_results[['MAE', 'MSE', 'RMSE']].round(4))

print("\nRegression Results for 20% Missing Data:")
print(df_20_results[['MAE', 'MSE', 'RMSE']].round(4))


Processing 5% missing data...

Processing 10% missing data...





Processing 15% missing data...





Processing 20% missing data...





Regression Results for 5% Missing Data:
                  MAE     MSE    RMSE
5%_casewise    0.0382  0.0211  0.1453
5%_zero        0.0492  0.0347  0.1862
5%_mean        0.0407  0.0249  0.1577
5%_knn         0.0426  0.0224  0.1495
5%_regression  0.0394  0.0225  0.1500

Regression Results for 10% Missing Data:
                   MAE     MSE    RMSE
10%_casewise    0.0829  0.0358  0.1892
10%_zero        0.0846  0.0704  0.2653
10%_mean        0.0719  0.0448  0.2117
10%_knn         0.0852  0.0664  0.2578
10%_regression  0.0603  0.0436  0.2088

Regression Results for 15% Missing Data:
                   MAE     MSE    RMSE
15%_casewise    0.2830  0.2453  0.4953
15%_zero        0.1199  0.1289  0.3590
15%_mean        0.1110  0.0954  0.3089
15%_knn         0.1446  0.1249  0.3535
15%_regression  0.1166  0.1031  0.3212

Regression Results for 20% Missing Data:
                   MAE     MSE    RMSE
20%_casewise    0.5340  0.5302  0.7281
20%_zero        0.1311  0.1372  0.3705
20%_mean        0.13