In [None]:
import pandas as pd

# Show every column whenever a frame is displayed.
pd.set_option('display.max_columns', None)

# Locations of the two input tables.
rating_csv = "/workspaces/codespaces-jupyter/data/rating.csv"
movie_csv = "/workspaces/codespaces-jupyter/data/movie.csv"

# Ratings: read the file, then keep a seeded random subset of 385 rows so
# the same rows are drawn on every run.
rating_df = pd.read_csv(rating_csv).sample(n=385, random_state=955)

# Movies: read in full.
movie_df = pd.read_csv(movie_csv)

# Inner join on movieId keeps only movies that appear in the sampled ratings.
final_df = movie_df.merge(rating_df, how='inner', on='movieId')

# Working frame without the title and timestamp columns.
df = final_df.drop(['title', 'timestamp'], axis=1)

from sklearn.feature_extraction.text import TfidfVectorizer

# Build one TF-IDF feature per genre label.
#
# The genres column is pipe-separated.  Replacing the pipes with spaces and
# relying on the vectorizer's default token pattern (runs of two or more word
# characters) would break any label containing a hyphen, a space or
# punctuation into several unrelated tokens.  Tokenizing on the pipe itself
# keeps every label intact as a single (lower-cased) feature.
tfidf = TfidfVectorizer(token_pattern=r"[^|]+")
tfidf_genres = tfidf.fit_transform(df['genres'])
tfidf_genres_df = pd.DataFrame(
    tfidf_genres.toarray(),
    columns=[f"tfidf_{genre}" for genre in tfidf.get_feature_names_out()],
    index=df.index  # keep rows aligned with df for the concat below
)

# Swap the raw genres column for its TF-IDF features.
df = df.drop(columns=['genres'])
df = pd.concat([df, tfidf_genres_df], axis=1)

import random
import numpy as np

# Seed both random number generators so later random draws are reproducible
random.seed(29)
np.random.seed(29)

# Keep the dependent variable aside so it is not part of the frame that
# receives missing values
y_full = df['rating']

# Predictor-only frame: everything except the rating column
imputation_df = df.drop(columns=['rating'])

# Number of cells (rows x columns) in the predictor frame
total_values = imputation_df.size

# Function to create a dataframe with a specified percentage of missing values
def create_missing_df(base_df, percent_missing):
    """Return a copy of ``base_df`` with a fraction of its cells set to NaN.

    Cells are picked uniformly at random, without replacement, using the
    ``random`` module; call ``random.seed`` beforehand for reproducible output.

    Args:
        base_df: Frame to blank cells in. It is not modified.
        percent_missing: Fraction of cells to blank, between 0 and 1
            inclusive (``0.05`` means 5%).

    Returns:
        A new DataFrame of the same shape, index and columns in which
        ``int(base_df.size * percent_missing)`` cells are NaN.

    Raises:
        ValueError: If ``percent_missing`` is outside ``[0, 1]``.
    """
    if not 0 <= percent_missing <= 1:
        raise ValueError(
            f"percent_missing must be between 0 and 1, got {percent_missing!r}"
        )

    # Size everything from the frame actually passed in, not from a global.
    n_rows, n_cols = base_df.shape
    n_cells = n_rows * n_cols
    num_nulls = int(n_cells * percent_missing)

    # Sample flat cell positions from a range instead of materialising a list
    # of every (row, col) pair; position p is row p // n_cols, col p % n_cols.
    flat_positions = random.sample(range(n_cells), num_nulls)

    blank = np.zeros(n_cells, dtype=bool)
    blank[np.asarray(flat_positions, dtype=np.intp)] = True

    # One vectorised mask instead of a Python-level write per cell.
    return base_df.mask(blank.reshape(n_rows, n_cols))

# Build the 5%, 10% and 15% missing-value variants of the predictor frame.
# The generator is consumed left to right, so the three calls run in the same
# order as if they were written out one per line.
df_5, df_10, df_15 = (
    create_missing_df(imputation_df, fraction)
    for fraction in (0.05, 0.10, 0.15)
)

from sklearn.impute import SimpleImputer, KNNImputer
from fancyimpute import IterativeImputer  # for multivariate regression

# Put all missing dataframes into a dictionary for easier looping
missing_dfs = {
    '5%': df_5,
    '10%': df_10,
    '15%': df_15
}

# Store results, keyed as "<missingness level>_<method>"
imputed_dfs = {}

# The loop variable is deliberately not called `df`, so the notebook-level
# feature frame of that name is not overwritten.
for key, missing_df in missing_dfs.items():
    print(f"\nProcessing {key} missing data...")

    # 1. Case-wise deletion: keep only rows with no missing value
    imputed_dfs[f'{key}_casewise'] = missing_df.dropna()

    # 2. Fill with 0
    imputed_dfs[f'{key}_zero'] = missing_df.fillna(0)

    # The imputers below return bare arrays.  Rebuild each frame with the
    # original index as well as the columns, so rows can still be matched to
    # the target by label whatever the index looks like.

    # 3. Fill with mean
    mean_imputer = SimpleImputer(strategy='mean')
    imputed_dfs[f'{key}_mean'] = pd.DataFrame(
        mean_imputer.fit_transform(missing_df),
        columns=missing_df.columns,
        index=missing_df.index,
    )

    # 4. KNN imputation (using 3 neighbors)
    knn_imputer = KNNImputer(n_neighbors=3)
    imputed_dfs[f'{key}_knn'] = pd.DataFrame(
        knn_imputer.fit_transform(missing_df),
        columns=missing_df.columns,
        index=missing_df.index,
    )

    # 5. Multivariate regression imputation (IterativeImputer is MICE-based)
    mice_imputer = IterativeImputer(max_iter=5, random_state=32)
    imputed_dfs[f'{key}_regression'] = pd.DataFrame(
        mice_imputer.fit_transform(missing_df),
        columns=missing_df.columns,
        index=missing_df.index,
    )

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import category_encoders as ce

# Cross-validated error metrics per imputed dataset, keyed like imputed_dfs
regression_results = {}

# Identifier columns that are target-encoded inside each fold
id_cols = ['userId', 'movieId']
n_splits = 5

for name, imputed_df in imputed_dfs.items():
    print(f"\nRunning model for {name} imputed data...")
    X = imputed_df.copy()
    y = y_full.reindex(X.index)

    # Keep only rows with a known target and no remaining missing feature
    valid_idx = y.notnull()
    X = X[valid_idx].dropna()
    y = y.loc[X.index]

    # Case-wise deletion can leave very few complete rows, and KFold cannot
    # split fewer samples than folds.  Skip such a variant instead of letting
    # one small dataset abort the whole comparison.
    if len(X) < n_splits:
        print(f"Skipping {name}: only {len(X)} usable rows for {n_splits}-fold CV")
        continue

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=7)
    y_pred = np.zeros(len(X))  # out-of-fold predictions, filled fold by fold

    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X.iloc[train_idx].copy(), X.iloc[val_idx].copy()
        y_train = y.iloc[train_idx]

        # Target encoding is fitted on the training fold only, so validation
        # targets never influence the encoded values.
        encoder = ce.TargetEncoder(cols=list(id_cols), smoothing=5.0)
        encoder.fit(X_train[id_cols], y_train)

        # Replace the raw ids with their encodings by direct assignment;
        # DataFrame.update would skip NA values and write in place instead.
        X_train[id_cols] = encoder.transform(X_train[id_cols])
        X_val[id_cols] = encoder.transform(X_val[id_cols])

        # Train model
        model = GradientBoostingRegressor(
            n_estimators=150,
            learning_rate=0.1,
            max_depth=4,
            random_state=72
        )
        model.fit(X_train, y_train)
        y_pred[val_idx] = model.predict(X_val)

    # Evaluate on the out-of-fold predictions.
    # NOTE: the case-wise variants are scored on fewer rows than the other
    # methods, so their metrics are not computed on the same samples.
    mae = mean_absolute_error(y, y_pred)
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)

    regression_results[name] = {
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse
    }

# Display results
def _results_for_level(level_prefix):
    """Return the entries of regression_results whose key starts with level_prefix."""
    return {
        method: results
        for method, results in regression_results.items()
        if method.startswith(level_prefix)
    }


results_5 = _results_for_level('5%')
results_10 = _results_for_level('10%')
results_15 = _results_for_level('15%')

# One row per imputation method, one column per metric
df_5_results = pd.DataFrame(results_5).transpose()
df_10_results = pd.DataFrame(results_10).transpose()
df_15_results = pd.DataFrame(results_15).transpose()

# Print each table with the metrics in a fixed order, rounded to 4 decimals
metric_columns = ['MAE', 'MSE', 'RMSE']
for heading, frame in (
    ("\nRegression Results for 5% Missing Data:", df_5_results),
    ("\nRegression Results for 10% Missing Data:", df_10_results),
    ("\nRegression Results for 15% Missing Data:", df_15_results),
):
    print(heading)
    print(frame[metric_columns].round(4))


Processing 5% missing data...

Processing 10% missing data...

Processing 15% missing data...





Running model for 5%_casewise imputed data...

Running model for 5%_zero imputed data...

Running model for 5%_mean imputed data...

Running model for 5%_knn imputed data...

Running model for 5%_regression imputed data...

Running model for 10%_casewise imputed data...

Running model for 10%_zero imputed data...

Running model for 10%_mean imputed data...

Running model for 10%_knn imputed data...

Running model for 10%_regression imputed data...

Running model for 15%_casewise imputed data...

Running model for 15%_zero imputed data...

Running model for 15%_mean imputed data...

Running model for 15%_knn imputed data...

Running model for 15%_regression imputed data...

Regression Results for 5% Missing Data:
                  MAE     MSE    RMSE
5%_casewise    0.7007  0.8853  0.9409
5%_zero        0.8046  1.0983  1.0480
5%_mean        0.8034  1.0943  1.0461
5%_knn         0.8031  1.0972  1.0475
5%_regression  0.7989  1.0918  1.0449

Regression Results for 10% Missing Data:
       