In [1]:

import ast
import gc
import os
import pickle
import re
import warnings

import joblib
import lightgbm as lgb
import nltk
import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from scipy.sparse import csr_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.utils.extmath import randomized_svd
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertModel


# Process-wide setup: set the CUDA launch flag (presumably to get synchronous,
# easier-to-debug CUDA errors - TODO confirm intent) and silence all warnings.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
warnings.filterwarnings('ignore')

# Fetch the NLTK resources requested by this script, without progress output.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource, quiet=True)

def load_data(file_paths):
    """Read every CSV listed in *file_paths* into a DataFrame.

    Args:
        file_paths: Mapping of dataset name to the path of its CSV file.

    Returns:
        dict mapping each dataset name to the DataFrame read from its path.
    """
    print("Loading datasets...")
    return {
        dataset: pd.read_csv(csv_path, low_memory=False)
        for dataset, csv_path in file_paths.items()
    }

def preprocess_data(dfs):
    """Merge the movie, keyword and credit tables and derive text features.

    Args:
        dfs: Mapping holding DataFrames under the keys 'movies', 'keywords'
            and 'credits'. The frames are not modified.

    Returns:
        The merged DataFrame (movies left-joined with keywords and credits on
        the stringified movie id) with these added/rewritten columns:
        'cleaned_overview', 'cleaned_keywords', 'release_date' (datetime),
        'release_year', 'genres' (list of names), 'genres_str', 'cast' (up to
        five names), 'cast_str', 'crew' (parsed list), 'director' and
        'combined_features' (space-joined text used for encoding).
    """
    print("Preprocessing data...")

    # Rename 'id' to 'movieId' in movies_df; rename() returns a new frame.
    movies_df = dfs['movies'].rename(columns={'id': 'movieId'})
    movies_df['movieId'] = movies_df['movieId'].astype(str)

    # Stringify the join keys on copies so the caller's frames stay untouched.
    keywords_df = dfs['keywords'].assign(id=dfs['keywords']['id'].astype(str))
    credits_df = dfs['credits'].assign(id=dfs['credits']['id'].astype(str))

    # Merge datasets
    merged_df = movies_df.merge(keywords_df, left_on='movieId', right_on='id', how='left')
    merged_df = merged_df.merge(credits_df, left_on='movieId', right_on='id', how='left')

    # Clean and preprocess text data
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def clean_text(text):
        """Lower-case, strip punctuation, drop stop words and lemmatize."""
        if isinstance(text, str):
            text = re.sub(r'[^\w\s]', '', text.lower())
            tokens = word_tokenize(text)
            return ' '.join([lemmatizer.lemmatize(word) for word in tokens if word not in stop_words])
        return ''

    def parse_list(raw):
        """Parse a stringified list literal without executing code.

        ast.literal_eval only accepts Python literals, so cell contents from
        the CSV files can never run arbitrary code (unlike eval). Anything
        that is missing, malformed or not a list becomes [].
        """
        if isinstance(raw, list):
            return raw
        if not isinstance(raw, str):
            return []
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return []
        return parsed if isinstance(parsed, list) else []

    def names_of(items, limit=None):
        """Return the 'name' of the first *limit* dict entries in *items*."""
        return [item['name'] for item in items[:limit]
                if isinstance(item, dict) and 'name' in item]

    def directors_of(crew):
        """Return the space-joined names of crew entries whose job is 'Director'."""
        return ' '.join(
            member['name'] for member in crew
            if isinstance(member, dict) and member.get('job') == 'Director' and 'name' in member
        )

    # NOTE(review): 'keywords' is cleaned as raw cell text, not parsed first, so
    # presumably structural tokens of the serialized list leak into
    # 'cleaned_keywords' - confirm whether keyword names should be extracted.
    for column in ['overview', 'keywords']:
        tqdm.pandas(desc=f"Cleaning {column}")
        merged_df[f'cleaned_{column}'] = merged_df[column].progress_apply(clean_text)

    # Process other columns
    merged_df['release_date'] = pd.to_datetime(merged_df['release_date'], errors='coerce')
    merged_df['release_year'] = merged_df['release_date'].dt.year

    merged_df['genres'] = merged_df['genres'].apply(parse_list).apply(names_of)
    merged_df['genres_str'] = merged_df['genres'].apply(' '.join)

    # Only the first five cast entries are kept.
    merged_df['cast'] = merged_df['cast'].apply(parse_list).apply(lambda entries: names_of(entries, 5))
    merged_df['cast_str'] = merged_df['cast'].apply(' '.join)

    merged_df['crew'] = merged_df['crew'].apply(parse_list)
    merged_df['director'] = merged_df['crew'].apply(directors_of)

    merged_df['combined_features'] = (
        merged_df['cleaned_overview'] + ' ' +
        merged_df['cleaned_keywords'] + ' ' +
        merged_df['genres_str'] + ' ' +
        merged_df['cast_str'] + ' ' +
        merged_df['director']
    )

    return merged_df

def engineer_features(df):
    """Add model-ready numeric, categorical and genre indicator columns.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with the columns 'popularity', 'vote_average',
            'vote_count', 'release_year', 'adult', 'status',
            'original_language' and 'genres' (an iterable of genre names per
            row).

    Returns:
        The same DataFrame with the numeric columns coerced, median-filled and
        min-max scaled, the categorical columns replaced by integer codes, and
        one 0/1 column 'genre_<name>' per distinct genre.
    """
    print("Engineering features...")
    numeric_cols = ['popularity', 'vote_average', 'vote_count', 'release_year']

    for col in numeric_cols:
        # Ensure that numerical columns are numeric
        df[col] = pd.to_numeric(df[col], errors='coerce')
        # Fill NaNs with median values. Assign the result back instead of
        # calling fillna(inplace=True) on the column selection: that chained
        # form is deprecated and does not update df under copy-on-write.
        df[col] = df[col].fillna(df[col].median())

    # Normalize numerical features
    scaler = MinMaxScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # Encode categorical features (the encoder is refit for every column)
    for col in ['adult', 'status', 'original_language']:
        df[col] = LabelEncoder().fit_transform(df[col].astype(str))

    # Create genre one-hot encoding
    genres = set()
    for genre_list in df['genres']:
        genres.update(genre_list)
    # Sorted so the genre columns appear in the same order on every run;
    # iterating the raw set would make the column order vary between runs.
    for genre in sorted(genres):
        df[f'genre_{genre}'] = df['genres'].apply(lambda names, genre=genre: 1 if genre in names else 0)

    return df

class DistilBERTEncoder:
    """Encodes texts into fixed-size vectors with a pretrained DistilBERT."""

    def __init__(self):
        """Load the 'distilbert-base-uncased' tokenizer and model.

        The model is moved to CUDA when available, otherwise it stays on CPU.
        """
        use_cuda = torch.cuda.is_available()
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.device = torch.device('cuda' if use_cuda else 'cpu')
        self.model.to(self.device)

    def encode(self, texts, batch_size=32):
        """Encode *texts* batch by batch and stack the results.

        Args:
            texts: Sliceable sequence of strings.
            batch_size: Number of texts tokenized and encoded per forward pass.

        Returns:
            A 2-D numpy array with one row per text, taken from the last
            hidden state at sequence position 0 (presumably the [CLS] token).
        """
        chunks = []
        batch_starts = range(0, len(texts), batch_size)
        for start in tqdm(batch_starts, desc="Encoding texts"):
            tokenized = self.tokenizer(
                texts[start:start + batch_size],
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors='pt',
            )
            tokenized = {name: tensor.to(self.device) for name, tensor in tokenized.items()}
            # Inference only: no gradients are needed.
            with torch.no_grad():
                model_output = self.model(**tokenized)
            first_position = model_output.last_hidden_state[:, 0, :]
            chunks.append(first_position.cpu().numpy())
        return np.vstack(chunks)

def content_based_recommendations(movie_title, df, encodings, top_n=10):
    """Return the titles whose encodings are most similar to *movie_title*.

    Args:
        movie_title: Title to look up; matched case-insensitively after
            stripping surrounding whitespace.
        df: DataFrame with a 'title' column whose row order matches the rows
            of *encodings*. It is not modified.
        encodings: 2-D array with one encoding per row of *df*.
        top_n: Maximum number of titles to return.

    Returns:
        Up to *top_n* titles ordered by decreasing cosine similarity to the
        first matching row, or [] (after printing a message) when the title is
        not found.
    """
    # Normalize the input movie title
    movie_title = movie_title.strip().lower()

    # Normalize the titles locally rather than writing a helper column into df
    normalized_titles = df['title'].str.strip().str.lower()

    # Use row positions, not index labels, because encodings is positional.
    matches = np.flatnonzero((normalized_titles == movie_title).to_numpy())
    if matches.size == 0:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return []
    query_pos = matches[0]

    movie_encoding = encodings[query_pos].reshape(1, -1)
    cosine_sim = cosine_similarity(movie_encoding, encodings).flatten()

    # Stable descending order, then drop the query row explicitly: with
    # duplicate or identical encodings the query is not guaranteed to rank first.
    ranked = np.argsort(-cosine_sim, kind='stable')
    ranked = ranked[ranked != query_pos][:top_n]
    return df['title'].iloc[ranked].tolist()

class NCF(nn.Module):
    """Neural collaborative filtering model over user and movie embeddings.

    The two embeddings are concatenated and passed through a stack of
    Linear -> ReLU -> BatchNorm1d -> Dropout(0.2) groups, followed by a final
    linear layer that produces one score per (user, movie) pair.
    """

    def __init__(self, num_users, num_movies, embedding_size=64, layers=(128, 64, 32)):
        """Build the embeddings and the fully connected stack.

        Args:
            num_users: Number of distinct user indices.
            num_movies: Number of distinct movie indices.
            embedding_size: Width of each embedding vector.
            layers: Hidden layer sizes, in order. Any sequence is accepted; the
                default is a tuple so no mutable object is shared across calls.
        """
        super().__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.movie_embedding = nn.Embedding(num_movies, embedding_size)

        self.fc_layers = nn.ModuleList()
        input_size = embedding_size * 2
        for layer_size in layers:
            self.fc_layers.append(nn.Linear(input_size, layer_size))
            self.fc_layers.append(nn.ReLU())
            self.fc_layers.append(nn.BatchNorm1d(layer_size))
            self.fc_layers.append(nn.Dropout(0.2))
            input_size = layer_size

        # input_size is the width of the last hidden layer, or the concatenated
        # embedding width when no hidden layers were requested.
        self.output_layer = nn.Linear(input_size, 1)

    def forward(self, user_input, movie_input):
        """Score each (user, movie) index pair.

        Args:
            user_input: Tensor of user indices.
            movie_input: Tensor of movie indices, same shape as *user_input*.

        Returns:
            Tensor of scores with the same shape as the inputs.
        """
        user_embedded = self.user_embedding(user_input)
        movie_embedded = self.movie_embedding(movie_input)

        vector = torch.cat([user_embedded, movie_embedded], dim=-1)
        for layer in self.fc_layers:
            vector = layer(vector)

        output = self.output_layer(vector)
        # Drop only the trailing size-1 dimension: a bare squeeze() would also
        # remove the batch dimension when the batch holds a single example.
        return output.squeeze(-1)

class MovieRatingDataset(Dataset):
    """Index-aligned (user id, movie id, rating) triples as a torch Dataset."""

    def __init__(self, user_ids, movie_ids, ratings):
        """Store the three parallel, indexable sequences as given."""
        self.user_ids, self.movie_ids, self.ratings = user_ids, movie_ids, ratings

    def __len__(self):
        """Return the number of ratings."""
        return len(self.ratings)

    def __getitem__(self, idx):
        """Return the (user id, movie id, rating) triple at position *idx*."""
        sample = (self.user_ids[idx], self.movie_ids[idx], self.ratings[idx])
        return sample

def train_ncf_model(ratings_df, epochs=25, batch_size=1024):
    """Train an NCF model on the ratings and return it with its id encoders.

    The ratings are split 80/20 into training and validation sets. After each
    epoch the mean per-batch validation loss drives the learning-rate
    scheduler, and the weights with the lowest validation loss so far are
    written to 'best_ncf_model.pth'. Those weights are reloaded before
    returning.

    Args:
        ratings_df: DataFrame with 'userId', 'movieId' and 'rating' columns.
        epochs: Number of passes over the training data.
        batch_size: Batch size for both data loaders.

    Returns:
        Tuple (model, user_encoder, movie_encoder).
    """
    print("Training NCF model...")

    user_encoder, movie_encoder = LabelEncoder(), LabelEncoder()
    encoded_users = user_encoder.fit_transform(ratings_df['userId'])
    encoded_movies = movie_encoder.fit_transform(ratings_df['movieId'])
    targets = ratings_df['rating'].values

    (train_users, val_users,
     train_movies, val_movies,
     train_targets, val_targets) = train_test_split(
        encoded_users, encoded_movies, targets, test_size=0.2, random_state=42)

    train_loader = DataLoader(
        MovieRatingDataset(train_users, train_movies, train_targets),
        batch_size=batch_size,
        shuffle=True,
    )
    val_loader = DataLoader(
        MovieRatingDataset(val_users, val_movies, val_targets),
        batch_size=batch_size,
    )

    model = NCF(len(user_encoder.classes_), len(movie_encoder.classes_))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.5)

    def to_device(batch):
        """Move one (users, movies, ratings) batch to the training device."""
        users, movies, scores = batch
        return users.to(device), movies.to(device), scores.float().to(device)

    lowest_val_loss = float('inf')
    for epoch in range(epochs):
        # Training pass
        model.train()
        train_total = 0
        for batch in train_loader:
            users, movies, scores = to_device(batch)

            optimizer.zero_grad()
            batch_loss = criterion(model(users, movies), scores)
            batch_loss.backward()
            optimizer.step()

            train_total += batch_loss.item()

        # Validation pass, without gradient tracking
        model.eval()
        val_total = 0
        with torch.no_grad():
            for batch in val_loader:
                users, movies, scores = to_device(batch)
                val_total += criterion(model(users, movies), scores).item()

        train_loss = train_total / len(train_loader)
        val_loss = val_total / len(val_loader)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

        scheduler.step(val_loss)

        # Checkpoint whenever the validation loss improves.
        if val_loss < lowest_val_loss:
            lowest_val_loss = val_loss
            torch.save(model.state_dict(), 'best_ncf_model.pth')

    model.load_state_dict(torch.load('best_ncf_model.pth'))
    return model, user_encoder, movie_encoder

def train_lightgbm_model(df, ratings_df=None):
    """Train a LightGBM regressor that predicts 'vote_average'.

    Args:
        df: DataFrame with the engineered columns 'popularity',
            'vote_average', 'vote_count', 'release_year', 'adult', 'status',
            'original_language' and any 'genre_*' indicator columns.
        ratings_df: Unused. Accepted only because main() in this file passes
            the ratings frame as a second argument.

    Returns:
        The trained booster. It is also written to 'lightgbm_model.joblib'.
    """
    print("Training LightGBM model...")

    target = 'vote_average'
    candidate_features = ['popularity', 'vote_average', 'vote_count', 'release_year', 'adult', 'status', 'original_language'] + \
                         [col for col in df.columns if col.startswith('genre_')]
    # The target must not be one of the inputs: with 'vote_average' among the
    # features the model only learns to copy it and the reported RMSE is
    # meaningless.
    features = [col for col in candidate_features if col != target]
    X = df[features]
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_test, label=y_test)

    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'num_leaves': 31,
        'learning_rate': 0.01,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1
    }

    callbacks = [
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100)
    ]

    # NOTE(review): the held-out split drives early stopping and is also the
    # set the RMSE below is reported on, so that RMSE is likely optimistic.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        callbacks=callbacks
    )

    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"LightGBM Test RMSE: {rmse:.4f}")

    joblib.dump(model, 'lightgbm_model.joblib')
    return model


class MemoryEfficientSVDModel:
    """Latent-factor rating model backed by two factor matrices.

    Holds one factor row per encoded user and per encoded movie, plus the
    encoders that map raw ids to row numbers.
    """

    def __init__(self, user_factors, movie_factors, user_encoder, movie_encoder):
        """Store the factor matrices and the id encoders as given."""
        self.user_factors = user_factors
        self.movie_factors = movie_factors
        self.user_encoder = user_encoder
        self.movie_encoder = movie_encoder

    def predict(self, user, movie):
        """Predict the rating *user* would give *movie*.

        Both ids are converted to str before encoding. The dot product of the
        two factor rows is limited to the range 0.5..5. When an encoder raises
        ValueError for an id, 2.5 is returned instead.
        """
        try:
            user_row = self.user_encoder.transform([str(user)])[0]
            movie_row = self.movie_encoder.transform([str(movie)])[0]
            score = np.dot(self.user_factors[user_row], self.movie_factors[movie_row])
        except ValueError:
            # Fallback rating when the user or movie cannot be encoded.
            return 2.5
        # Clip prediction between 0.5 and 5
        if 0.5 > score:
            return 0.5
        if 5 < score:
            return 5
        return score

    def save(self, path):
        """Write both factor matrices and both encoders to files prefixed by *path*."""
        user_factors, movie_factors = self.user_factors, self.movie_factors
        np.save(f"{path}_user_factors.npy", user_factors)
        np.save(f"{path}_movie_factors.npy", movie_factors)
        user_encoder, movie_encoder = self.user_encoder, self.movie_encoder
        joblib.dump(user_encoder, f"{path}_user_encoder.joblib")
        joblib.dump(movie_encoder, f"{path}_movie_encoder.joblib")

    @classmethod
    def load(cls, path):
        """Rebuild a model from the files written by save() under *path*."""
        return cls(
            np.load(f"{path}_user_factors.npy"),
            np.load(f"{path}_movie_factors.npy"),
            joblib.load(f"{path}_user_encoder.joblib"),
            joblib.load(f"{path}_movie_encoder.joblib"),
        )

def train_svd_model(ratings_df, n_factors=100, n_iter=20, batch_size=50000, n_jobs=-1):
    """Fit user and movie factor matrices from the ratings, batch by batch.

    The user x movie rating matrix is built as a sparse matrix, split into
    consecutive blocks of *batch_size* user rows, and each block is factorized
    independently with randomized_svd (in parallel via joblib).

    Side effect: the 'movieId' and 'userId' columns of *ratings_df* are
    converted to str in place.

    Args:
        ratings_df: DataFrame with 'userId', 'movieId' and 'rating' columns.
        n_factors: Number of components requested from each block's SVD.
        n_iter: Passed to randomized_svd as n_iter.
        batch_size: Number of user rows per block.
        n_jobs: Passed to joblib.Parallel.

    Returns:
        A MemoryEfficientSVDModel holding the factor matrices and the fitted
        user/movie LabelEncoders.
    """
    print("Training memory-efficient SVD model with parallel processing...")

    # Ensure movieId and userId are strings
    ratings_df['movieId'] = ratings_df['movieId'].astype(str)
    ratings_df['userId'] = ratings_df['userId'].astype(str)

    # Encode user and movie IDs
    user_encoder = LabelEncoder()
    movie_encoder = LabelEncoder()

    user_ids = user_encoder.fit_transform(ratings_df['userId'])
    movie_ids = movie_encoder.fit_transform(ratings_df['movieId'])

    # Convert to CSR matrix
    # Rows are encoded users, columns are encoded movies, values are ratings.
    ratings_sparse = csr_matrix((ratings_df['rating'], (user_ids, movie_ids)))

    n_users, n_movies = ratings_sparse.shape

    # Initialize factor matrices
    user_factors = np.zeros((n_users, n_factors))
    movie_factors = np.zeros((n_movies, n_factors))

    def process_batch(start, end):
        """Factorize the block of user rows [start, end) on its own."""
        batch = ratings_sparse[start:end, :]
        # NOTE(review): if a block has fewer rows than n_factors the SVD
        # presumably returns fewer components, which would break the
        # assignment into user_factors below - TODO confirm.
        U, S, Vt = randomized_svd(batch, n_components=n_factors, n_iter=n_iter, random_state=42)
        return U, S, Vt

    # Process data in batches using parallel processing
    total_batches = (n_users + batch_size - 1) // batch_size  # ceiling division
    results = Parallel(n_jobs=n_jobs)(
        delayed(process_batch)(start, min(start + batch_size, n_users))
        for start in tqdm(range(0, n_users, batch_size), total=total_batches, desc="SVD Training")
    )

    # Aggregate results
    # results is consumed in the order the blocks were submitted, so block i
    # covers user rows [i * batch_size, (i + 1) * batch_size).
    for i, (U, S, Vt) in enumerate(results):
        start = i * batch_size
        end = min((i + 1) * batch_size, n_users)
        # Each side is scaled by sqrt(S), so user @ movie.T reproduces U S Vt
        # within a single block.
        user_factors[start:end] = U * np.sqrt(S)
        # NOTE(review): movie factors from independently factorized blocks are
        # summed as if their components were aligned across blocks; separate
        # SVDs give no such guarantee - confirm this aggregation is intended.
        movie_factors += Vt.T * np.sqrt(S)

    # Normalize factor matrices
    # NOTE(review): each user row received exactly one block's factors, yet it
    # is still divided by the number of blocks - confirm the intended scaling.
    user_factors /= len(results)
    movie_factors /= len(results)

    model = MemoryEfficientSVDModel(user_factors, movie_factors, user_encoder, movie_encoder)

    return model


def generate_cf_recommendations(movie_id, df, ratings_df, svd_model, top_n=20, n_users=100, n_movies=5000):
    """Recommend titles liked, per the SVD factors, by users who rated *movie_id*.

    A random sample of up to *n_users* users who rated the movie is taken.
    Every movie those users rated, restricted to the *n_movies* most-rated
    movies overall, is scored by the mean predicted rating across the sample.

    Side effect: the 'movieId' columns of *ratings_df* and *df* are converted
    to str in place.

    Args:
        movie_id: Id of the query movie; compared as a string.
        df: DataFrame with 'movieId' and 'title' columns.
        ratings_df: DataFrame with 'userId' and 'movieId' columns.
        svd_model: Object exposing user_factors, movie_factors, user_encoder
            and movie_encoder.
        top_n: Maximum number of movie ids to keep after ranking.
        n_users: Maximum number of users to sample.
        n_movies: Number of most-rated movies considered as candidates.

    Returns:
        Titles found in *df* for the top-ranked movie ids, ordered from best
        to worst score. [] when there is nothing to rank or when scoring
        fails.
    """
    print(f"Generating CF recommendations for movie_id: {movie_id}")

    movie_id = str(movie_id)
    ratings_df['movieId'] = ratings_df['movieId'].astype(str)
    df['movieId'] = df['movieId'].astype(str)

    users_who_rated = ratings_df.loc[ratings_df['movieId'] == movie_id, 'userId'].unique()

    if len(users_who_rated) == 0:
        print(f"No users found who rated movie_id: {movie_id}")
        return []  # Return an empty list if no users rated this movie

    sampled_users = np.random.choice(users_who_rated, min(n_users, len(users_who_rated)), replace=False)
    print(f"Number of sampled users: {len(sampled_users)}")

    if len(sampled_users) == 0:
        print(f"No sampled users for movie_id: {movie_id}")
        return []  # Return an empty list if no users were sampled

    movie_counts = ratings_df['movieId'].value_counts()
    top_movies = movie_counts.nlargest(n_movies).index
    candidate_mask = ratings_df['userId'].isin(sampled_users) & ratings_df['movieId'].isin(top_movies)
    movies_rated_by_users = ratings_df.loc[candidate_mask, 'movieId'].unique()
    print(f"Number of considered movies: {len(movies_rated_by_users)}")

    if len(movies_rated_by_users) == 0:
        print(f"No movies found rated by sampled users for movie_id: {movie_id}")
        return []  # Return an empty list if no movies were found

    try:
        # Encode ids as strings, matching MemoryEfficientSVDModel.predict.
        user_rows = np.asarray(svd_model.user_encoder.transform([str(user) for user in sampled_users]))
        movie_rows = np.asarray(svd_model.movie_encoder.transform(movies_rated_by_users))
        user_factors = svd_model.user_factors[user_rows]
        movie_factors = svd_model.movie_factors[movie_rows]

        predictions = np.dot(user_factors, movie_factors.T)
        avg_ratings = np.mean(predictions, axis=0)

        # Rank best-first, drop the query movie BEFORE truncating so it does
        # not use up one of the top_n slots.
        order = np.argsort(-avg_ratings, kind='stable')
        ranked_ids = [movies_rated_by_users[pos] for pos in order
                      if movies_rated_by_users[pos] != movie_id][:top_n]

        # Look titles up in ranking order; a plain isin() filter would return
        # them in df order and discard the ranking callers rely on.
        rank_of = {candidate: rank for rank, candidate in enumerate(ranked_ids)}
        hits = df.loc[df['movieId'].isin(ranked_ids), ['movieId', 'title']]
        hits = hits.assign(rank=hits['movieId'].map(rank_of)).sort_values('rank', kind='mergesort')
        recommendations = hits['title'].tolist()
        print(f"Number of recommendations generated: {len(recommendations)}")
        return recommendations
    except Exception as e:
        # Deliberately best-effort: any scoring failure (e.g. an id the
        # encoders cannot handle) is reported and yields no recommendations.
        print(f"Error in CF recommendations: {str(e)}")
        return []  # Return an empty list if any error occurs

def generate_ncf_recommendations(movie_id, df, ncf_model, movie_encoder, top_n=20):
    """Recommend titles whose NCF movie embedding is closest to *movie_id*'s.

    Args:
        movie_id: Id of the query movie as it appears in df['movieId'].
        df: DataFrame with 'movieId' and 'title' columns.
        ncf_model: Model exposing a `movie_embedding` nn.Embedding.
        movie_encoder: Fitted encoder whose `classes_` lists the movie ids in
            embedding-row order.
        top_n: Maximum number of movie ids to keep after ranking.

    Returns:
        Titles found in *df* for the most similar movies, ordered from most to
        least similar. [] when the query movie or every candidate is unknown
        to the encoder.
    """
    ncf_model.eval()

    # Match ids by their string form so ids stored as int in the encoder and
    # as str in df (or the other way round) still line up.
    row_of = {str(label): row for row, label in enumerate(movie_encoder.classes_)}
    query_row = row_of.get(str(movie_id))
    if query_row is None:
        return []

    candidate_ids = [other for other in df['movieId'].unique()
                     if other != movie_id and str(other) in row_of]
    if not candidate_ids:
        return []

    # Read the embedding table directly: the tensors then live on whatever
    # device the model is on, and one batched similarity call replaces a
    # Python loop with one encoder lookup per movie.
    weights = ncf_model.movie_embedding.weight.detach()
    candidate_rows = torch.tensor(
        [row_of[str(other)] for other in candidate_ids],
        dtype=torch.long,
        device=weights.device,
    )
    with torch.no_grad():
        similarity = torch.cosine_similarity(
            weights[query_row].unsqueeze(0),
            weights[candidate_rows],
            dim=1,
        )

    order = np.argsort(-similarity.cpu().numpy(), kind='stable')[:top_n]
    ranked_ids = [candidate_ids[pos] for pos in order]

    # Return titles in similarity order rather than in df order.
    rank_of = {candidate: rank for rank, candidate in enumerate(ranked_ids)}
    hits = df.loc[df['movieId'].isin(ranked_ids), ['movieId', 'title']]
    hits = hits.assign(rank=hits['movieId'].map(rank_of)).sort_values('rank', kind='mergesort')
    return hits['title'].tolist()

def generate_lgbm_recommendations(movie_id, df, lgbm_model, top_n=20):
    """Recommend titles whose engineered feature vector is closest to *movie_id*'s.

    Similarity is the cosine similarity over the numeric, categorical and
    'genre_*' columns of *df*.

    Args:
        movie_id: Id of the query movie as it appears in df['movieId'].
        df: DataFrame with 'movieId', 'title' and the engineered feature
            columns.
        lgbm_model: Unused; kept so the signature matches the other
            recommenders.
        top_n: Maximum number of distinct movies to return.

    Returns:
        Up to *top_n* titles ordered from most to least similar, one per
        distinct movie id. [] when *movie_id* is not in *df* or there is no
        other movie to compare against.
    """
    features = ['popularity', 'vote_average', 'vote_count', 'release_year', 'adult', 'status', 'original_language'] + \
               [col for col in df.columns if col.startswith('genre_')]

    is_query = (df['movieId'] == movie_id).to_numpy()
    others = ~is_query
    # Nothing to compare: unknown movie, or the frame holds only the query.
    if not is_query.any() or not others.any():
        return []

    feature_matrix = df[features].to_numpy(dtype=float)
    # The first matching row represents the query movie.
    query_vector = feature_matrix[is_query][:1]

    # One vectorized similarity call instead of one call per DataFrame row.
    scores = cosine_similarity(query_vector, feature_matrix[others]).ravel()
    order = np.argsort(-scores, kind='stable')

    candidate_ids = df.loc[others, 'movieId'].to_numpy()
    candidate_titles = df.loc[others, 'title'].to_numpy()

    # Walk the ranking best-first, keeping one title per movie id.
    recommendations = []
    seen_ids = set()
    for pos in order:
        candidate = candidate_ids[pos]
        if candidate in seen_ids:
            continue
        seen_ids.add(candidate)
        recommendations.append(candidate_titles[pos])
        if len(recommendations) >= top_n:
            break
    return recommendations

def hybrid_recommendations(movie_title, df, ratings_df, encodings, ncf_model, movie_encoder, lgbm_model, svd_model, top_n=10):
    """Blend the four recommenders into one ranked list of titles.

    Each recommender returns up to 2 * top_n titles. A title earns, per list,
    the list's weight scaled by how early it first appears in that list, plus
    small bonuses for genre overlap with the query movie (Jaccard similarity)
    and for sharing its director.

    Args:
        movie_title: Title of the query movie; matched case-insensitively
            after stripping surrounding whitespace.
        df: DataFrame with 'title', 'movieId', 'genres' and 'director'
            columns plus whatever the individual recommenders need.
        ratings_df: Ratings frame passed to the collaborative recommender.
        encodings: Text encodings passed to the content-based recommender.
        ncf_model: Model passed to the neural recommender.
        movie_encoder: Encoder passed to the neural recommender.
        lgbm_model: Model passed to the feature-similarity recommender.
        svd_model: Model passed to the collaborative recommender.
        top_n: Number of titles to return.

    Returns:
        Up to *top_n* titles, best first.

    Raises:
        ValueError: If *movie_title* is not present in *df*.
    """
    movie_title = movie_title.strip().lower()
    normalized_titles = df['title'].str.strip().str.lower()
    is_query = normalized_titles == movie_title

    if not is_query.any():
        raise ValueError(f"Movie '{movie_title}' not found in the dataset.")

    # Get id, genre and director info for the input movie
    input_movie = df[is_query].iloc[0]
    movie_id = input_movie['movieId']
    input_movie_genres = set(input_movie['genres'])
    input_movie_director = input_movie['director']

    content_recs = content_based_recommendations(movie_title, df, encodings, top_n=top_n*2)
    cf_recs = generate_cf_recommendations(movie_id, df, ratings_df, svd_model, top_n=top_n*2)
    ncf_recs = generate_ncf_recommendations(movie_id, df, ncf_model, movie_encoder, top_n=top_n*2)
    lgbm_recs = generate_lgbm_recommendations(movie_id, df, lgbm_model, top_n=top_n*2)

    print(f"Content-based recs: {len(content_recs)}")
    print(f"CF recs: {len(cf_recs)}")
    print(f"NCF recs: {len(ncf_recs)}")
    print(f"LightGBM recs: {len(lgbm_recs)}")

    content_weight = 0.3
    cf_weight = 0.2
    ncf_weight = 0.3
    lgbm_weight = 0.2
    genre_weight = 0.05
    director_weight = 0.05

    def rank_scores(recs, weight):
        """Map each title to weight * (1 - first_position / len(recs))."""
        scores = {}
        for position, title in enumerate(recs):
            if title not in scores:
                scores[title] = weight * (1 - position / len(recs))
        return scores

    # Built once, so scoring a candidate is a dict lookup per list instead of
    # repeated list.index() scans.
    rank_tables = [
        rank_scores(content_recs, content_weight),
        rank_scores(cf_recs, cf_weight),
        rank_scores(ncf_recs, ncf_weight),
        rank_scores(lgbm_recs, lgbm_weight),
    ]
    all_movies = set(content_recs + cf_recs + ncf_recs + lgbm_recs)

    # Genres and director of the first row carrying each candidate title.
    candidate_rows = df[df['title'].isin(all_movies)]
    metadata = {}
    for title, genres, director in zip(candidate_rows['title'], candidate_rows['genres'], candidate_rows['director']):
        if title not in metadata:
            metadata[title] = (set(genres), director)

    hybrid_scores = {}
    for movie in all_movies:
        # Skip non-string titles (e.g. missing values) and unknown titles.
        if not isinstance(movie, str) or movie not in metadata:
            continue
        # Compare normalized forms: the raw title keeps its original casing
        # while movie_title has been lower-cased.
        if movie.strip().lower() == movie_title:
            continue

        rank_score = sum(table.get(movie, 0) for table in rank_tables)
        movie_genres, movie_director = metadata[movie]

        # Calculate genre similarity; two movies without genres share nothing.
        all_genres = input_movie_genres | movie_genres
        genre_similarity = len(input_movie_genres & movie_genres) / len(all_genres) if all_genres else 0.0
        genre_score = genre_weight * genre_similarity

        # Calculate director similarity; an empty director never matches.
        same_director = bool(input_movie_director) and input_movie_director == movie_director
        director_score = director_weight if same_director else 0

        # Combine scores
        hybrid_scores[movie] = rank_score + genre_score + director_score

    # Best score first; ties are broken by title so the order is reproducible.
    sorted_recs = sorted(hybrid_scores.items(), key=lambda item: (-item[1], item[0]))
    hybrid_recs = [movie for movie, score in sorted_recs[:top_n]]

    return hybrid_recs

def evaluate_model_contributions(movie_title, df, ratings_df, encodings, ncf_model, movie_encoder, lgbm_model, svd_model):
    """Print each recommender's top 10 and its overlap with the hybrid list.

    The overlap of a recommender is the fraction of hybrid recommendations
    that also appear in that recommender's own list; it is reported as 0.00
    when the hybrid list is empty.

    Args:
        movie_title: Title of the query movie; matched case-insensitively
            after stripping surrounding whitespace.
        df: Movie DataFrame with 'title' and 'movieId' columns.
        ratings_df: Ratings frame passed to the collaborative recommender.
        encodings: Text encodings passed to the content-based recommender.
        ncf_model: Model passed to the neural recommender.
        movie_encoder: Encoder passed to the neural recommender.
        lgbm_model: Model passed to the feature-similarity recommender.
        svd_model: Model passed to the collaborative recommender.

    Raises:
        ValueError: If *movie_title* is not present in *df*.
    """
    # Normalize movie_title
    movie_title = movie_title.strip().lower()

    # Normalize titles for better matching
    is_query = df['title'].str.strip().str.lower() == movie_title

    # Check if the movie exists in the DataFrame
    if not is_query.any():
        raise ValueError(f"Movie '{movie_title}' not found in the dataset.")

    # Get the movie_id for the given movie_title
    movie_id = df.loc[is_query, 'movieId'].iloc[0]

    content_recs = content_based_recommendations(movie_title, df, encodings, top_n=10)
    cf_recs = generate_cf_recommendations(movie_id, df, ratings_df, svd_model, top_n=10)
    ncf_recs = generate_ncf_recommendations(movie_id, df, ncf_model, movie_encoder, top_n=10)
    lgbm_recs = generate_lgbm_recommendations(movie_id, df, lgbm_model, top_n=10)

    hybrid_recs = hybrid_recommendations(movie_title, df, ratings_df, encodings, ncf_model, movie_encoder, lgbm_model, svd_model)

    def listing(recs):
        """Join titles for display; str() guards against non-string titles."""
        return ', '.join(str(rec) for rec in recs)

    print(f"\nModel contributions for '{movie_title}':")
    print(f"Content-based recommendations: {listing(content_recs)}")
    print(f"Collaborative filtering (SVD) recommendations: {listing(cf_recs)}")
    print(f"Neural CF recommendations: {listing(ncf_recs)}")
    print(f"LightGBM recommendations: {listing(lgbm_recs)}")
    print(f"Hybrid recommendations: {listing(hybrid_recs)}")

    # Calculate overlap between hybrid and individual models
    hybrid_set = set(hybrid_recs)

    def overlap(recs):
        """Fraction of hybrid recommendations also present in *recs*."""
        if not hybrid_recs:
            return 0.0  # nothing to overlap with; avoids dividing by zero
        return len(hybrid_set & set(recs)) / len(hybrid_recs)

    content_overlap = overlap(content_recs)
    cf_overlap = overlap(cf_recs)
    ncf_overlap = overlap(ncf_recs)
    lgbm_overlap = overlap(lgbm_recs)

    print("\nModel contribution to hybrid recommendations:")
    print(f"Content-based: {content_overlap:.2f}")
    print(f"Collaborative filtering (SVD): {cf_overlap:.2f}")
    print(f"Neural CF: {ncf_overlap:.2f}")
    print(f"LightGBM: {lgbm_overlap:.2f}")

def main():
    """Build or load every model, print sample results, then serve queries.

    When all cached artifacts exist in the working directory they are loaded;
    otherwise the CSV files are processed, all models are trained and the
    artifacts are written. Afterwards the per-model contributions and hybrid
    recommendations are printed for a fixed list of movies, and titles are
    read from standard input until 'quit' is entered.

    Any exception is reported on stdout instead of being propagated.
    """
    file_paths = {
        'keywords': 'path\\keywords.csv',
        'credits': 'path\\credits.csv',
        'ratings': 'path\\ratings.csv',
        'movies': 'path\\movies_metadata.csv'
    }

    # Every file the "save" branch below writes. MemoryEfficientSVDModel.save
    # writes four files prefixed with 'svd_model'; a bare 'svd_model' path is
    # never created, so checking for it would always force retraining.
    cached_files = [
        'preprocessed_df.pkl',
        'ratings_df.pkl',
        'encodings.npy',
        'ncf_model.pth',
        'user_encoder.joblib',
        'movie_encoder.joblib',
        'lgbm_model.joblib',
        'svd_model_user_factors.npy',
        'svd_model_movie_factors.npy',
        'svd_model_user_encoder.joblib',
        'svd_model_movie_encoder.joblib',
    ]

    try:
        # Check if saved models and data exist
        if all(os.path.exists(cached) for cached in cached_files):
            print("Loading saved models and data...")
            # NOTE: pickle/joblib files can execute code when loaded; only
            # load artifacts this script wrote itself.
            df = pd.read_pickle('preprocessed_df.pkl')
            ratings_df = pd.read_pickle('ratings_df.pkl')
            encodings = np.load('encodings.npy')

            user_encoder = joblib.load('user_encoder.joblib')
            movie_encoder = joblib.load('movie_encoder.joblib')

            # Size the network from the encoders it was trained with, and map
            # the weights to the current device so a checkpoint saved on GPU
            # still loads on a CPU-only machine.
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            ncf_model = NCF(len(user_encoder.classes_), len(movie_encoder.classes_))
            ncf_model.load_state_dict(torch.load('ncf_model.pth', map_location=device))
            ncf_model.to(device)

            lgbm_model = joblib.load('lgbm_model.joblib')

            svd_model = MemoryEfficientSVDModel.load('svd_model')
        else:
            print("Processing data and training models...")
            dfs = load_data(file_paths)
            gc.collect()
            merged_df = preprocess_data(dfs)
            gc.collect()
            df = engineer_features(merged_df)
            gc.collect()

            print("Encoding movie features with DistilBERT...")
            distilbert_encoder = DistilBERTEncoder()
            gc.collect()
            encodings = distilbert_encoder.encode(df['combined_features'].tolist())

            ratings_df = dfs['ratings']
            gc.collect()

            ncf_model, user_encoder, movie_encoder = train_ncf_model(ratings_df)
            gc.collect()
            # Only the movie frame is needed to train the LightGBM model.
            lgbm_model = train_lightgbm_model(df)
            gc.collect()
            svd_model = train_svd_model(ratings_df)
            gc.collect()

            # Save all models and data
            print("Saving models and data...")
            df.to_pickle('preprocessed_df.pkl')
            ratings_df.to_pickle('ratings_df.pkl')
            np.save('encodings.npy', encodings)

            torch.save(ncf_model.state_dict(), 'ncf_model.pth')
            joblib.dump(user_encoder, 'user_encoder.joblib')
            joblib.dump(movie_encoder, 'movie_encoder.joblib')

            joblib.dump(lgbm_model, 'lgbm_model.joblib')

            svd_model.save('svd_model')

        print("Models and data ready. Starting recommendation system...")

        test_movies = [
            "The Dark Knight",
            "Inception",
            "Pulp Fiction",
            "The Shawshank Redemption",
            "Forrest Gump"
        ]

        print("\nEvaluating model contributions:")
        for movie in test_movies:
            try:
                evaluate_model_contributions(movie, df, ratings_df, encodings, ncf_model, movie_encoder, lgbm_model, svd_model)
            except Exception as e:
                print(f"Error evaluating '{movie}': {str(e)}")

        print("\nGenerating recommendations:")
        for movie in tqdm(test_movies, desc="Evaluating"):
            try:
                recs = hybrid_recommendations(movie, df, ratings_df, encodings, ncf_model, movie_encoder, lgbm_model, svd_model)
                print(f"\nRecommendations for '{movie}':")
                for i, rec in enumerate(recs, 1):
                    print(f"{i}. {rec}")
            except Exception as e:
                print(f"Error generating recommendations for '{movie}': {str(e)}")

        # Interactive loop: one title per prompt until the user types 'quit'.
        while True:
            user_input = input("\nEnter a movie title (or 'quit' to exit): ").strip()
            if user_input.lower() == 'quit':
                break

            try:
                recs = hybrid_recommendations(user_input, df, ratings_df, encodings, ncf_model, movie_encoder, lgbm_model, svd_model)
                print(f"\nRecommendations for '{user_input}':")
                for i, rec in enumerate(recs, 1):
                    print(f"{i}. {rec}")
            except ValueError as e:
                # Raised by hybrid_recommendations for an unknown title.
                print(f"Error: {str(e)}")
            except Exception as e:
                print(f"An unexpected error occurred: {str(e)}")

    except Exception as e:
        # Top-level boundary: report the failure instead of a traceback.
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()


[nltk_data] Error loading punkt: <urlopen error [WinError 10054] An
[nltk_data]     existing connection was forcibly closed by the remote
[nltk_data]     host>
[nltk_data] Error loading stopwords: <urlopen error [WinError 10054]
[nltk_data]     An existing connection was forcibly closed by the
[nltk_data]     remote host>
[nltk_data] Error loading wordnet: <urlopen error [WinError 10054] An
[nltk_data]     existing connection was forcibly closed by the remote
[nltk_data]     host>


Processing data and training models...
Loading datasets...
Preprocessing data...


Cleaning overview: 100%|██████████| 46632/46632 [00:15<00:00, 3082.20it/s]
Cleaning keywords: 100%|██████████| 46632/46632 [00:05<00:00, 8785.20it/s] 


Engineering features...
Encoding movie features with DistilBERT...


Encoding texts: 100%|██████████| 1458/1458 [07:17<00:00,  3.33it/s]


Training NCF model...
Epoch 1/25, Train Loss: 0.9687, Val Loss: 0.7480
Epoch 2/25, Train Loss: 0.7381, Val Loss: 0.7171
Epoch 3/25, Train Loss: 0.7136, Val Loss: 0.7009
Epoch 4/25, Train Loss: 0.7004, Val Loss: 0.6928
Epoch 5/25, Train Loss: 0.6941, Val Loss: 0.6879
Epoch 6/25, Train Loss: 0.6904, Val Loss: 0.6869
Epoch 7/25, Train Loss: 0.6881, Val Loss: 0.6839
Epoch 8/25, Train Loss: 0.6872, Val Loss: 0.6843
Epoch 9/25, Train Loss: 0.6862, Val Loss: 0.6823
Epoch 10/25, Train Loss: 0.6857, Val Loss: 0.6842
Epoch 11/25, Train Loss: 0.6850, Val Loss: 0.6812
Epoch 12/25, Train Loss: 0.6843, Val Loss: 0.6807
Epoch 13/25, Train Loss: 0.6838, Val Loss: 0.6800
Epoch 14/25, Train Loss: 0.6834, Val Loss: 0.6804
Epoch 15/25, Train Loss: 0.6829, Val Loss: 0.6792
Epoch 16/25, Train Loss: 0.6827, Val Loss: 0.6791
Epoch 17/25, Train Loss: 0.6824, Val Loss: 0.6795
Epoch 18/25, Train Loss: 0.6820, Val Loss: 0.6793
Epoch 19/25, Train Loss: 0.6817, Val Loss: 0.6788
Epoch 20/25, Train Loss: 0.6812, Val 

SVD Training: 100%|██████████| 6/6 [00:00<00:00, 518.77it/s]


Saving models and data...
Models and data ready. Starting recommendation system...

Evaluating model contributions:
Generating CF recommendations for movie_id: 155
Number of sampled users: 100
Number of considered movies: 4401
Number of recommendations generated: 7
Generating CF recommendations for movie_id: 155
Number of sampled users: 100
Number of considered movies: 3958
Number of recommendations generated: 15
Content-based recs: 20
CF recs: 15
NCF recs: 20
LightGBM recs: 20

Model contributions for 'the dark knight':
Content-based recommendations: Batman: Under the Red Hood, Batman Begins, The Dark Knight Rises, Batman: The Killing Joke, Batman Forever, Teenage Mutant Ninja Turtles, Batman, Batman Returns, Watchmen, Superman IV: The Quest for Peace
Collaborative filtering (SVD) recommendations: Once Were Warriors, Three Colors: Red, Solaris, The Million Dollar Hotel, Men in Black II, Terminator 3: Rise of the Machines, Sissi
Neural CF recommendations: Taxi Driver, Family Plot, Cool

Evaluating:   0%|          | 0/5 [00:00<?, ?it/s]

Generating CF recommendations for movie_id: 155
Number of sampled users: 100
Number of considered movies: 4910
Number of recommendations generated: 12
Content-based recs: 20
CF recs: 12
NCF recs: 20
LightGBM recs: 20


Evaluating:  20%|██        | 1/5 [01:24<05:38, 84.70s/it]


Recommendations for 'The Dark Knight':
1. The Dark Knight Rises
2. Batman Begins
3. Taxi Driver
4. Batman: Under the Red Hood
5. Family Plot
6. A View to a Kill
7. Batman: The Killing Joke
8. American Beauty
9. Batman Forever
10. Cool as Ice
Generating CF recommendations for movie_id: 27205
Number of sampled users: 5
Number of considered movies: 4368
Number of recommendations generated: 9
Content-based recs: 20
CF recs: 9
NCF recs: 20
LightGBM recs: 20


Evaluating:  40%|████      | 2/5 [02:26<03:34, 71.35s/it]


Recommendations for 'Inception':
1. Minority Report
2. Last Embrace
3. Noises Off...
4. Mutant Aliens
5. The Anomaly
6. Starcrash
7. I Drink Your Blood
8. Harmful Insect
9. Cyborg 3: The Recycler
10. Salton Sea
Generating CF recommendations for movie_id: 680
Number of sampled users: 100
Number of considered movies: 4928
Number of recommendations generated: 11
Content-based recs: 20
CF recs: 11
NCF recs: 21
LightGBM recs: 20


Evaluating:  60%|██████    | 3/5 [03:52<02:36, 78.11s/it]


Recommendations for 'Pulp Fiction':
1. Jackie Brown
2. Something Wild
3. The Ruling Class
4. Blonde Crazy
5. Aliens
6. Reservoir Dogs
7. Manhattan
8. American Buffalo
9. Nice Dreams
10. Bad Santa 2
Generating CF recommendations for movie_id: 278
Number of sampled users: 100
Number of considered movies: 4590
Number of recommendations generated: 15
Content-based recs: 20
CF recs: 15
NCF recs: 20
LightGBM recs: 20


Evaluating:  80%|████████  | 4/5 [05:19<01:21, 81.31s/it]


Recommendations for 'The Shawshank Redemption':
1. Cool Hand Luke
2. Felon
3. Leaving Las Vegas
4. 20,000 Years in Sing Sing
5. Tie Me Up! Tie Me Down!
6. Fury
7. Milk Money
8. Escape from Alcatraz
9. Snake Eyes
10. Another 48 Hrs.
Generating CF recommendations for movie_id: 13
Number of sampled users: 100
Number of considered movies: 4890
Number of recommendations generated: 12
Content-based recs: 20
CF recs: 12
NCF recs: 20
LightGBM recs: 20


Evaluating: 100%|██████████| 5/5 [06:45<00:00, 81.01s/it]



Recommendations for 'Forrest Gump':
1. Me Before You
2. Nick of Time
3. The Adjustment Bureau
4. Cop Land
5. Rocky II
6. He Was a Quiet Man
7. The Hours
8. Licence to Kill
9. One Hour Photo
10. Annie Hall
Generating CF recommendations for movie_id: 375315
No users found who rated movie_id: 375315
Content-based recs: 20
CF recs: 0
NCF recs: 0
LightGBM recs: 20

Recommendations for 'the salesman':
1. Life+1 Day
2. The Father
3. Through the Olive Trees
4. I Am Taraneh, I Am Fifteen Years Old
5. Circumstance
6. The Beat That My Heart Skipped
7. Under the Skin of the City
8. Black Ice
9. About Elly
10. The Apple
Generating CF recommendations for movie_id: 155
Number of sampled users: 100
Number of considered movies: 4657
Number of recommendations generated: 13
Content-based recs: 20
CF recs: 13
NCF recs: 20
LightGBM recs: 20

Recommendations for 'the dark knight':
1. The Dark Knight Rises
2. Batman Begins
3. Batman Returns
4. Taxi Driver
5. Batman: Under the Red Hood
6. Family Plot
7. A Vi