In [1]:
import os
import re

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler


In [22]:
class DataPreprocessor:
    """Loads the raw movie CSV described by a configuration object.

    Attributes:
        config: Object that normally exposes a ``RAW_DATA_PATH`` attribute.
    """

    # Fallback CSV location, used only when ``config`` has no RAW_DATA_PATH.
    RAW_DATA_PATH = '/content/bollywood_data_set.csv'

    def __init__(self, config):
        """Keep a reference to the configuration object."""
        self.config = config

    def load_data(self):
        """Read the raw CSV into a DataFrame.

        The path is taken from ``config.RAW_DATA_PATH``; the class-level
        ``RAW_DATA_PATH`` is the fallback when the config lacks that attribute.

        Returns:
            The DataFrame produced by ``pd.read_csv``.
        """
        csv_path = getattr(self.config, 'RAW_DATA_PATH', self.RAW_DATA_PATH)
        return pd.read_csv(csv_path)

In [3]:
    def _extract_year(self, text):

        text = str(text)
        # Try extracting year from parentheses
        parentheses_match = re.search(r'\((\d{4})\)', text)
        if parentheses_match:
            return int(parentheses_match.group(1))

        # Try extracting year with hyphen
        hyphen_match = re.search(r'-(\d{4})', text)
        if hyphen_match:
            return int(hyphen_match.group(1))

        # Try direct year match
        direct_match = re.search(r'\b(\d{4})\b', text)
        if direct_match:
            return int(direct_match.group(1))

        # If no year found, return NaN
        return np.nan


In [4]:
    def _encode_categorial(self, series, delimiter=None, top_n=50):
        if delimiter:
            series= series.str.split(delimiter)

        all_categories = series.explode().value_counts()

        top_categories = all_categories.head(top_n).index.tolist()

        category_mapping = {cat: idx for idx, cat in enumerate(top_categories)}

        def encode_entry(entry):
            if isinstance(entry, list):

                return [category_mapping.get(cat, -1) for cat in entry if cat in category_mapping]
            else:

                return category_mapping.get(entry, -1)


        encoded = series.apply(encode_entry)

        return encoded


In [26]:
    def preprocess_data(self, df):
        """Clean the raw movie table and derive model-ready columns.

        Steps: normalise column names, derive ``year_of_release`` from
        ``movie_name``, coerce runtime/votes/rating to numbers, median-impute
        numeric gaps, fill missing text with ``'Unknown'``, encode directors and
        actors, and standardise rating and votes.

        Args:
            df: Raw DataFrame. After column-name normalisation it must contain
                ``movie_name``, ``runtime``, ``no_of_votes``, ``imdb_rating``,
                ``plot_description``, ``director`` and ``actors``.

        Returns:
            A new DataFrame with the cleaned and derived columns; the frame
            passed in is left untouched.
        """
        # Work on a copy so re-running on the same raw frame gives the same result.
        df = df.copy()

        # Clean column names
        df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]

        # Extract year from movie name
        df['year_of_release'] = df['movie_name'].apply(self._extract_year)

        # Clean and convert runtime: drop the literal ' min' suffix, then parse;
        # unparseable entries become NaN.
        df['runtime'] = df['runtime'].astype(str).str.replace(' min', '', regex=False)
        df['runtime'] = pd.to_numeric(df['runtime'], errors='coerce')

        # Clean votes: strip thousands separators, parse, default missing to 0.
        df['no_of_votes'] = (
            df['no_of_votes'].astype(str).str.replace(',', '', regex=False).str.strip()
        )
        df['no_of_votes'] = pd.to_numeric(df['no_of_votes'], errors='coerce')
        df['no_of_votes'] = df['no_of_votes'].fillna(0).astype(int)

        # Discard years outside 1900..current year, then impute with the median.
        current_year = pd.Timestamp.now().year
        df['year_of_release'] = df['year_of_release'].apply(
            lambda x: x if pd.notnull(x) and 1900 <= x <= current_year else np.nan
        )
        median_year = df['year_of_release'].median()
        df['year_of_release'] = df['year_of_release'].fillna(median_year)

        numeric_columns = ['year_of_release', 'runtime', 'imdb_rating', 'no_of_votes']
        text_columns = ['movie_name', 'plot_description', 'director', 'actors']

        # Force numeric dtype (bad values -> NaN) before median imputation.
        for col in numeric_columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        numeric_imputer = SimpleImputer(strategy='median')
        df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])

        # Text column cleaning: replace NaN with 'Unknown' and trim whitespace.
        for col in text_columns:
            df[col] = df[col].fillna('Unknown')
            df[col] = df[col].str.strip()

        # Feature engineering
        df['runtime_minutes'] = df['runtime'].astype(float)

        # Encode categorical features. list(...) stores one encoding per row
        # whether the encoder returns a Series or a 2-D array (a 2-D array
        # cannot be assigned to a single column directly).
        df['directors_encoded'] = list(self._encode_categorical(df['director']))
        df['actors_encoded'] = list(self._encode_categorical(df['actors'], delimiter='|'))

        # Normalize numeric features
        scaler = StandardScaler()
        df[['normalized_rating', 'normalized_votes']] = scaler.fit_transform(
            df[['imdb_rating', 'no_of_votes']]
        )

        return df

In [6]:
    def _encode_categorical(self, series, delimiter=None):
        """Encode categorical variables"""
        if delimiter:
            series = series.str.split(delimiter)

        mlb = MultiLabelBinarizer()
        encoded = mlb.fit_transform(series)
        return encoded


In [7]:
    def balance_dataset(self, df):
        """Oversample the data with SMOTE on the two normalised features.

        ``normalized_rating`` and ``normalized_votes`` are the features and
        ``imdb_rating`` is the target handed to SMOTE.

        Returns:
            DataFrame holding the resampled feature columns plus ``imdb_rating``.
        """
        # NOTE(review): SMOTE is a classification oversampler; if imdb_rating
        # holds fractional values it may be rejected as a continuous target — confirm.
        feature_cols = ['normalized_rating', 'normalized_votes']
        features, target = df[feature_cols], df['imdb_rating']

        resampled_X, resampled_y = SMOTE(random_state=42).fit_resample(features, target)

        balanced_df = pd.DataFrame(resampled_X, columns=features.columns)
        balanced_df['imdb_rating'] = resampled_y
        return balanced_df

In [8]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

In [9]:
class TextFeatureExtractor:
    """Turns text into fixed-size embeddings with a pretrained transformer."""

    def __init__(self, model_name='ai4bharat/indic-bert'):
        """Load the tokenizer and encoder identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def extract_text_features(self, texts):
        """Embed ``texts`` by mean-pooling the encoder's last hidden state.

        Args:
            texts: Input accepted by the tokenizer; callers in this notebook
                pass a list of strings.

        Returns:
            Tensor with one pooled vector per input text.
        """
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden = outputs.last_hidden_state
        # padding=True pads shorter texts in a batch; weight by the attention
        # mask so pad positions do not dilute the mean.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        return (hidden * mask).sum(dim=1) / token_counts

In [10]:
class CNNFeatureExtractor(nn.Module):
    """Single Conv1d -> LeakyReLU -> MaxPool1d feature block."""

    def __init__(self, input_dim, embedding_dim):
        """Build the layers.

        Args:
            input_dim: Number of input channels of the convolution.
            embedding_dim: Number of output channels of the convolution.
        """
        super().__init__()
        # kernel_size=3 with padding=1 keeps the sequence length unchanged.
        conv_settings = dict(
            in_channels=input_dim,
            out_channels=embedding_dim,
            kernel_size=3,
            padding=1,
        )
        self.conv1d = nn.Conv1d(**conv_settings)
        self.relu = nn.LeakyReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)

    def forward(self, x):
        """Apply convolution, activation and pooling in that order."""
        return self.pool(self.relu(self.conv1d(x)))

In [12]:
pip install neo4j

Collecting neo4j
  Downloading neo4j-5.27.0-py3-none-any.whl.metadata (5.9 kB)
Downloading neo4j-5.27.0-py3-none-any.whl (301 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/301.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/301.7 kB[0m [31m9.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.7/301.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neo4j
Successfully installed neo4j-5.27.0


In [13]:
from neo4j import GraphDatabase
import networkx as nx
import numpy as np

In [15]:
class GraphConstructor:
    """Builds a movie-similarity graph and persists it to Neo4j."""

    def __init__(self, config):
        """Open a Neo4j driver from ``config`` and read the similarity cut-off.

        Args:
            config: Object exposing ``NEO4J_URI``, ``NEO4J_USERNAME`` and
                ``NEO4J_PASSWORD``; ``SIMILARITY_THRESHOLD`` is optional.
        """
        self.config = config
        # Fall back to the former hard-coded cut-off when config has none.
        self.similarity_threshold = getattr(config, 'SIMILARITY_THRESHOLD', 0.7)
        self.driver = GraphDatabase.driver(
            config.NEO4J_URI,
            auth=(config.NEO4J_USERNAME, config.NEO4J_PASSWORD)
        )

    def create_movie_graph(self, movies_df):
        """Construct graph representation of movies.

        Every row becomes a node keyed by ``movie_name``; two movies are linked
        when their cosine similarity exceeds the configured threshold.

        Args:
            movies_df: DataFrame with ``movie_name``, ``imdb_rating``,
                ``year_of_release``, ``normalized_rating`` and
                ``normalized_votes`` columns.

        Returns:
            An undirected ``networkx.Graph`` whose edges carry a ``weight``.
        """
        G = nx.Graph()

        # Add movie nodes
        for _, movie in movies_df.iterrows():
            G.add_node(
                movie['movie_name'],
                type='movie',
                rating=movie['imdb_rating'],
                year=movie['year_of_release']
            )

        similarity_matrix = self._compute_movie_similarity(movies_df)

        # Read the names once instead of calling .iloc inside the pair loop.
        names = movies_df['movie_name'].tolist()
        # Upper triangle (k=1) visits each unordered pair once, skipping i == j.
        above = np.triu(similarity_matrix > self.similarity_threshold, k=1)
        for i, j in zip(*np.nonzero(above)):
            G.add_edge(names[i], names[j], weight=similarity_matrix[i, j])

        return G

    def _compute_movie_similarity(self, movies_df):
        """Compute cosine similarity between movies.

        Uses the ``normalized_rating`` and ``normalized_votes`` columns. Pairs
        involving an all-zero feature row get similarity 0 instead of NaN.

        Returns:
            Square ndarray with one row and column per movie.
        """
        features = movies_df[['normalized_rating', 'normalized_votes']].to_numpy(dtype=float)

        norms = np.linalg.norm(features, axis=1)
        scale = np.outer(norms, norms)
        dots = features @ features.T

        similarity = np.zeros(dots.shape, dtype=float)
        # Divide only where the denominator is non-zero; other cells stay 0.
        np.divide(dots, scale, out=similarity, where=scale > 0)
        return similarity

    def save_to_neo4j(self, graph):
        """Save graph to Neo4j database.

        Deletes every existing node and relationship first, then creates one
        ``Movie`` node per graph node and one ``SIMILAR`` relationship per edge.
        """
        with self.driver.session() as session:
            # Clear existing data
            session.run("MATCH (n) DETACH DELETE n")

            # Create movie nodes
            for node, data in graph.nodes(data=True):
                session.run(
                    "CREATE (m:Movie {name: $name, rating: $rating, year: $year})",
                    name=node,
                    rating=data.get('rating', 0),
                    year=data.get('year', 0)
                )

            # Create movie edges
            for u, v, data in graph.edges(data=True):
                session.run(
                    "MATCH (a:Movie {name: $name1}), (b:Movie {name: $name2}) "
                    "CREATE (a)-[:SIMILAR {weight: $weight}]->(b)",
                    name1=u, name2=v, weight=data.get('weight', 0)
                )

In [16]:
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
class BollywoodRecommender:
    """Hybrid recommender mixing text similarity with graph proximity."""

    def __init__(self, graph, feature_extractor):
        """Store the movie graph and the text feature extractor.

        Args:
            graph: Graph whose nodes are movie names and whose edges carry a
                ``weight`` attribute.
            feature_extractor: Object with ``extract_text_features(list)``.
        """
        self.graph = graph
        self.feature_extractor = feature_extractor

    def get_recommendations(self, base_movie, top_k=3):
        """Generate movie recommendations.

        Args:
            base_movie: Node name to recommend for.
            top_k: Maximum number of names to return.

        Returns:
            List of recommended movie names, best first.

        Raises:
            ValueError: If ``base_movie`` is not a node of the graph.
        """
        if base_movie not in self.graph.nodes:
            raise ValueError(f"Movie {base_movie} not found in graph")

        # Content-based similarity
        content_candidates = self._content_based_recommendation(base_movie)

        # Graph-based recommendation
        graph_candidates = self._graph_based_recommendation(base_movie)

        # Hybrid recommendation
        return self._merge_recommendations(
            content_candidates,
            graph_candidates,
            top_k
        )

    def _content_based_recommendation(self, base_movie):
        """Score every other node by content similarity, best first."""
        candidates = {}
        for movie in self.graph.nodes:
            if movie != base_movie:
                candidates[movie] = self._compute_content_similarity(base_movie, movie)

        return sorted(candidates.items(), key=lambda x: x[1], reverse=True)

    def _graph_based_recommendation(self, base_movie):
        """Score movies reachable within two hops of ``base_movie``.

        A direct neighbour scores its edge weight. A movie reachable only
        through an intermediate has no direct edge to read a weight from, so it
        scores the best product of the two edge weights along such a path.

        Returns:
            List of ``(movie, score)`` pairs, best first.
        """
        direct = {
            movie: attrs['weight']
            for movie, attrs in self.graph[base_movie].items()
            if movie != base_movie
        }

        candidates = dict(direct)
        for via, first_weight in direct.items():
            for movie, attrs in self.graph[via].items():
                if movie == base_movie or movie in direct:
                    continue
                score = first_weight * attrs['weight']
                if score > candidates.get(movie, float('-inf')):
                    candidates[movie] = score

        return sorted(candidates.items(), key=lambda x: x[1], reverse=True)

    def _merge_recommendations(self, content_candidates, graph_candidates, top_k):
        """Merge and re-rank recommendations.

        Content scores are weighted 0.6 and graph scores 0.4; a movie present
        in only one list keeps just that weighted part.

        Returns:
            The ``top_k`` movie names by combined score.
        """
        merged_candidates = {}

        for movie, score in content_candidates:
            merged_candidates[movie] = 0.6 * score

        for movie, score in graph_candidates:
            merged_candidates[movie] = merged_candidates.get(movie, 0) + 0.4 * score

        top_recommendations = sorted(
            merged_candidates.items(),
            key=lambda x: x[1],
            reverse=True
        )[:top_k]

        return [movie for movie, _ in top_recommendations]

    def _compute_content_similarity(self, movie1, movie2):
        """Compute content similarity between two movies.

        Embeds the two movie names with the feature extractor and returns
        their cosine similarity.
        """
        # This is a placeholder - replace with actual feature comparison
        return cosine_similarity(
            self.feature_extractor.extract_text_features([movie1]),
            self.feature_extractor.extract_text_features([movie2])
        )[0][0]

In [28]:

# Configuration settings for the recommendation engine
class Config:
    """Central settings for the recommendation pipeline."""

    # Data paths
    RAW_DATA_PATH = '/content/bollywood_data_set.csv'
    # Kept distinct from RAW_DATA_PATH so writing processed output cannot
    # overwrite the raw input file.
    PROCESSED_DATA_PATH = '/content/processed_movies.csv'

    # Neo4j Database Configuration. Each value can be overridden through an
    # environment variable of the same name; the literals are fallback defaults.
    NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
    NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME', 'neo4j')
    NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', '12345678')

    # Model Hyperparameters
    EMBEDDING_DIM = 128
    CNN_FILTER_SIZE = 3
    GNN_LAYERS = 2

    # Recommendation Parameters
    TOP_K_RECOMMENDATIONS = 3
    SIMILARITY_THRESHOLD = 0.7



In [37]:
def main():
    """Run the pipeline end to end and print recommendations for one movie.

    Loads and preprocesses the data, builds the similarity graph, stores it in
    Neo4j and prints the recommendations for a fixed example title.
    """
    # Initialize configuration
    config = Config()

    # Data Preprocessing
    preprocessor = DataPreprocessor(config)
    raw_data = preprocessor.load_data()
    processed_data = preprocessor.preprocess_data(raw_data)

    # Feature Extraction
    feature_extractor = TextFeatureExtractor()

    # Graph Construction
    # The graph is built from the processed frame: create_movie_graph reads
    # movie_name and year_of_release, and the frame returned by balance_dataset
    # carries only normalized_rating, normalized_votes and imdb_rating.
    graph_constructor = GraphConstructor(config)
    movie_graph = graph_constructor.create_movie_graph(processed_data)
    graph_constructor.save_to_neo4j(movie_graph)

    # Recommendation Engine
    recommender = BollywoodRecommender(movie_graph, feature_extractor)

    # Example Recommendation
    base_movie = "3 Idiots"
    recommendations = recommender.get_recommendations(
        base_movie, top_k=config.TOP_K_RECOMMENDATIONS
    )

    print(f"Recommendations for {base_movie}:")
    for movie in recommendations:
        print(movie)

# Run the pipeline only when this code is executed as the main module.
if __name__ == "__main__":
    main()

AttributeError: 'DataPreprocessor' object has no attribute 'preprocess_data'

In [40]:
# Install required packages
!pip install torch torchvision torchaudio
!pip install dgl dgllife
!pip install transformers
!pip install neo4j
!pip install networkx
!pip install scikit-learn
!pip install pandas numpy
!pip install imblearn






Collecting dgllife
  Downloading dgllife-0.3.2-py3-none-any.whl.metadata (667 bytes)
Downloading dgllife-0.3.2-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.1/226.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dgllife
Successfully installed dgllife-0.3.2
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


UsageError: Line magic function `%%writefile` not found.


In [45]:
class Config:
    """Central settings for the recommendation pipeline."""

    # Data paths
    RAW_DATA_PATH = '/content/bollywood_recommender/data/bollywood_data_set.csv'
    PROCESSED_DATA_PATH = '/content/bollywood_recommender/data/processed_movies.csv'

    # Neo4j Database Configuration. Each value can be overridden through an
    # environment variable of the same name; the literals are fallback defaults.
    NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
    NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME', 'neo4j')
    NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', '12345678')

    # Model Hyperparameters
    EMBEDDING_DIM = 128
    CNN_FILTER_SIZE = 3
    GNN_LAYERS = 2

    # Recommendation Parameters
    TOP_K_RECOMMENDATIONS = 3
    SIMILARITY_THRESHOLD = 0.7


In [46]:


# Data Preprocessing

import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

class DataPreprocessor:
    """Loads, cleans and encodes the raw movie table.

    Attributes:
        config: Object exposing ``RAW_DATA_PATH``.
    """

    def __init__(self, config):
        """Keep a reference to the configuration object."""
        self.config = config

    def load_data(self):
        """Read the CSV at ``config.RAW_DATA_PATH`` into a DataFrame."""
        return pd.read_csv(self.config.RAW_DATA_PATH)

    def _extract_year(self, text):
        """Pull a four-digit year out of ``text``.

        Patterns are tried in priority order and the first hit wins:
        ``(YYYY)``, then ``-YYYY``, then a standalone ``YYYY`` token.

        Returns:
            The year as ``int``, or ``np.nan`` when no pattern matches.
        """
        text = str(text)
        year_patterns = (
            r'\((\d{4})\)',     # year wrapped in parentheses
            r'-(\d{4})(?!\d)',  # year after a hyphen, not the prefix of a longer number
            r'\b(\d{4})\b',     # bare four-digit token
        )
        for pattern in year_patterns:
            found = re.search(pattern, text)
            if found:
                return int(found.group(1))

        return np.nan

    def _encode_categorical(self, series, delimiter=None, top_n=50):
        """Map the ``top_n`` most frequent categories to integer codes.

        Args:
            series: pandas Series of category values.
            delimiter: When truthy, each value is split on it into a list of
                categories before counting and encoding.
            top_n: Number of most frequent categories that receive a code.

        Returns:
            A Series with one encoded entry per row. List entries become lists
            of codes with unranked categories dropped; any other entry becomes
            its code, or ``-1`` when it is not among the top categories.
        """
        if delimiter:
            series = series.str.split(delimiter)

        # Codes follow frequency rank: the most common category gets 0.
        all_categories = series.explode().value_counts()
        top_categories = all_categories.head(top_n).index.tolist()
        category_mapping = {cat: idx for idx, cat in enumerate(top_categories)}

        def encode_entry(entry):
            if isinstance(entry, list):
                return [category_mapping[cat] for cat in entry if cat in category_mapping]
            return category_mapping.get(entry, -1)

        return series.apply(encode_entry)

    def preprocess_data(self, df):
        """Clean the raw movie table and derive model-ready columns.

        Args:
            df: Raw DataFrame. After column-name normalisation it must contain
                ``movie_name``, ``runtime``, ``no_of_votes``, ``imdb_rating``,
                ``plot_description``, ``director`` and ``actors``.

        Returns:
            A new DataFrame with the cleaned and derived columns; the frame
            passed in is left untouched.
        """
        # Work on a copy so re-running on the same raw frame gives the same result.
        df = df.copy()

        # Clean column names
        df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]

        # Extract year from movie name
        df['year_of_release'] = df['movie_name'].apply(self._extract_year)

        # Clean and convert runtime: drop the literal ' min' suffix, then parse;
        # unparseable entries become NaN.
        df['runtime'] = df['runtime'].astype(str).str.replace(' min', '', regex=False)
        df['runtime'] = pd.to_numeric(df['runtime'], errors='coerce')

        # Clean votes: strip thousands separators, parse, default missing to 0.
        df['no_of_votes'] = (
            df['no_of_votes'].astype(str).str.replace(',', '', regex=False).str.strip()
        )
        df['no_of_votes'] = pd.to_numeric(df['no_of_votes'], errors='coerce')
        df['no_of_votes'] = df['no_of_votes'].fillna(0).astype(int)

        # Discard years outside 1900..current year, then impute with the median.
        current_year = pd.Timestamp.now().year
        df['year_of_release'] = df['year_of_release'].apply(
            lambda x: x if pd.notnull(x) and 1900 <= x <= current_year else np.nan
        )
        median_year = df['year_of_release'].median()
        df['year_of_release'] = df['year_of_release'].fillna(median_year)

        # Define columns for processing
        numeric_columns = ['year_of_release', 'runtime', 'imdb_rating', 'no_of_votes']
        text_columns = ['movie_name', 'plot_description', 'director', 'actors']

        # Force numeric dtype (bad values -> NaN) before median imputation.
        for col in numeric_columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        numeric_imputer = SimpleImputer(strategy='median')
        df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])

        # Process text columns: missing -> 'Unknown', then trim whitespace.
        for col in text_columns:
            df[col] = df[col].fillna('Unknown')
            df[col] = df[col].str.strip()

        # Feature engineering
        df['runtime_minutes'] = df['runtime'].astype(float)
        df['directors_encoded'] = self._encode_categorical(df['director'])
        df['actors_encoded'] = self._encode_categorical(df['actors'], delimiter='|')

        # Normalize features
        scaler = StandardScaler()
        df[['normalized_rating', 'normalized_votes']] = scaler.fit_transform(
            df[['imdb_rating', 'no_of_votes']]
        )

        return df

    def balance_dataset(self, df):
        """Oversample the data with SMOTE on the two normalised features.

        Returns:
            DataFrame holding the resampled ``normalized_rating`` and
            ``normalized_votes`` columns plus ``imdb_rating``.
        """
        # NOTE(review): SMOTE is a classification oversampler; if imdb_rating
        # holds fractional values it may be rejected as a continuous target — confirm.
        X = df[['normalized_rating', 'normalized_votes']]
        y = df['imdb_rating']

        smote = SMOTE(random_state=42)
        X_resampled, y_resampled = smote.fit_resample(X, y)

        balanced_df = pd.DataFrame(X_resampled, columns=X.columns)
        balanced_df['imdb_rating'] = y_resampled

        return balanced_df



In [47]:
# Feature Extraction

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

class TextFeatureExtractor:
    """Turns text into fixed-size embeddings with a pretrained transformer."""

    def __init__(self, model_name='ai4bharat/indic-bert'):
        """Load the tokenizer and encoder identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def extract_text_features(self, texts):
        """Embed ``texts`` by mean-pooling the encoder's last hidden state.

        Args:
            texts: Input accepted by the tokenizer; callers in this notebook
                pass a list of strings.

        Returns:
            Tensor with one pooled vector per input text.
        """
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden = outputs.last_hidden_state
        # padding=True pads shorter texts in a batch; weight by the attention
        # mask so pad positions do not dilute the mean.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        return (hidden * mask).sum(dim=1) / token_counts

class CNNFeatureExtractor(nn.Module):
    """Single Conv1d -> LeakyReLU -> MaxPool1d feature block."""

    def __init__(self, input_dim, embedding_dim):
        """Build the layers.

        Args:
            input_dim: Number of input channels of the convolution.
            embedding_dim: Number of output channels of the convolution.
        """
        super().__init__()
        # kernel_size=3 with padding=1 keeps the sequence length unchanged.
        conv_settings = dict(
            in_channels=input_dim,
            out_channels=embedding_dim,
            kernel_size=3,
            padding=1,
        )
        self.conv1d = nn.Conv1d(**conv_settings)
        self.relu = nn.LeakyReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)

    def forward(self, x):
        """Apply convolution, activation and pooling in that order."""
        return self.pool(self.relu(self.conv1d(x)))



In [49]:
pip install dgl



In [52]:


from neo4j import GraphDatabase
import networkx as nx
import numpy as np

class GraphConstructor:
    """Builds an in-memory movie-similarity graph."""

    def __init__(self, config):
        """Keep the configuration; ``SIMILARITY_THRESHOLD`` is read from it."""
        self.config = config

    def create_movie_graph(self, movies_df):
        """Construct the graph representation of the movies.

        Every row becomes a node keyed by ``movie_name``; two movies are linked
        when their cosine similarity exceeds ``config.SIMILARITY_THRESHOLD``.

        Args:
            movies_df: DataFrame with ``movie_name``, ``imdb_rating``,
                ``year_of_release``, ``normalized_rating`` and
                ``normalized_votes`` columns.

        Returns:
            An undirected ``networkx.Graph`` whose edges carry a ``weight``.
        """
        G = nx.Graph()

        for _, movie in movies_df.iterrows():
            G.add_node(
                movie['movie_name'],
                type='movie',
                rating=movie['imdb_rating'],
                year=movie['year_of_release']
            )

        similarity_matrix = self._compute_movie_similarity(movies_df)

        # Read the names once instead of calling .iloc inside the pair loop.
        names = movies_df['movie_name'].tolist()
        # Upper triangle (k=1) visits each unordered pair once, skipping i == j.
        above = np.triu(similarity_matrix > self.config.SIMILARITY_THRESHOLD, k=1)
        for i, j in zip(*np.nonzero(above)):
            G.add_edge(names[i], names[j], weight=similarity_matrix[i, j])

        return G

    def _compute_movie_similarity(self, movies_df):
        """Compute pairwise cosine similarity between movies.

        Uses the ``normalized_rating`` and ``normalized_votes`` columns. Pairs
        involving an all-zero feature row get similarity 0 instead of NaN.

        Returns:
            Square ndarray with one row and column per movie.
        """
        features = movies_df[['normalized_rating', 'normalized_votes']].to_numpy(dtype=float)

        norms = np.linalg.norm(features, axis=1)
        scale = np.outer(norms, norms)
        dots = features @ features.T

        similarity = np.zeros(dots.shape, dtype=float)
        # Divide only where the denominator is non-zero; other cells stay 0.
        np.divide(dots, scale, out=similarity, where=scale > 0)
        return similarity



In [53]:
# Recommendation Engine

import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

class BollywoodRecommender:
    """Hybrid recommender mixing text similarity with graph proximity."""

    def __init__(self, graph, feature_extractor):
        """Store the movie graph and the text feature extractor.

        Args:
            graph: Graph whose nodes are movie names and whose edges carry a
                ``weight`` attribute.
            feature_extractor: Object with ``extract_text_features(list)``.
        """
        self.graph = graph
        self.feature_extractor = feature_extractor

    def get_recommendations(self, base_movie, top_k=3):
        """Return up to ``top_k`` recommended movie names for ``base_movie``.

        Raises:
            ValueError: If ``base_movie`` is not a node of the graph.
        """
        if base_movie not in self.graph.nodes:
            raise ValueError(f"Movie {base_movie} not found in graph")

        content_candidates = self._content_based_recommendation(base_movie)
        graph_candidates = self._graph_based_recommendation(base_movie)
        return self._merge_recommendations(
            content_candidates,
            graph_candidates,
            top_k
        )

    def _content_based_recommendation(self, base_movie):
        """Score every other node by content similarity, best first."""
        candidates = {}
        for movie in self.graph.nodes:
            if movie != base_movie:
                candidates[movie] = self._compute_content_similarity(base_movie, movie)

        return sorted(candidates.items(), key=lambda x: x[1], reverse=True)

    def _graph_based_recommendation(self, base_movie):
        """Score movies reachable within two hops of ``base_movie``.

        A direct neighbour scores its edge weight. A movie reachable only
        through an intermediate has no direct edge to read a weight from, so it
        scores the best product of the two edge weights along such a path.

        Returns:
            List of ``(movie, score)`` pairs, best first.
        """
        direct = {
            movie: attrs['weight']
            for movie, attrs in self.graph[base_movie].items()
            if movie != base_movie
        }

        candidates = dict(direct)
        for via, first_weight in direct.items():
            for movie, attrs in self.graph[via].items():
                if movie == base_movie or movie in direct:
                    continue
                score = first_weight * attrs['weight']
                if score > candidates.get(movie, float('-inf')):
                    candidates[movie] = score

        return sorted(candidates.items(), key=lambda x: x[1], reverse=True)

    def _merge_recommendations(self, content_candidates, graph_candidates, top_k):
        """Combine both candidate lists and return the ``top_k`` names.

        Content scores are weighted 0.6 and graph scores 0.4; a movie present
        in only one list keeps just that weighted part.
        """
        merged_candidates = {}

        for movie, score in content_candidates:
            merged_candidates[movie] = 0.6 * score

        for movie, score in graph_candidates:
            merged_candidates[movie] = merged_candidates.get(movie, 0) + 0.4 * score

        top_recommendations = sorted(
            merged_candidates.items(),
            key=lambda x: x[1],
            reverse=True
        )[:top_k]

        return [movie for movie, _ in top_recommendations]

    def _compute_content_similarity(self, movie1, movie2):
        """Cosine similarity between the embeddings of the two movie names."""
        return cosine_similarity(
            self.feature_extractor.extract_text_features([movie1]),
            self.feature_extractor.extract_text_features([movie2])
        )[0][0]



In [63]:
import pandas as pd
import torch
import logging

def setup_logging():
    """Configure logging at INFO level and return this module's logger.

    Returns:
        The ``logging.Logger`` named after the current module.
    """
    log_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO)
    module_logger = logging.getLogger(__name__)
    return module_logger

def main():
    """Build the recommender and persist the processed data.

    Returns:
        Tuple ``(recommender, processed_data)``.

    Raises:
        Exception: Any failure is logged and then re-raised unchanged.
    """
    # Initialize configuration and logging
    config = Config()
    logger = setup_logging()

    try:
        logger.info("Starting Bollywood Movie Recommender System")

        # Initialize data preprocessor and load data
        logger.info("Loading and preprocessing data...")
        preprocessor = DataPreprocessor(config)
        raw_data = preprocessor.load_data()
        processed_data = preprocessor.preprocess_data(raw_data)

        # Initialize feature extractor
        logger.info("Initializing feature extraction...")
        feature_extractor = TextFeatureExtractor()

        # Construct movie similarity graph from the processed frame:
        # create_movie_graph reads movie_name and year_of_release, and the
        # frame returned by balance_dataset carries only normalized_rating,
        # normalized_votes and imdb_rating.
        logger.info("Constructing movie similarity graph...")
        graph_constructor = GraphConstructor(config)
        movie_graph = graph_constructor.create_movie_graph(processed_data)

        # Initialize recommendation engine
        logger.info("Initializing recommendation engine...")
        recommender = BollywoodRecommender(movie_graph, feature_extractor)

        # Save processed data
        logger.info("Saving processed data...")
        processed_data.to_csv(config.PROCESSED_DATA_PATH, index=False)

        return recommender, processed_data

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        raise

def get_recommendations(recommender, movie_name, top_k=None):
    """
    Look up recommendations for one movie, never raising to the caller.

    Args:
        recommender: Initialized BollywoodRecommender instance
        movie_name (str): Name of the movie to get recommendations for
        top_k (int, optional): Number of recommendations to return; when
            omitted, ``Config.TOP_K_RECOMMENDATIONS`` is used

    Returns:
        list: Recommended movie names, or an empty list when the lookup
        fails (the failure is logged).
    """
    try:
        limit = Config.TOP_K_RECOMMENDATIONS if top_k is None else top_k
        return recommender.get_recommendations(movie_name, top_k=limit)
    except ValueError as missing:
        logging.error(f"Movie not found: {str(missing)}")
    except Exception as failure:
        logging.error(f"Error getting recommendations: {str(failure)}")

    # Reached only after one of the handlers above has logged the problem.
    return []

# Demo driver: builds the recommender, lists a few titles and prints the
# recommendations for the first one.
if __name__ == "__main__":
    # Initialize the system; main() returns the recommender and the cleaned table
    recommender, processed_data = main()

    # Example usage
    print("\nBollywood Movie Recommender System")
    print("----------------------------------")

    # Get some example movies from the dataset (head() of the movie_name column)
    example_movies = processed_data['movie_name'].head().tolist()

    print("\nAvailable movies (sample):")
    for idx, movie in enumerate(example_movies, 1):
        print(f"{idx}. {movie}")

    # Get recommendations for the first movie; skipped when the table is empty
    if example_movies:
        print(f"\nGetting recommendations for: {example_movies[0]}")
        # The module-level helper returns [] on failure instead of raising
        recommendations = get_recommendations(recommender, example_movies[0])

        print("\nRecommended movies:")
        for idx, movie in enumerate(recommendations, 1):
            print(f"{idx}. {movie}")

ERROR:__main__:An error occurred: [Errno 2] No such file or directory: 'data/bollywood_data_set.csv'


FileNotFoundError: [Errno 2] No such file or directory: 'data/bollywood_data_set.csv'