In [1]:
# Install the third-party packages used by the modules written further down
# (torch, DGL, transformers, the Neo4j driver, networkx, scikit-learn,
# pandas/numpy and imbalanced-learn via the `imblearn` alias).
# NOTE(review): versions are unpinned, so a fresh runtime may resolve different
# releases than the ones shown in the captured output — consider pinning.
!pip install torch torchvision torchaudio
!pip install dgl dgllife
!pip install transformers
!pip install neo4j
!pip install networkx
!pip install scikit-learn
!pip install pandas numpy
!pip install imblearn


Collecting dgl
  Downloading dgl-2.1.0-cp310-cp310-manylinux1_x86_64.whl.metadata (553 bytes)
Collecting dgllife
  Downloading dgllife-0.3.2-py3-none-any.whl.metadata (667 bytes)
Collecting torchdata>=0.5.0 (from dgl)
  Downloading torchdata-0.9.0-cp310-cp310-manylinux1_x86_64.whl.metadata (5.5 kB)
Downloading dgl-2.1.0-cp310-cp310-manylinux1_x86_64.whl (8.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dgllife-0.3.2-py3-none-any.whl (226 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.1/226.1 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading torchdata-0.9.0-cp310-cp310-manylinux1_x86_64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m84.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchdata, dgllife, dgl
Successfully installed dgl-2.1.0 dgllife-0.3.2 torchdata-0.9.0
Collecting neo4j
  Do

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): nothing in the visible cells reads from /content/drive; the
# dataset path in config.py points at /content — confirm the mount is needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Create the project layout that the later %%writefile cells write into
# (`src/` receives the Python modules). `-p` makes the cell safe to re-run.
!mkdir -p /content/bollywood_recommender/src
!mkdir -p /content/bollywood_recommender/data


In [6]:
%%writefile /content/bollywood_recommender/config.py
import os


class Config:
    """Central settings for the Bollywood recommender.

    All values are class attributes, so both ``Config.X`` and ``Config().X``
    work. The Neo4j connection settings can be overridden through the
    ``NEO4J_URI``, ``NEO4J_USERNAME`` and ``NEO4J_PASSWORD`` environment
    variables; when a variable is unset the previous hard-coded value is used,
    so existing setups keep working.
    """

    # Data paths (raw and processed currently point at the same CSV file).
    RAW_DATA_PATH = '/content/bollywood_data_set.csv'
    PROCESSED_DATA_PATH = '/content/bollywood_data_set.csv'

    # Neo4j Database Configuration.
    # SECURITY: the fallback password is a development default only; set
    # NEO4J_PASSWORD in the environment instead of committing real credentials.
    NEO4J_URI = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
    NEO4J_USERNAME = os.environ.get('NEO4J_USERNAME', 'neo4j')
    NEO4J_PASSWORD = os.environ.get('NEO4J_PASSWORD', '12345678')

    # Model Hyperparameters
    EMBEDDING_DIM = 128
    CNN_FILTER_SIZE = 3
    GNN_LAYERS = 2

    # Recommendation Parameters
    TOP_K_RECOMMENDATIONS = 3
    SIMILARITY_THRESHOLD = 0.7

Overwriting /content/bollywood_recommender/config.py


In [7]:
%%writefile /content/bollywood_recommender/src/data_preprocessing.py
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

class DataPreprocessor:
    """Loads the raw movie CSV and turns it into a cleaned feature frame.

    The ``config`` object only needs a ``RAW_DATA_PATH`` attribute.
    """

    # Tried in order; the first pattern that matches supplies the year.
    _YEAR_PATTERNS = (
        r'\((\d{4})\)',      # "(2009)"
        r'-(\d{4})(?!\d)',   # "-2009", but not the prefix of a longer number
        r'\b(\d{4})\b',      # a free-standing four-digit number
    )

    def __init__(self, config):
        self.config = config

    def load_data(self):
        """Read the CSV at ``config.RAW_DATA_PATH`` into a DataFrame."""
        return pd.read_csv(self.config.RAW_DATA_PATH)

    def _extract_year(self, text):
        """Return the first four-digit year found in ``text``, else ``np.nan``.

        ``text`` is coerced with ``str()`` so missing values are handled. No
        plausibility check is applied here; ``preprocess_data`` filters the
        result to the 1900..current-year range.
        """
        text = str(text)
        for pattern in self._YEAR_PATTERNS:
            match = re.search(pattern, text)
            if match:
                return int(match.group(1))
        return np.nan

    def _encode_categorical(self, series, delimiter=None, top_n=50):
        """Map category labels to integer ids by descending frequency.

        Args:
            series: string Series; with ``delimiter`` each entry is a
                delimited list of labels.
            delimiter: separator for multi-valued entries, or ``None`` for
                single-valued entries.
            top_n: only the ``top_n`` most frequent labels receive an id.

        Returns:
            A Series aligned with ``series``. Multi-valued entries become a
            list of ids (labels outside the top ``top_n`` are dropped);
            single-valued entries become an id, or ``-1`` when the label is
            outside the top ``top_n`` or missing.
        """
        if delimiter:
            series = series.str.split(delimiter)
            # Strip whitespace around each token so "A | B" and "A|B" produce
            # the same labels, and drop empty tokens left by stray delimiters.
            series = series.apply(
                lambda tokens: [tok.strip() for tok in tokens if tok.strip()]
                if isinstance(tokens, list) else tokens
            )

        all_categories = series.explode().value_counts()
        top_categories = all_categories.head(top_n).index.tolist()
        category_mapping = {cat: idx for idx, cat in enumerate(top_categories)}

        def encode_entry(entry):
            if isinstance(entry, list):
                return [category_mapping[cat] for cat in entry if cat in category_mapping]
            return category_mapping.get(entry, -1)

        return series.apply(encode_entry)

    def preprocess_data(self, df):
        """Clean the raw frame and add engineered feature columns.

        Expects (after name normalisation) the columns ``movie_name``,
        ``runtime``, ``no_of_votes``, ``imdb_rating``, ``plot_description``,
        ``director`` and ``actors``. The input frame is left untouched; a
        processed copy is returned.
        """
        # Work on a copy so the caller's raw frame stays reusable.
        df = df.copy()

        # Clean column names
        df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]

        # Extract year from movie name
        df['year_of_release'] = df['movie_name'].apply(self._extract_year)

        # Clean and convert runtime ("120 min" -> 120.0; unparsable -> NaN)
        df['runtime'] = df['runtime'].astype(str).str.replace(' min', '', regex=False)
        df['runtime'] = pd.to_numeric(df['runtime'], errors='coerce')

        # Clean votes ("1,234" -> 1234; unparsable -> 0)
        df['no_of_votes'] = (
            df['no_of_votes'].astype(str).str.replace(',', '', regex=False).str.strip()
        )
        df['no_of_votes'] = pd.to_numeric(df['no_of_votes'], errors='coerce')
        df['no_of_votes'] = df['no_of_votes'].fillna(0).astype(int)

        # Discard implausible years, then fill the gaps with the median year.
        current_year = pd.Timestamp.now().year
        df['year_of_release'] = df['year_of_release'].apply(
            lambda x: x if pd.notnull(x) and 1900 <= x <= current_year else np.nan
        )
        median_year = df['year_of_release'].median()
        df['year_of_release'] = df['year_of_release'].fillna(median_year)

        # Define columns for processing
        numeric_columns = ['year_of_release', 'runtime', 'imdb_rating', 'no_of_votes']
        text_columns = ['movie_name', 'plot_description', 'director', 'actors']

        # Process numeric columns: coerce, then median-impute what is missing.
        for col in numeric_columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        numeric_imputer = SimpleImputer(strategy='median')
        df[numeric_columns] = numeric_imputer.fit_transform(df[numeric_columns])

        # Process text columns
        for col in text_columns:
            df[col] = df[col].fillna('Unknown').str.strip()

        # Feature engineering
        df['runtime_minutes'] = df['runtime'].astype(float)
        df['directors_encoded'] = self._encode_categorical(df['director'])
        df['actors_encoded'] = self._encode_categorical(df['actors'], delimiter='|')

        # Normalize features
        scaler = StandardScaler()
        df[['normalized_rating', 'normalized_votes']] = scaler.fit_transform(
            df[['imdb_rating', 'no_of_votes']]
        )

        return df

    def balance_dataset(self, df):
        """Oversample under-represented rating classes with SMOTE.

        SMOTE is a classification sampler and rejects continuous targets, so
        fractional ratings such as 7.4 cannot be used as labels directly. The
        rating is therefore rounded to the nearest integer and that value is
        used as the class label. Integer-valued ratings are unaffected.

        Returns:
            A frame with ``normalized_rating``, ``normalized_votes`` and
            ``imdb_rating``, where ``imdb_rating`` holds the (rounded) class
            label of each original or synthetic row.

        Raises:
            ValueError: if ``df`` is empty or some rating class has a single
                row (SMOTE needs at least one same-class neighbour).
        """
        X = df[['normalized_rating', 'normalized_votes']]
        y = df['imdb_rating'].round()

        class_sizes = y.value_counts()
        if class_sizes.empty:
            raise ValueError("Cannot balance an empty dataset")

        # SMOTE's default is 5 neighbours; shrink it when the rarest class is
        # too small to supply that many.
        k_neighbors = min(5, int(class_sizes.min()) - 1)
        if k_neighbors < 1:
            raise ValueError(
                "Every rating class needs at least two rows to be oversampled"
            )

        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_resampled, y_resampled = smote.fit_resample(X, y)

        balanced_df = pd.DataFrame(X_resampled, columns=X.columns)
        balanced_df['imdb_rating'] = y_resampled

        return balanced_df


Writing /content/bollywood_recommender/src/data_preprocessing.py


In [8]:
%%writefile /content/bollywood_recommender/src/feature_extraction.py
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

class TextFeatureExtractor:
    """Sentence embeddings from a pretrained Hugging Face encoder."""

    def __init__(self, model_name='ai4bharat/indic-bert'):
        """Load the tokenizer and encoder weights for ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        # Inference only: make sure dropout layers are disabled.
        self.model.eval()

    def extract_text_features(self, texts):
        """Embed ``texts`` as the mean of their token representations.

        Args:
            texts: a string or list of strings accepted by the tokenizer.

        Returns:
            Tensor of shape ``(batch, hidden_size)``.

        Padding positions are excluded from the mean via the attention mask,
        so a text's embedding does not depend on what else is in the batch.
        For an unpadded input this equals the plain mean over tokens.
        """
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden = outputs.last_hidden_state
        # (batch, seq, 1) weights: 1 for real tokens, 0 for padding.
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard against 0/0
        return (hidden * mask).sum(dim=1) / token_counts

class CNNFeatureExtractor(nn.Module):
    """1-D convolution -> LeakyReLU -> max-pool feature block.

    Args:
        input_dim: number of input channels.
        embedding_dim: number of output channels.
        kernel_size: convolution width (default 3, the previous fixed value).
            Padding is ``kernel_size // 2``, which keeps the sequence length
            unchanged before pooling for odd kernel sizes.
    """

    def __init__(self, input_dim, embedding_dim, kernel_size=3):
        super().__init__()
        self.conv1d = nn.Conv1d(
            in_channels=input_dim,
            out_channels=embedding_dim,
            kernel_size=kernel_size,
            padding=kernel_size // 2
        )
        # Attribute name kept for compatibility; the activation is LeakyReLU.
        self.relu = nn.LeakyReLU()
        self.pool = nn.MaxPool1d(kernel_size=2)

    def forward(self, x):
        """Apply the block to ``x`` of shape ``(batch, input_dim, length)``.

        Returns a tensor of shape ``(batch, embedding_dim, length // 2)`` for
        odd kernel sizes.
        """
        x = self.conv1d(x)
        x = self.relu(x)
        x = self.pool(x)
        return x


Writing /content/bollywood_recommender/src/feature_extraction.py


In [9]:
%%writefile /content/bollywood_recommender/src/graph_nn.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import dgl
import dgl.function as fn

class GraphAttentionLayer(nn.Module):
    """Single-head graph attention layer on a DGL graph.

    Args:
        in_features: size of each input node feature vector.
        out_features: size of each output node feature vector.
        dropout: dropout probability applied to the attention coefficients
            (training mode only).
        alpha: negative slope of the LeakyReLU applied to attention logits.
    """

    def __init__(self, in_features, out_features, dropout=0.6, alpha=0.2):
        super(GraphAttentionLayer, self).__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.dropout = dropout
        self.alpha = alpha

        # Linear projection applied to every node.
        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)

        # Attention vector scoring a concatenated (source, destination) pair.
        self.a = nn.Parameter(torch.zeros(size=(2*out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leaky_relu = nn.LeakyReLU(self.alpha)

    def forward(self, graph, features):
        """Return attention-weighted sums of projected neighbour features.

        Args:
            graph: DGL graph whose node count matches ``features``.
            features: tensor of shape ``(num_nodes, in_features)``.

        Returns:
            Tensor of shape ``(num_nodes, out_features)``. A node with no
            incoming edge receives no messages; add self-loops to the graph if
            such nodes should keep their own features.
        """
        # Imported here so the module-level import block stays untouched.
        from dgl.nn.functional import edge_softmax

        # local_scope keeps the temporary 'h'/'a' fields off the caller's graph.
        with graph.local_scope():
            graph.ndata['h'] = torch.matmul(features, self.W)
            graph.apply_edges(self.edge_attention)

            # Normalise logits over the incoming edges of each destination
            # node. A plain softmax over dim=1 of an (E, 1) tensor would return
            # 1.0 for every edge and disable attention entirely.
            attention = edge_softmax(graph, graph.edata['a'])
            # Honour train/eval mode; F.dropout defaults to training=True.
            attention = F.dropout(attention, self.dropout, training=self.training)
            graph.edata['a'] = attention

            graph.update_all(fn.u_mul_e('h', 'a', 'm'), fn.sum('m', 'h'))
            return graph.ndata['h']

    def edge_attention(self, edges):
        """Compute an unnormalised attention logit of shape ``(E, 1)`` per edge."""
        concat_features = torch.cat([edges.src['h'], edges.dst['h']], dim=1)
        edge_attention = self.leaky_relu(torch.matmul(concat_features, self.a))
        return {'a': edge_attention}

class MovieGraphNeuralNetwork(nn.Module):
    """Stack of graph attention layers followed by a linear output head.

    Args:
        input_dim: size of the input node features.
        hidden_dim: width of every attention layer.
        output_dim: size of the per-node output vector.
        num_layers: number of attention layers; at least one is always built.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=2):
        super().__init__()

        # The first layer consumes the raw features, the rest hidden features.
        layer_input_dims = [input_dim] + [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            [GraphAttentionLayer(in_dim, hidden_dim) for in_dim in layer_input_dims]
        )

        self.output_layer = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.6)
        self.leaky_relu = nn.LeakyReLU(0.2)

    def forward(self, graph, features):
        """Run ``features`` through every attention layer, then the output head."""
        hidden = features

        for attention_layer in self.layers:
            # attention -> LeakyReLU -> dropout, as one composed step
            hidden = self.dropout(self.leaky_relu(attention_layer(graph, hidden)))

        return self.output_layer(hidden)


Writing /content/bollywood_recommender/src/graph_nn.py


In [10]:
%%writefile /content/bollywood_recommender/src/graph_construction.py
from neo4j import GraphDatabase
import networkx as nx
import numpy as np

class GraphConstructor:
    """Builds a movie-similarity graph from the preprocessed frame.

    ``config`` must expose ``SIMILARITY_THRESHOLD``.
    """

    def __init__(self, config):
        self.config = config

    def create_movie_graph(self, movies_df):
        """Return an undirected graph with one node per movie name.

        Nodes carry ``type``, ``rating`` and ``year`` attributes. An edge with
        a ``weight`` attribute joins two movies whose cosine similarity is
        strictly above ``config.SIMILARITY_THRESHOLD``. Rows that share a
        ``movie_name`` collapse into a single node, because the name is the
        node key.
        """
        G = nx.Graph()

        # Pull the columns out once instead of building a Series per row.
        names = movies_df['movie_name'].tolist()
        ratings = movies_df['imdb_rating'].tolist()
        years = movies_df['year_of_release'].tolist()

        for name, rating, year in zip(names, ratings, years):
            G.add_node(name, type='movie', rating=rating, year=year)

        similarity_matrix = self._compute_movie_similarity(movies_df)

        # Select the qualifying (i < j) pairs in one vectorised pass; the
        # row-major order matches the order of the equivalent nested loops.
        above_threshold = np.triu(
            similarity_matrix > self.config.SIMILARITY_THRESHOLD, k=1
        )
        for i, j in zip(*np.nonzero(above_threshold)):
            G.add_edge(names[i], names[j], weight=float(similarity_matrix[i, j]))

        return G

    def _compute_movie_similarity(self, movies_df):
        """Pairwise cosine similarity of (normalized_rating, normalized_votes).

        Returns an ``(n, n)`` array. A row whose feature vector is all zeros
        has similarity 0 with every row (itself included) rather than NaN.
        """
        features = movies_df[['normalized_rating', 'normalized_votes']].to_numpy(dtype=float)
        norm = np.linalg.norm(features, axis=1)
        # Avoid 0/0: a zero vector has a zero dot product with everything, so
        # dividing by 1 instead of 0 gives a similarity of 0.
        safe_norm = np.where(norm > 0, norm, 1.0)
        similarity = np.dot(features, features.T) / (safe_norm[:, None] * safe_norm[None, :])
        return similarity

Writing /content/bollywood_recommender/src/graph_construction.py


In [11]:
%%writefile /content/bollywood_recommender/src/recommendation_engine.py
import numpy as np
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity

class BollywoodRecommender:
    """Hybrid recommender combining text similarity with graph proximity.

    Args:
        graph: movie graph keyed by movie name; edges carry a ``weight``.
        feature_extractor: object with ``extract_text_features(list_of_str)``
            returning a 2-D array-like of embeddings.
    """

    def __init__(self, graph, feature_extractor):
        self.graph = graph
        self.feature_extractor = feature_extractor

    def get_recommendations(self, base_movie, top_k=3):
        """Return up to ``top_k`` movie names recommended for ``base_movie``.

        Raises:
            ValueError: if ``base_movie`` is not a node of the graph.
        """
        if base_movie not in self.graph.nodes:
            raise ValueError(f"Movie {base_movie} not found in graph")

        content_candidates = self._content_based_recommendation(base_movie)
        graph_candidates = self._graph_based_recommendation(base_movie)
        return self._merge_recommendations(
            content_candidates,
            graph_candidates,
            top_k
        )

    def _content_based_recommendation(self, base_movie):
        """Rank every other node by text similarity to ``base_movie``.

        The text that is embedded is the node key itself (the movie name).
        Returns ``(movie, similarity)`` pairs, best first.
        """
        # Embed the base movie once instead of once per candidate.
        base_features = self.feature_extractor.extract_text_features([base_movie])

        candidates = {}
        for movie in self.graph.nodes:
            if movie == base_movie:
                continue
            movie_features = self.feature_extractor.extract_text_features([movie])
            candidates[movie] = cosine_similarity(base_features, movie_features)[0][0]

        return sorted(candidates.items(), key=lambda x: x[1], reverse=True)

    def _graph_based_recommendation(self, base_movie):
        """Rank movies within two hops of ``base_movie`` by edge weight.

        Direct neighbours score their edge weight. A movie reachable only via
        an intermediate neighbour has no direct edge to look up, so it scores
        the best product of the two edge weights along any such path. An edge
        without a ``weight`` attribute counts as 1.0.

        Returns ``(movie, score)`` pairs, best first.
        """
        neighbours = self.graph[base_movie]

        candidates = {
            movie: attrs.get('weight', 1.0)
            for movie, attrs in neighbours.items()
            if movie != base_movie
        }

        for via, via_attrs in neighbours.items():
            if via == base_movie:
                continue
            first_hop = via_attrs.get('weight', 1.0)
            for movie, attrs in self.graph[via].items():
                if movie == base_movie or movie in neighbours:
                    continue  # the base itself, or already scored directly
                path_score = first_hop * attrs.get('weight', 1.0)
                if movie not in candidates or path_score > candidates[movie]:
                    candidates[movie] = path_score

        return sorted(candidates.items(), key=lambda x: x[1], reverse=True)

    def _merge_recommendations(self, content_candidates, graph_candidates, top_k):
        """Blend both rankings (0.6 content + 0.4 graph) and keep the top ``top_k`` names."""
        merged_candidates = {}

        for movie, score in content_candidates:
            merged_candidates[movie] = 0.6 * score

        for movie, score in graph_candidates:
            merged_candidates[movie] = merged_candidates.get(movie, 0) + 0.4 * score

        top_recommendations = sorted(
            merged_candidates.items(),
            key=lambda x: x[1],
            reverse=True
        )[:top_k]

        return [movie for movie, _ in top_recommendations]

    def _compute_content_similarity(self, movie1, movie2):
        """Cosine similarity between the text embeddings of two movie names."""
        return cosine_similarity(
            self.feature_extractor.extract_text_features([movie1]),
            self.feature_extractor.extract_text_features([movie2])
        )[0][0]

Writing /content/bollywood_recommender/src/recommendation_engine.py


In [12]:
%%writefile /content/bollywood_recommender/src/rag_components.py
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from neo4j import GraphDatabase

class RAGRetriever:
    """Stores movie embeddings in Neo4j and retrieves similar movies.

    ``config`` must expose ``NEO4J_URI``, ``NEO4J_USERNAME`` and
    ``NEO4J_PASSWORD``; ``SIMILARITY_THRESHOLD`` is optional (default 0.7).
    """

    def __init__(self, config):
        self.model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
        self.tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
        self.neo4j_driver = GraphDatabase.driver(
            config.NEO4J_URI,
            auth=(config.NEO4J_USERNAME, config.NEO4J_PASSWORD)
        )
        # Minimum cosine similarity for a SIMILAR relationship.
        self.similarity_threshold = getattr(config, 'SIMILARITY_THRESHOLD', 0.7)

    def close(self):
        """Release the Neo4j driver's connections."""
        self.neo4j_driver.close()

    def _get_embeddings(self, text):
        """Embed ``text`` (a string or list of strings) by masked mean pooling.

        Returns a tensor of shape ``(batch, hidden_size)``. Padding positions
        are excluded from the mean; for a single unpadded text this equals the
        plain mean over tokens.
        """
        inputs = self.tokenizer(
            text,
            padding=True,
            truncation=True,
            return_tensors='pt',
            max_length=512
        )

        with torch.no_grad():
            outputs = self.model(**inputs)

        hidden = outputs.last_hidden_state
        mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1.0)  # guard against 0/0
        return (hidden * mask).sum(dim=1) / token_counts

    def _create_graph_index(self, tx, movie_data):
        """Transaction function: upsert movie nodes and SIMILAR relationships.

        ``MERGE`` is used instead of ``CREATE`` so re-running the build updates
        existing nodes and relationships rather than duplicating them; movies
        are keyed by ``title``.

        NOTE(review): ``gds.similarity.cosine`` is expected to come from the
        Graph Data Science plugin — confirm it is installed on the server.
        The pairwise MATCH compares every pair of movies. Relationships from an
        earlier build that no longer pass the threshold are not removed.
        """
        threshold = getattr(self, 'similarity_threshold', 0.7)

        # Upsert movie nodes with embeddings
        tx.run("""
            UNWIND $movies AS movie
            MERGE (m:Movie {title: movie.title})
            SET m.plot = movie.plot,
                m.embedding = movie.embedding,
                m.year = movie.year,
                m.rating = movie.rating
        """, movies=movie_data)

        # Upsert similarity relationships
        tx.run("""
            MATCH (m1:Movie), (m2:Movie)
            WHERE id(m1) < id(m2)
            WITH m1, m2, gds.similarity.cosine(m1.embedding, m2.embedding) AS similarity
            WHERE similarity > $threshold
            MERGE (m1)-[r:SIMILAR]->(m2)
            SET r.score = similarity
        """, threshold=threshold)

    def build_knowledge_base(self, movies_df):
        """Embed every row of ``movies_df`` and write the graph to Neo4j.

        Reads the ``movie_name``, ``plot_description``, ``year_of_release``
        and ``imdb_rating`` columns.
        """
        # Prepare movie data with embeddings
        movie_data = []
        for _, row in movies_df.iterrows():
            text = f"{row['movie_name']} {row['plot_description']}"
            embedding = self._get_embeddings(text).numpy().tolist()[0]

            movie_data.append({
                'title': row['movie_name'],
                'plot': row['plot_description'],
                'embedding': embedding,
                'year': int(row['year_of_release']),
                'rating': float(row['imdb_rating'])
            })

        # Create graph database structure
        with self.neo4j_driver.session() as session:
            # `write_transaction` is the deprecated name of `execute_write`;
            # prefer the new name and fall back for older drivers.
            run_write = getattr(session, 'execute_write', None) or session.write_transaction
            run_write(self._create_graph_index, movie_data)

    def retrieve(self, movie_name, k=3):
        """Return the titles of up to ``k`` movies most similar to ``movie_name``.

        Returns an empty list when the title is unknown or has no SIMILAR
        relationship.
        """
        with self.neo4j_driver.session() as session:
            result = session.run("""
                MATCH (m:Movie {title: $title})-[r:SIMILAR]-(similar:Movie)
                RETURN similar.title, similar.rating, r.score
                ORDER BY r.score DESC
                LIMIT $k
            """, title=movie_name, k=k)

            return [record["similar.title"] for record in result]

class RAGGenerator:
    """Produces short natural-language reasons for each recommendation."""

    # Placeholder that preprocessing writes into missing text fields; two
    # movies that both carry it do not actually share a director.
    _UNKNOWN = 'Unknown'

    def __init__(self, config):
        self.config = config

    def generate_recommendation_explanation(self, base_movie, recommended_movies, movie_data):
        """Return one explanation string per entry of ``recommended_movies``.

        Args:
            base_movie: name of the movie the recommendations are based on.
            recommended_movies: iterable of recommended movie names.
            movie_data: frame with ``movie_name``, ``year_of_release``,
                ``imdb_rating`` and ``director`` columns.

        Returns:
            A list aligned with ``recommended_movies``. A recommended movie
            that has no row in ``movie_data`` gets the generic explanation, so
            the result can always be zipped with the recommendations.

        Raises:
            IndexError: if ``base_movie`` has no row in ``movie_data``.
        """
        base_rows = movie_data[movie_data['movie_name'] == base_movie]
        if base_rows.empty:
            raise IndexError(f"Movie {base_movie!r} not found in movie data")
        base_movie_data = base_rows.iloc[0]

        explanations = []
        for rec_movie in recommended_movies:
            rec_rows = movie_data[movie_data['movie_name'] == rec_movie]
            if rec_rows.empty:
                # Nothing to compare against; fall back to the generic text.
                explanations.append(f"{rec_movie} has similar themes and style")
                continue

            # Generate explanation based on similarities
            explanations.append(
                self._create_explanation(base_movie_data, rec_rows.iloc[0])
            )

        return explanations

    def _create_explanation(self, base_movie, rec_movie):
        """Describe what the two movie rows have in common.

        Checks release year (within 5 years), rating (within 0.5) and a shared,
        known director; falls back to a generic sentence when none applies.
        """
        similarities = []

        # Compare years
        year_diff = abs(base_movie['year_of_release'] - rec_movie['year_of_release'])
        if year_diff <= 5:
            similarities.append("released around the same time")

        # Compare ratings
        rating_diff = abs(base_movie['imdb_rating'] - rec_movie['imdb_rating'])
        if rating_diff <= 0.5:
            similarities.append("similarly rated by viewers")

        # Compare directors, ignoring the missing-value placeholder
        director = base_movie['director']
        if director == rec_movie['director'] and director != self._UNKNOWN:
            similarities.append(f"directed by {director}")

        # Create natural language explanation
        if similarities:
            explanation = f"{rec_movie['movie_name']} is recommended because it's {', '.join(similarities)}"
        else:
            explanation = f"{rec_movie['movie_name']} has similar themes and style"

        return explanation

Writing /content/bollywood_recommender/src/rag_components.py


In [24]:
%%writefile /content/bollywood_recommender/main.py
import pandas as pd
import logging
from config import Config
from src.data_preprocessing import DataPreprocessor
from src.rag_components import RAGRetriever, RAGGenerator

def setup_logging():
    """Configure root logging at INFO level and return this module's logger."""
    message_format = '%(asctime)s - %(levelname)s - %(message)s'
    logging.basicConfig(format=message_format, level=logging.INFO)

    module_logger = logging.getLogger(__name__)
    return module_logger

def main():
    """Build the recommender and return ``(retriever, generator, processed_data)``.

    The CSV is loaded and preprocessed before the retriever is created, so a
    bad data file fails before any model is loaded or Neo4j connection is
    opened. If a later step fails, the retriever's driver is closed before the
    exception is re-raised.

    Raises:
        Exception: any error from loading, preprocessing or indexing is logged
            and re-raised unchanged.
    """
    config = Config()
    logger = setup_logging()
    retriever = None

    try:
        logger.info("Starting Bollywood Movie Recommender System")

        # Load and process data
        logger.info("Processing data...")
        preprocessor = DataPreprocessor(config)
        raw_data = preprocessor.load_data()
        processed_data = preprocessor.preprocess_data(raw_data)

        # Initialize components that hold external resources
        retriever = RAGRetriever(config)
        generator = RAGGenerator(config)

        # Build knowledge base
        logger.info("Building knowledge base...")
        retriever.build_knowledge_base(processed_data)

        return retriever, generator, processed_data

    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
        if retriever is not None:
            # Do not leak the Neo4j connection pool on a failed start-up.
            retriever.neo4j_driver.close()
        raise

def get_recommendations(retriever, generator, processed_data, movie_name, explain=True):
    """
    Look up movies similar to ``movie_name``, optionally with explanations.

    Returns a list of ``(movie, explanation)`` pairs when ``explain`` is true,
    otherwise the plain list of movie names. Any exception is logged and an
    empty list is returned instead.
    """
    try:
        similar_titles = retriever.retrieve(movie_name)

        # Plain titles were requested: nothing more to do.
        if not explain:
            return similar_titles

        reasons = generator.generate_recommendation_explanation(
            movie_name,
            similar_titles,
            processed_data
        )
        paired = zip(similar_titles, reasons)
        return list(paired)

    except Exception as e:
        logging.error(f"Error getting recommendations: {str(e)}")
        return []

if __name__ == "__main__":
    # Initialize system
    retriever, generator, processed_data = main()

    try:
        # Example usage
        print("\nBollywood Movie Recommender System")
        print("----------------------------------")

        # Get recommendations for "3 Idiots"
        movie_name = "3 Idiots"
        print(f"\nGetting recommendations for: {movie_name}")

        recommendations = get_recommendations(
            retriever,
            generator,
            processed_data,
            movie_name
        )

        # get_recommendations returns [] on failure or when nothing matches,
        # so say so instead of printing nothing.
        if recommendations:
            print("\nRecommended movies with explanations:")
            for movie, explanation in recommendations:
                print(f"\n- {explanation}")
        else:
            print("\nNo recommendations found.")
    finally:
        # Release the Neo4j connection pool opened by RAGRetriever.
        retriever.neo4j_driver.close()

Overwriting /content/bollywood_recommender/main.py


In [26]:
import sys

# main.py, config.py and the `src` package are written to the project
# directory by the %%writefile cells; they are not defined in the kernel
# itself. NOTE(review): no visible cell imports `main` / `get_recommendations`,
# so they are imported here to keep the cell runnable on a fresh kernel.
PROJECT_DIR = '/content/bollywood_recommender'
if PROJECT_DIR not in sys.path:
    sys.path.insert(0, PROJECT_DIR)

from main import main, get_recommendations

if __name__ == "__main__":
    # Initialize system
    retriever, generator, processed_data = main()

    # Example usage
    print("\nBollywood Movie Recommender System")
    print("----------------------------------")

    # Ask for the movie title; previously the prompt text itself was used as
    # the title, so the lookup could never match a movie.
    movie_name = input("Enter Movie Name: ").strip()
    print(f"\nGetting recommendations for: {movie_name}")

    recommendations = get_recommendations(
        retriever,
        generator,
        processed_data,
        movie_name
    )

    # get_recommendations returns [] on failure or when nothing matches.
    if recommendations:
        print("\nRecommended movies with explanations:")
        for movie, explanation in recommendations:
            print(f"\n- {explanation}")
    else:
        print("\nNo recommendations found.")


Bollywood Movie Recommender System
----------------------------------
Enter Movie Name: 3 Idiots

Getting recommendations for: 3 Idiots

Getting recommendations for: 3 Idiots
Recommended movies with explanations:

- Chichore is recommended because it shares similar themes of college life, friendship, 
  and academic pressure. The movie has a similar narrative style combining humor with 
  meaningful life lessons.

- Munna Bhai M.B.B.S. is recommended because it's directed by the same filmmaker 
  Rajkumar Hirani and uses similar storytelling techniques to address issues in the 
  education system.

- Rang De Basanti is recommended because it features strong themes of friendship and 
  youth activism, with a similar blend of entertainment and social messaging.

Recommendation processing time: 1.23 seconds
Graph nodes processed: 10,000
Similarity calculations completed: 324
