# Verifying import

In [14]:
def get_node_counts():
    """Count the nodes in the graph, grouped by their label list.

    Returns:
        Whatever ``run_query`` returns for the aggregation: one row per
        distinct ``labels(n)`` value, with columns ``Labels`` and ``Count``.
    """
    cypher = """
    MATCH (n)
    RETURN labels(n) AS Labels, COUNT(*) AS Count
    """
    return run_query(cypher)

# Sanity check after the import: print how many nodes exist per label list.
node_counts = get_node_counts()
print(node_counts)

       Labels  Count
0     [Movie]   9765
1     [Actor]  12643
2  [Director]   3672
3     [Genre]     22
4   [Keyword]  29519


In [15]:
def get_relationship_counts():
    """Count the relationships in the graph, grouped by relationship type.

    Returns:
        Whatever ``run_query`` returns for the aggregation: one row per
        ``type(r)`` value, with columns ``RelationshipType`` and ``Count``.
    """
    cypher = """
    MATCH ()-[r]->()
    RETURN type(r) AS RelationshipType, COUNT(*) AS Count
    """
    return run_query(cypher)

# Sanity check after the import: print how many relationships exist per type.
relationship_counts = get_relationship_counts()
print(relationship_counts)

   RelationshipType  Count
0  BELONGS_TO_GENRE  19319
1       HAS_KEYWORD  43696
2        SIMILAR_TO  28334
3          ACTED_IN  44962
4          DIRECTED   9973


# Recommendations

In [8]:
def run_query(query, parameters=None):
    """Execute a Cypher query and return its records as a pandas DataFrame.

    Args:
        query: Cypher statement to run.
        parameters: Optional dict of query parameters handed to the driver.

    Returns:
        A DataFrame with one row per record. Column names come from the
        query's result keys, so a query that matches nothing still returns an
        empty DataFrame that carries the expected columns (callers can merge
        on or select those columns without a KeyError).
    """
    with driver.session() as session:
        result = session.run(query, parameters)
        # Capture the column names up front so they survive an empty result.
        columns = list(result.keys())
        # Materialise every record while the session is still open.
        rows = [dict(record) for record in result]
    return pd.DataFrame(rows, columns=columns)

In [30]:
def get_actor_based_recommendations(movie_title, top_n=3):
    """Recommend movies that share cast members with the given movie.

    Args:
        movie_title: Value matched exactly against the ``title`` property of
            the seed ``Movie`` node.
        top_n: Maximum number of recommendations to return.

    Returns:
        The ``run_query`` result with columns ``RecommendedMovie``,
        ``Rating``, ``Year`` and ``shared_actors``, ordered by
        ``shared_actors`` and then ``rec.rating``, both descending.
    """
    # COUNT(DISTINCT a) counts each actor once. A plain COUNT(a) counts
    # pattern matches, which over-reports when an actor has duplicate
    # ACTED_IN relationships or when several Movie nodes share the title.
    # NOTE(review): the recorded output shows Year as None for every row;
    # confirm that Movie nodes actually carry a `year` property.
    query = """
    MATCH (m:Movie {title: $movie_title})<-[:ACTED_IN]-(a:Actor)-[:ACTED_IN]->(rec:Movie)
    WHERE m <> rec
    WITH rec, COUNT(DISTINCT a) AS shared_actors
    RETURN rec.title AS RecommendedMovie,
           rec.rating AS Rating,
           rec.year AS Year,
           shared_actors
    ORDER BY shared_actors DESC, rec.rating DESC
    LIMIT $top_n
    """
    # LIMIT needs an integer; int() also normalises numpy integers and
    # whole-number floats before they reach the driver.
    parameters = {'movie_title': movie_title, 'top_n': int(top_n)}
    return run_query(query, parameters)

In [31]:
# Seed movie and result size for the actor-based recommendation demo.
movie_title, top_n = "PK", 3

In [32]:
# Fetch the actor-based recommendations and print them under a heading.
actor_recommendations = get_actor_based_recommendations(movie_title, top_n)
print("\nActor-Based Recommendations:", actor_recommendations, sep="\n")


Actor-Based Recommendations:
       RecommendedMovie  Rating  Year  shared_actors
0   Munna Bhai M.B.B.S.     8.1  None              3
1  Lage Raho Munna Bhai     8.0  None              3
2              3 Idiots     8.4  None              2


In [9]:
def get_similar_movies(movie_title, top_n=5):
    """Look up movies linked from the given movie by ``SIMILAR_TO``.

    Args:
        movie_title: Value matched exactly against the ``title`` property of
            the seed ``Movie`` node.
        top_n: Value bound to the query's ``LIMIT``.

    Returns:
        The ``run_query`` result with columns ``RecommendedMovie``,
        ``Rating``, ``Year``, ``Plot`` and ``IMDbID``, ordered by
        ``rec.rating`` descending. Only outgoing ``SIMILAR_TO`` relationships
        from the seed movie are followed.
    """
    cypher = """
    MATCH (m:Movie {title: $movie_title})-[:SIMILAR_TO]->(rec:Movie)
    RETURN rec.title AS RecommendedMovie,
           rec.rating AS Rating,
           rec.year AS Year,
           rec.plot AS Plot,
           rec.imdb_id AS IMDbID
    ORDER BY rec.rating DESC
    LIMIT $top_n
    """
    return run_query(cypher, {'movie_title': movie_title, 'top_n': top_n})

# Final Recommendations App

In [None]:
# Install required packages
!pip install neo4j scikit-learn nltk

In [35]:
# Import libraries
import os
import re

import nltk
import numpy as np
import pandas as pd
from neo4j import GraphDatabase
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK resources used below: the stopword list and the
# tokenizer models.
for nltk_package in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_package)

# Connection settings are read from the environment so that credentials never
# have to be typed into (or committed with) the notebook. When a variable is
# not set, the previous hard-coded value is used as the default.
NEO4J_URI = os.environ.get("NEO4J_URI", "")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")

# Single driver instance shared by every run_query call in this notebook.
driver = GraphDatabase.driver(NEO4J_URI, auth=(NEO4J_USER, NEO4J_PASSWORD))

# Helper functions
def run_query(query, parameters=None):
    """Execute a Cypher query and return its records as a pandas DataFrame.

    Args:
        query: Cypher statement to run.
        parameters: Optional dict of query parameters handed to the driver.

    Returns:
        A DataFrame with one row per record. Column names come from the
        query's result keys, so a query that matches nothing still returns an
        empty DataFrame that carries the expected columns (the later merges
        on ``imdb_id`` therefore do not fail with a KeyError).
    """
    with driver.session() as session:
        result = session.run(query, parameters)
        # Capture the column names up front so they survive an empty result.
        columns = list(result.keys())
        # Materialise every record while the session is still open.
        rows = [dict(record) for record in result]
    return pd.DataFrame(rows, columns=columns)

# English stopwords held in a set so the membership test in preprocess_text is cheap.
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalise a plot string for TF-IDF vectorisation.

    The text is lower-cased, every character other than ``a``-``z`` and
    whitespace is removed, the remainder is tokenised with ``word_tokenize``,
    and tokens found in ``stop_words`` are dropped.

    Args:
        text: Raw text; must be a ``str``.

    Returns:
        The surviving tokens joined by single spaces.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    kept_tokens = (
        token for token in word_tokenize(letters_only) if token not in stop_words
    )
    return ' '.join(kept_tokens)

def fetch_movie_plots():
    """Fetch ``imdb_id``, ``title`` and ``plot`` for every ``Movie`` node.

    Returns:
        The ``run_query`` result with columns ``imdb_id``, ``title``, ``plot``.
    """
    cypher = """
    MATCH (m:Movie)
    RETURN m.imdb_id AS imdb_id, m.title AS title, m.plot AS plot
    """
    return run_query(cypher)

def fetch_movie_genres():
    """Fetch the genre names attached to each movie.

    Only movies with at least one ``BELONGS_TO_GENRE`` relationship appear in
    the result.

    Returns:
        The ``run_query`` result with columns ``imdb_id`` and ``genres``
        (the collected ``Genre.name`` values).
    """
    cypher = """
    MATCH (m:Movie)-[:BELONGS_TO_GENRE]->(g:Genre)
    RETURN m.imdb_id AS imdb_id, collect(g.name) AS genres
    """
    return run_query(cypher)

def fetch_movie_actors():
    """Fetch the actor names attached to each movie.

    Only movies with at least one incoming ``ACTED_IN`` relationship appear
    in the result.

    Returns:
        The ``run_query`` result with columns ``imdb_id`` and ``actors``
        (the collected ``Actor.name`` values).
    """
    cypher = """
    MATCH (m:Movie)<-[:ACTED_IN]-(a:Actor)
    RETURN m.imdb_id AS imdb_id, collect(a.name) AS actors
    """
    return run_query(cypher)

def fetch_movie_directors():
    """Fetch the director names attached to each movie.

    Only movies with at least one incoming ``DIRECTED`` relationship appear
    in the result.

    Returns:
        The ``run_query`` result with columns ``imdb_id`` and ``directors``
        (the collected ``Director.name`` values).
    """
    cypher = """
    MATCH (m:Movie)<-[:DIRECTED]-(d:Director)
    RETURN m.imdb_id AS imdb_id, collect(d.name) AS directors
    """
    return run_query(cypher)

def fetch_movie_keywords():
    """Fetch the keyword names attached to each movie.

    Only movies with at least one ``HAS_KEYWORD`` relationship appear in the
    result.

    Returns:
        The ``run_query`` result with columns ``imdb_id`` and ``keywords``
        (the collected ``Keyword.name`` values).
    """
    cypher = """
    MATCH (m:Movie)-[:HAS_KEYWORD]->(k:Keyword)
    RETURN m.imdb_id AS imdb_id, collect(k.name) AS keywords
    """
    return run_query(cypher)

# Fetch data
plots_df = fetch_movie_plots()
genres_df = fetch_movie_genres()
actors_df = fetch_movie_actors()
directors_df = fetch_movie_directors()
keywords_df = fetch_movie_keywords()

# Left-join each per-movie list onto the plot table so that every movie from
# fetch_movie_plots keeps its row even when it has no genres, actors, etc.
movies_df = (
    plots_df
    .merge(genres_df, on='imdb_id', how='left')
    .merge(actors_df, on='imdb_id', how='left')
    .merge(directors_df, on='imdb_id', how='left')
    .merge(keywords_df, on='imdb_id', how='left')
)

# Rows without a match come out of the left join as NaN; replace anything that
# is not a list with an empty list, and missing plots with an empty string.
movies_df = movies_df.assign(**{
    list_column: movies_df[list_column].apply(lambda x: x if isinstance(x, list) else [])
    for list_column in ('genres', 'actors', 'directors', 'keywords')
})
movies_df['plot'] = movies_df['plot'].fillna('')

# Clean every plot once so the vectorizer sees normalised text.
movies_df['processed_plot'] = movies_df['plot'].apply(preprocess_text)

# Fit TF-IDF on the cleaned plots; row i of the matrix corresponds to row i of
# movies_df.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(movies_df['processed_plot'])

# Similarity functions
def compute_similarity_scores(movie_title):
    """Score every movie's plot against the plot of ``movie_title``.

    The title is matched case-insensitively against ``movies_df['title']``;
    when several rows match, the first one is used. On success the cosine
    similarities between the target's TF-IDF row and all rows are written to
    ``movies_df['plot_similarity']``.

    Args:
        movie_title: Title of the movie to compare against.

    Returns:
        The index value of the matched row in ``movies_df``, or an empty
        ``DataFrame`` (after printing a message) when no title matches.
    """
    title_matches = movies_df['title'].str.lower() == movie_title.lower()
    matching_rows = movies_df.index[title_matches]
    if len(matching_rows) == 0:
        print(f"Movie '{movie_title}' not found in the database.")
        return pd.DataFrame()

    row = matching_rows[0]
    # Slice (rather than index) so the target stays a 2-D, single-row matrix.
    target_vector = tfidf_matrix[row:row + 1]
    movies_df['plot_similarity'] = cosine_similarity(target_vector, tfidf_matrix).flatten()
    return row

def jaccard_similarity(list1, list2):
    """Return the Jaccard index of two collections, treated as sets.

    Args:
        list1: First iterable of hashable items.
        list2: Second iterable of hashable items.

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float, or ``0.0`` when either collection
        is empty.
    """
    first, second = set(list1), set(list2)
    if first and second:
        return len(first & second) / len(first | second)
    return 0.0

def compute_additional_similarities(target_idx):
    """Add Jaccard similarity columns for genres, actors, directors, keywords.

    The row at position ``target_idx`` of ``movies_df`` is the reference; each
    list column of every row is compared against the reference row's list and
    the score is written to the matching ``*_similarity`` column.

    Args:
        target_idx: Position of the reference movie, used with ``iloc``.
    """
    reference = movies_df.iloc[target_idx]
    column_pairs = (
        ('genres', 'genre_similarity'),
        ('actors', 'actor_similarity'),
        ('directors', 'director_similarity'),
        ('keywords', 'keyword_similarity'),
    )
    for source_col, score_col in column_pairs:
        reference_items = reference[source_col]
        movies_df[score_col] = movies_df[source_col].apply(
            lambda items: jaccard_similarity(reference_items, items)
        )

# Contribution of each similarity component to the overall score.
weights = dict(
    plot_similarity=0.4,
    genre_similarity=0.2,
    actor_similarity=0.2,
    director_similarity=0.1,
    keyword_similarity=0.1,
)

def compute_overall_similarity():
    """Combine the five similarity columns into ``overall_similarity``.

    Each component column of ``movies_df`` is multiplied by its entry in
    ``weights`` and the products are summed; the result is stored in
    ``movies_df['overall_similarity']``.
    """
    components = (
        'plot_similarity',
        'genre_similarity',
        'actor_similarity',
        'director_similarity',
        'keyword_similarity',
    )
    weighted_terms = [weights[name] * movies_df[name] for name in components]
    # Accumulate left to right, in the listed order.
    total = weighted_terms[0]
    for term in weighted_terms[1:]:
        total = total + term
    movies_df['overall_similarity'] = total

def get_recommendations(movie_title, top_n=5):
    """Return the ``top_n`` movies most similar to ``movie_title``.

    Runs the plot, metadata and overall similarity steps (which write their
    scores into ``movies_df``), drops the target movie itself and ranks the
    rest by ``overall_similarity``, highest first.

    Args:
        movie_title: Title of the movie to find recommendations for.
        top_n: Number of rows to return.

    Returns:
        A DataFrame with the title and every similarity column for the best
        matches, or the empty DataFrame produced by
        ``compute_similarity_scores`` when the title is not found.
    """
    lookup = compute_similarity_scores(movie_title)
    if isinstance(lookup, pd.DataFrame):
        # The lookup failed and already reported it; pass the empty frame on.
        return lookup

    compute_additional_similarities(lookup)
    compute_overall_similarity()

    report_columns = [
        'title', 'overall_similarity', 'plot_similarity',
        'genre_similarity', 'actor_similarity',
        'director_similarity', 'keyword_similarity',
    ]
    ranked = (
        movies_df.loc[movies_df.index != lookup]
        .sort_values(by='overall_similarity', ascending=False)
    )
    return ranked[report_columns].head(top_n)



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [39]:
# Get recommendations
movie_title = "Dangal"  # Replace with your movie title
top_n = 5
recommendations = get_recommendations(movie_title, top_n)
print(f"Top {top_n} recommendations for '{movie_title}':")
# get_recommendations already limits the frame to top_n rows. A bare .head()
# would cap the display at 5 rows and silently hide results when top_n > 5.
recommendations

Top 5 recommendations for 'Dangal':


Unnamed: 0,title,overall_similarity,plot_similarity,genre_similarity,actor_similarity,director_similarity,keyword_similarity
3740,Shabnam Mausi,0.205361,0.138403,0.75,0.0,0.0,0.0
355,Mary Kom,0.2,0.0,1.0,0.0,0.0,0.0
578,Azhar,0.170582,0.051454,0.75,0.0,0.0,0.0
327,Saala Khadoos,0.166289,0.040721,0.75,0.0,0.0,0.0
3119,Machaan,0.162222,0.0,0.2,0.111111,1.0,0.0
