In [1]:
import django_jupyter
django_jupyter.init()

##### 1.1 Load Movie Metadata
We need to load the movie metadata, such as genres, directors, and plot summaries, from your Django database. Let’s begin by fetching this data.

In [2]:
import pandas as pd
from movies.models import Movie

# Fetch movie metadata from the database
movies = Movie.objects.all().values('id', 'title', 'overview', 'genres__name')

# Convert the queryset to a DataFrame
movies_df = pd.DataFrame(list(movies))

# Display the first few rows to verify
movies_df.head()


Unnamed: 0,id,title,overview,genres__name
0,708,Two Friends,"Two Sicilian friends, Nunzio and Pino, share t...",Drama
1,708,Two Friends,"Two Sicilian friends, Nunzio and Pino, share t...",Foreign
2,712,War Stories Our Mother Never Told Us,Seven New Zealand women speak about their live...,Unknown
3,735,Vermont Is for Lovers,Vermont is for Lovers is an independently prod...,Unknown
4,869,Venice,An atmospheric coming-of-age story featuring a...,Romance


In [3]:
import pandas as pd
from movies.models import Movie, Genre

def get_movie_data():
    # Query all movies
    movies = Movie.objects.all()
    
    # Prepare lists to hold data
    movie_ids = []
    titles = []
    overviews = []
    genres = []
    
    # Loop through each movie to extract necessary details
    for movie in movies:
        movie_ids.append(movie.id)
        titles.append(movie.title)
        overviews.append(movie.overview if movie.overview else "")
        # Extract genre names for each movie
        genres.append([genre.name for genre in movie.genres.all()])
    
    # Create a DataFrame
    movie_data = pd.DataFrame({
        'id': movie_ids,
        'title': titles,
        'overview': overviews,
        'genres': genres
    })
    
    return movie_data

# Run the function to get the movie data
movie_data = get_movie_data()

# Display the first few rows of the DataFrame
print(movie_data.head())


     id                                 title  \
0   708                           Two Friends   
1   712  War Stories Our Mother Never Told Us   
2   735                 Vermont Is for Lovers   
3   869                                Venice   
4  1082                         The Sleepover   

                                            overview            genres  
0  Two Sicilian friends, Nunzio and Pino, share t...  [Drama, Foreign]  
1  Seven New Zealand women speak about their live...         [Unknown]  
2  Vermont is for Lovers is an independently prod...         [Unknown]  
3  An atmospheric coming-of-age story featuring a...  [Romance, Drama]  
4  The town of Derry has a secret, but no one tol...  [Comedy, Horror]  


##### Step 1.2: Feature Extraction and Representation
We will:

Transform the overview (plot summaries) into numerical features using TF-IDF (Term Frequency-Inverse Document Frequency).
Encode the genres using one-hot encoding to represent the categorical data numerically.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1: Vectorize the movie overviews using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
tfidf_matrix = tfidf_vectorizer.fit_transform(movie_data['overview'].fillna(''))

# Step 2: One-hot encode the genres
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movie_data['genres'])

# Display the shapes of the resulting matrices to verify the process
print(f"TF-IDF Matrix Shape: {tfidf_matrix.shape}")
print(f"Genre Matrix Shape: {genre_matrix.shape}")


TF-IDF Matrix Shape: (16518, 3000)
Genre Matrix Shape: (16518, 21)


In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Step 1: Combine the TF-IDF and genre matrices
combined_features = np.hstack([tfidf_matrix.toarray(), genre_matrix])

# Step 2: Compute the cosine similarity between movies
cosine_sim = cosine_similarity(combined_features)

# Step 3: Create a function to recommend similar movies
def get_content_based_recommendations(movie_id, top_n=10):
    # Get the index of the movie that matches the given movie_id
    movie_index = movie_data.index[movie_data['id'] == movie_id][0]

    # Get the pairwise similarity scores of all movies with that movie
    similarity_scores = list(enumerate(cosine_sim[movie_index]))

    # Sort the movies based on the similarity scores
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the top_n most similar movies
    similarity_scores = similarity_scores[1:top_n+1]  # Exclude the movie itself

    # Get the movie indices
    movie_indices = [i[0] for i in similarity_scores]

    # Return the top_n most similar movies
    return movie_data.iloc[movie_indices][['id', 'title', 'overview']]

# Test the recommendation function
recommended_movies = get_content_based_recommendations(movie_id=708, top_n=10)
print(recommended_movies)


          id                    title  \
7122   18800   The Day I Was Not Born   
6421   17546            Help Me, Eros   
1097    6445                  Seaside   
8078   20384                    Udaan   
357     4257             Pandaemonium   
13690  31238        Swimming Upstream   
13248  30545                Rock On!!   
11844  27773                   Amador   
4966   14830  Letters to Father Jacob   
3979   13004                 The Pool   

                                                overview  
7122   During a stopover in Buenos Aires on her way t...  
6421   Ah Jie lost everything in the stock market due...  
1097   Seaside takes place in a small coastal town on...  
8078   After being abandoned for eight straight years...  
357    Set in England during the early 19th century, ...  
13690  This is a heart-wrenching story about a happy-...  
13248  Sakshi Shroff, the lonely and neglected wife o...  
11844  A drama centered on a young woman who takes a ...  
4966   With few o

In [10]:
def inspect_recommendations(movie_id, top_n=10):
    recommended_movies = get_content_based_recommendations(movie_id, top_n=top_n)
    
    print(f"Recommendations for Movie ID: {movie_id}")
    for item in recommended_movies:
        # Check the number of elements in each tuple
        if len(item) == 3:
            movie_id, title, similarity = item
            print(f"Title: {title}, Similarity: {similarity:.4f}")
        elif len(item) == 2:
            movie_id, title = item
            print(f"Title: {title}")
        else:
            print(f"Unexpected format: {item}")

# Inspect recommendations for a few movie IDs
inspect_recommendations(movie_id=708)  # Replace with a valid movie ID from your dataset
inspect_recommendations(movie_id=712)
inspect_recommendations(movie_id=869)


Recommendations for Movie ID: 708
Title: d
Unexpected format: title
Unexpected format: overview
Recommendations for Movie ID: 712
Title: d
Unexpected format: title
Unexpected format: overview
Recommendations for Movie ID: 869
Title: d
Unexpected format: title
Unexpected format: overview


In [13]:
def inspect_recommendations(movie_id, top_n=10):
    recommended_movies = get_content_based_recommendations(movie_id, top_n=top_n)
    
    print(f"Raw recommendations for Movie ID: {movie_id}")
    for item in recommended_movies:
        print(item)

# Inspect recommendations for a few movie IDs
inspect_recommendations(movie_id=708)  # Replace with a valid movie ID from your dataset
inspect_recommendations(movie_id=712)
inspect_recommendations(movie_id=869)


Raw recommendations for Movie ID: 708
(708, 'Movie Title 1', 0.89)
(712, 'Movie Title 2', 0.85)
Raw recommendations for Movie ID: 712
(708, 'Movie Title 1', 0.89)
(712, 'Movie Title 2', 0.85)
Raw recommendations for Movie ID: 869
(708, 'Movie Title 1', 0.89)
(712, 'Movie Title 2', 0.85)


In [15]:
def get_content_based_recommendations(movie_id, top_n=10):
    # Assuming you have some cosine similarity calculation
    idx = movie_data[movie_data['id'] == movie_id].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the indices of the most similar movies
    sim_scores = sim_scores[1:top_n+1]
    
    # Fetch the movie titles and IDs for the recommendations
    movie_indices = [i[0] for i in sim_scores]
    movie_ids = movie_data.iloc[movie_indices]['id']
    movie_titles = movie_data.iloc[movie_indices]['title']
    similarities = [i[1] for i in sim_scores]
    
    # Return movie IDs, titles, and similarity scores
    return list(zip(movie_ids, movie_titles, similarities))


In [16]:
print(movie_data.head())  # This should display the first few rows of your DataFrame to confirm the content


     id                                 title  \
0   708                           Two Friends   
1   712  War Stories Our Mother Never Told Us   
2   735                 Vermont Is for Lovers   
3   869                                Venice   
4  1082                         The Sleepover   

                                            overview            genres  
0  Two Sicilian friends, Nunzio and Pino, share t...  [Drama, Foreign]  
1  Seven New Zealand women speak about their live...         [Unknown]  
2  Vermont is for Lovers is an independently prod...         [Unknown]  
3  An atmospheric coming-of-age story featuring a...  [Romance, Drama]  
4  The town of Derry has a secret, but no one tol...  [Comedy, Horror]  


In [17]:
def get_content_based_recommendations(movie_id, top_n=10):
    # Find the index of the movie that matches the movie_id
    idx = movie_data[movie_data['id'] == movie_id].index[0]
    
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the most similar movies
    sim_scores = sim_scores[1:top_n+1]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top n most similar movies with their titles and similarity scores
    movie_ids = movie_data.iloc[movie_indices]['id']
    movie_titles = movie_data.iloc[movie_indices]['title']
    similarities = [i[1] for i in sim_scores]
    
    return list(zip(movie_ids, movie_titles, similarities))


In [18]:
inspect_recommendations(movie_id=708)
inspect_recommendations(movie_id=712)
inspect_recommendations(movie_id=869)


Raw recommendations for Movie ID: 708
(18800, 'The Day I Was Not Born', 0.7152512779931863)
(17546, 'Help Me, Eros', 0.7035222322230169)
(6445, 'Seaside', 0.702364254967664)
(20384, 'Udaan', 0.6985847075365609)
(4257, 'Pandaemonium', 0.6958151035935873)
(31238, 'Swimming Upstream', 0.6957417525493255)
(30545, 'Rock On!!', 0.6937620665100183)
(27773, 'Amador', 0.6918964742329379)
(14830, 'Letters to Father Jacob', 0.6896815710535735)
(13004, 'The Pool', 0.6888374375626181)
Raw recommendations for Movie ID: 712
(16087, 'Ten Skies', 0.7071067811865475)
(28981, 'WWII IN HD', 0.6345886534938537)
(5676, 'The Rising Place', 0.619099773002756)
(29877, 'Kokoko', 0.5942586649160204)
(36029, 'Liquidation', 0.5818690966283939)
(32000, 'The Capsule', 0.5745344499645988)
(22739, 'Los rollos perdidos de Pancho Villa', 0.5728044666854295)
(15577, 'Love in Another Language', 0.5688343585965738)
(34926, 'The Saboteurs', 0.5667821722626579)
(22275, "Moms Mabley: I Got Somethin' to Tell You", 0.5632470911

In [22]:
def get_content_based_recommendations(movie_id, top_n=10):
    # Find the index of the movie that matches the movie_id
    matching_movie = movie_data[movie_data['id'] == movie_id]
    
    if matching_movie.empty:
        print(f"Movie ID {movie_id} not found in the dataset.")
        return []
    
    idx = matching_movie.index[0]
    
    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the top n most similar movies
    sim_scores = sim_scores[1:top_n+1]
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top n most similar movies
    return [(movie_data.iloc[i]['id'], movie_data.iloc[i]['title'], sim_scores[idx][1]) for idx, i in enumerate(movie_indices)]



In [24]:
def evaluate_model(test_data, top_n=10):
    precisions = []
    recalls = []
    
    for user_id, actual_likes in test_data.items():
        recommendations = get_content_based_recommendations(user_id, top_n=top_n)
        recommended_movie_ids = [movie_id for movie_id, _, _ in recommendations]
        
        if len(recommended_movie_ids) == 0:
            print(f"No recommendations available for Movie ID: {user_id}")
            continue  # Skip to the next user/movie if no recommendations are found
        
        # Precision and recall
        true_positives = len(set(recommended_movie_ids) & set(actual_likes))
        
        if len(recommended_movie_ids) > 0:
            precision = true_positives / len(recommended_movie_ids)
        else:
            precision = 0
        
        if len(actual_likes) > 0:
            recall = true_positives / len(actual_likes)
        else:
            recall = 0
        
        precisions.append(precision)
        recalls.append(recall)
    
    # Calculate average precision and recall
    avg_precision = sum(precisions) / len(precisions) if precisions else 0
    avg_recall = sum(recalls) / len(recalls) if recalls else 0
    
    print(f"Average Precision: {avg_precision:.4f}")
    print(f"Average Recall: {avg_recall:.4f}")


In [25]:
# Re-run the evaluation with the updated function
evaluate_model(test_data)


Movie ID 1 not found in the dataset.
No recommendations available for Movie ID: 1
Movie ID 2 not found in the dataset.
No recommendations available for Movie ID: 2
Average Precision: 0.0000
Average Recall: 0.0000


In [27]:
# Assuming the previous steps have been completed and the model has been trained
from sklearn.metrics import precision_score, recall_score
import numpy as np

# Define the test data with valid movie IDs from your dataset
test_data = {
    36184: [3707, 3630, 869],  # User 36184 likes these movies
    3707: [36184, 3630],       # User 3707 likes these movies
    3630: [3707, 36184],       # User 3630 likes these movies
}

# Function to evaluate the content-based recommendation model
def evaluate_model(test_data, top_n=20):
    precisions = []
    recalls = []
    
    for movie_id, actual_likes in test_data.items():
        recommendations = get_content_based_recommendations(movie_id, top_n=top_n)
        recommended_movie_ids = [movie_id for movie_id, _, _ in recommendations]
        
        if not recommended_movie_ids:
            print(f"No recommendations available for Movie ID: {movie_id}")
            continue
        
        # Precision and recall
        true_positives = len(set(recommended_movie_ids) & set(actual_likes))
        precision = true_positives / len(recommended_movie_ids) if recommended_movie_ids else 0
        recall = true_positives / len(actual_likes) if actual_likes else 0
        
        precisions.append(precision)
        recalls.append(recall)
    
    # Calculate and print the average precision and recall
    average_precision = np.mean(precisions) if precisions else 0
    average_recall = np.mean(recalls) if recalls else 0
    
    print(f"Average Precision: {average_precision:.4f}")
    print(f"Average Recall: {average_recall:.4f}")

# Re-run the evaluation with the valid movie IDs
evaluate_model(test_data)


Average Precision: 0.0000
Average Recall: 0.0000
