In [4]:
import django_jupyter
django_jupyter.init()

In [5]:
from movies.models import Movie
from ratings.models import Rating

#### Fetch Movies and Ratings Data

We'll retrieve all movies along with their associated genres and ratings. 
This will give us a comprehensive dataset to start building our recommendation system.

In [31]:
# Modify the data fetching code to include overviews
movies_qs = Movie.objects.all().prefetch_related('genres')
movies_data = [
    {
        "id": movie.id,
        "title": movie.title,
        "overview": movie.overview,
        "genres": [genre.name for genre in movie.genres.all()],
        "average_rating": movie.average_rating
    }
    for movie in movies_qs
]

movies_df = pd.DataFrame(movies_data)

# Display the updated DataFrame to verify the inclusion of overviews
print(movies_df.head())


     id                 title  \
0   708           Two Friends   
1  1082         The Sleepover   
2  2564  A Place at the Table   
3  4713    The Learning Curve   
4  5495         Bloody Sunday   

                                            overview  \
0  Two Sicilian friends, Nunzio and Pino, share t...   
1  The town of Derry has a secret, but no one tol...   
2  Using personal stories, this powerful document...   
3  Paul and Georgia are lovers, soul mates...and ...   
4  The dramatised story of the Irish civil rights...   

                                genres average_rating  
0                     [Drama, Foreign]           5.64  
1                     [Comedy, Horror]           6.49  
2                        [Documentary]           6.72  
3             [Drama, Crime, Thriller]           5.03  
4  [Adventure, Drama, Action, History]           7.35  


#### Data Cleaning

Now, let's ensure there are no missing values in critical columns, especially in the genres column, since it's essential for content-based filtering. 
We'll also check for any anomalies in the average_rating.

In [9]:
# Check for missing values in the DataFrame
print("Missing values before cleanup:")
print(movies_df.isnull().sum())

# Drop rows where genres are missing or empty (if any)
movies_df.dropna(subset=['genres'], inplace=True)
movies_df = movies_df[movies_df['genres'].map(len) > 0]

# Drop rows where overviews are missing or empty (if any)
movies_df.dropna(subset=['overview'], inplace=True)
movies_df = movies_df[movies_df['overview'].str.strip() != '']

# Fill missing average ratings with the mean (or a suitable default like 0)
if movies_df['average_rating'].isnull().any():
    default_rating = movies_df['average_rating'].mean()
    movies_df['average_rating'].fillna(default_rating, inplace=True)

# Check for missing values after cleanup
print("Missing values after cleanup:")
print(movies_df.isnull().sum())

# Display the cleaned DataFrame
print(movies_df.head())


Missing values before cleanup:
id                0
title             0
overview          0
genres            0
average_rating    0
dtype: int64
Missing values after cleanup:
id                0
title             0
overview          0
genres            0
average_rating    0
dtype: int64
     id                 title  \
0   708           Two Friends   
1  1082         The Sleepover   
2  2564  A Place at the Table   
3  4713    The Learning Curve   
4  5495         Bloody Sunday   

                                            overview  \
0  Two Sicilian friends, Nunzio and Pino, share t...   
1  The town of Derry has a secret, but no one tol...   
2  Using personal stories, this powerful document...   
3  Paul and Georgia are lovers, soul mates...and ...   
4  The dramatised story of the Irish civil rights...   

                                genres average_rating  
0                     [Drama, Foreign]           5.64  
1                     [Comedy, Horror]           6.49  
2        

#### Feature Engineering

Next, we'll convert the genres from lists to a format that can be used in machine learning models. 
One-hot encoding is an effective method for this, as it transforms each genre into a binary column, indicating the presence or absence of each genre for a movie.

In [30]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Ensure average_rating is a float
movies_df['average_rating'] = pd.to_numeric(movies_df['average_rating'], errors='coerce')

# Fill NaN values in 'overview' with empty strings
movies_df['overview'] = movies_df['overview'].fillna('')

# Initialize the MultiLabelBinarizer for genres
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(movies_df['genres'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_, index=movies_df.index)

# Initialize the TfidfVectorizer for movie descriptions
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
overview_tfidf = tfidf.fit_transform(movies_df['overview'])

# Convert the TF-IDF results to a DataFrame
overview_tfidf_df = pd.DataFrame(overview_tfidf.toarray(), columns=tfidf.get_feature_names_out(), index=movies_df.index)

# Concatenate all data into the original DataFrame
movies_df = pd.concat([movies_df.drop(['genres', 'overview'], axis=1), genres_df, overview_tfidf_df], axis=1)

# Display the updated DataFrame with encoded genres and processed descriptions
print(movies_df.head())


KeyError: 'overview'

#### Building the Similarity Model

Now that we have our feature-engineered dataset, the next step is to compute similarity scores between movies. We'll use cosine similarity, which is effective for measuring the similarity between two non-zero vectors in a multi-dimensional space (like our genres and TF-IDF vectors).

Here’s what we’ll do:

1. Combine Genre and Overview Features: Since we have features from genres and overviews, we need to combine these into a single matrix that represents all features of a movie.
2. Compute Cosine Similarity: We'll calculate the cosine similarity between every pair of movies based on their combined feature set. This will give us a similarity matrix.
3. Create a Recommendation Function: This function will use the similarity matrix to find and recommend the most similar movies based on a given movie input.

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix from the feature set
similarity_matrix = cosine_similarity(movies_df.drop(['id', 'title', 'average_rating'], axis=1))

# Convert the similarity matrix to a DataFrame for easier handling
similarity_df = pd.DataFrame(similarity_matrix, index=movies_df['id'], columns=movies_df['id'])

# Display a portion of the similarity matrix
print(similarity_df.head())


id       708       1082      2564      4713      5495      5496      5498   \
id                                                                           
708   1.000000  0.000000  0.000000  0.295743  0.258199  0.278200  0.343672   
1082  0.000000  1.000000  0.020224  0.000000  0.000000  0.000000  0.000000   
2564  0.000000  0.020224  1.000000  0.007285  0.000000  0.000000  0.000000   
4713  0.295743  0.000000  0.007285  1.000000  0.223607  0.673999  0.288675   
5495  0.258199  0.000000  0.000000  0.223607  1.000000  0.200000  0.258199   

id       5499      6414      6415   ...     59285     59286  59287  59288  \
id                                  ...                                     
708   0.000000  0.333333  0.000000  ...  0.315172  0.333333    0.0    0.0   
1082  0.408248  0.000000  0.000000  ...  0.000000  0.000000    0.0    0.0   
2564  0.000000  0.000000  0.000000  ...  0.000000  0.000000    0.5    0.5   
4713  0.000000  0.288675  0.223607  ...  0.250000  0.577350    0.0  

#### Create a Recommendation Function

Now, let's write a function that can use this similarity matrix to recommend movies. This function will take a movie ID and the number of recommendations to return, and it will find the most similar movies based on the similarity scores.

In [27]:
def recommend_movies(movie_id, num_recommendations=5):
    if movie_id not in similarity_df.index:
        return f"Movie ID {movie_id} not found in the dataset."
    
    sim_scores = similarity_df.loc[movie_id].sort_values(ascending=False)
    top_indices = sim_scores.iloc[1:num_recommendations+1].index
    
    # Select the relevant movies directly into a new DataFrame
    recommended_movies = movies_df[movies_df['id'].isin(top_indices)][['id', 'title', 'average_rating']]
    
    return recommended_movies

sample_movie_id = movies_df.iloc[0]['id']
recommendations = recommend_movies(sample_movie_id, 5)
print(recommendations.to_string(index=False))


   id                                    title  title average_rating
 9320 Balzac and the Little Chinese Seamstress    0.0           6.77
 9435                             Lost Embrace    0.0           6.45
 9555                    The Keys to the House    0.0           6.46
11319                             Hawaii, Oslo    0.0           6.69
11693                              The Italian    0.0           6.64


In [28]:
# Display DataFrame structure
print(movies_df.dtypes)

# Display the first few rows of the DataFrame to inspect actual values
print(movies_df.head())


id                  int64
title              object
average_rating     object
Action              int64
Adventure           int64
                   ...   
zealand           float64
zombie            float64
zombies           float64
zone              float64
zoo               float64
Length: 5023, dtype: object
     id                 title average_rating  Action  Adventure  Animation  \
0   708           Two Friends           5.64       0          0          0   
1  1082         The Sleepover           6.49       0          0          0   
2  2564  A Place at the Table           6.72       0          0          0   
3  4713    The Learning Curve           5.03       0          0          0   
4  5495         Bloody Sunday           7.35       1          1          0   

   Comedy  Crime  Documentary  Drama  ...  younger  youngest  youth  youthful  \
0       0      0            0      1  ...      0.0       0.0    0.0       0.0   
1       1      0            0      0  ...      0.0     

In [29]:
# Assuming you have a function or code block that constructs the similarity matrix
# Example snippet to display part of the similarity matrix
print(similarity_matrix[:5, :5])  # Adjust as necessary based on actual variable names and sizes


[[1.         0.         0.         0.29574298 0.25819889]
 [0.         1.         0.02022423 0.         0.        ]
 [0.         0.02022423 1.         0.00728507 0.        ]
 [0.29574298 0.         0.00728507 1.         0.2236068 ]
 [0.25819889 0.         0.         0.2236068  1.        ]]
