In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json 
import requests
import pickle
import re
import urllib.parse
import sys
import time
from IPython.display import Image

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, MultiLabelBinarizer, normalize
from sklearn.preprocessing import QuantileTransformer, RobustScaler, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from scipy.sparse import csr_matrix
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
pd.set_option('display.max_columns', 25)

In [2]:
f = open("token_TMDB.txt","r")
token = f.read()
headers = {"accept": "application/json", "Authorization": f"Bearer {token}"}

In [3]:
data= pickle.load(open('merged.pkl', 'rb'))

In [4]:
data['director'] = data['director'].str.replace(' ', '_').str.replace(',', ' ')
data['writers'] = data['writers'].str.replace(' ', '_').str.replace(',', ' ')
data['cast'] = data['cast'].str.replace(' ', '_').str.replace(',', ' ')

In [5]:
def get_description(row):
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    keywords = row['keywords'] if isinstance(row['keywords'], str) else ''
    actors = row['cast'] if isinstance(row['cast'], str) else ''
    director = row['director'] if isinstance(row['director'], str) else ''
    writers = row['writers'] if isinstance(row['writers'], str) else ''
    crew = ' '.join([director, writers])
    countries = row['production_countries'] if isinstance(row['production_countries'], str) else ''
    overview = row['overview'] if isinstance(row['overview'], str) else ''

    tokens = word_tokenize(overview)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]

    processed_overview_stemmed = ' '.join(stemmed_tokens)
    processed_overview_lemmatized = ' '.join(lemmatized_tokens)

    description = f"{keywords} {processed_overview_stemmed} {actors} {crew} {countries}"

    return description


In [None]:
a = data[data['id']=='tt1375666']

In [None]:
get_description(data[data['id']=='tt1375666'].iloc[0])

In [None]:
descriptions = data.apply(get_description, axis=1)

In [None]:
descriptions.isna().any()

In [None]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, max_features=1000)
tfidf_matrix = vectorizer.fit_transform(descriptions)

In [None]:
%%time
similarity_matrix1 = cosine_similarity(tfidf_matrix)

In [None]:
genres = []
for index, row in data.iterrows():
    genre = row['genres'].split(' ')
    genres.append(genre) 
    
#one hot encoder does not work on list
mlb = MultiLabelBinarizer()
genres_mlb = mlb.fit_transform(genres)
family_index = np.where(mlb.classes_ == "Family")[0]

# Multiply the encoding of "family" genre by 2
if family_index.size > 0:
    genres_mlb[:, family_index] *= 2

print(genres_mlb)

In [None]:
genres

In [None]:
genres_matrix1 = cosine_similarity(genres_mlb)
np.fill_diagonal(genres_matrix1, 0) #nullify movie's genre similarity to itself so that it wouldnt recommend itself

In [None]:
# with open('genres_matrix.pkl', 'wb') as f:
#     pickle.dump(genres_matrix1, f)

In [None]:
# with open('similarity_matrix.pkl', 'wb') as f:
#     pickle.dump(similarity_matrix1, f)

In [None]:
with open('genres_matrix.pkl', 'rb') as f:
    genres_matrix1 = pickle.load(f)

In [None]:
with open('similarity_matrix.pkl', 'rb') as f:
    similarity_matrix1 = pickle.load(f)

In [None]:
scaler_release_year = MinMaxScaler()
release_year_norm = scaler_release_year.fit_transform(data['release_date'].values.reshape(-1, 1))

scaler_ratings = MinMaxScaler()
ratings_norm = scaler_ratings.fit_transform(data['average_rating'].values.reshape(-1, 1))
ratings_norm_scaled = ratings_norm * 2

#scaler_popularity = QuantileTransformer()
#vote_count_data = data['vote_count'].values.reshape(-1, 1)
#popularity_norm = scaler_popularity.fit_transform(vote_count_data)

scaler_popularity = RobustScaler()
popularity_norm = scaler_popularity.fit_transform(data['vote_count'].values.reshape(-1, 1))
popularity_norm_scaled = popularity_norm * 2

combined_features = np.hstack((release_year_norm, ratings_norm_scaled, popularity_norm_scaled))
popularity_matrix1 = cosine_similarity(combined_features)


In [None]:
#similarity_release_year = cosine_similarity(release_year_norm)
#similarity_ratings = cosine_similarity(ratings_norm)


In [None]:
#combined_features = np.hstack((release_year_norm, ratings_norm))
#popularity_matrix1 = cosine_similarity(combined_features)

In [6]:
def movie_recommender_weighted(movie_id, data, similarity_matrix, genre_matrix, 
                               popularity_matrix, top_n=3, genre_weight=0.20):
    # Find movie index
    movie_index = np.where(data['id'] == movie_id)[0]
    if len(movie_index) == 0:
        print("Movie ID not found.")
        return []

    movie_index = movie_index[0]

    # Calculate combined scores efficiently using array operations
    similarity_scores = similarity_matrix[movie_index]
    genre_scores = genre_matrix[movie_index]
    popularity_scores = popularity_matrix[movie_index]
    combined_scores = ((0.7 - genre_weight) * similarity_scores + 
                       genre_weight * genre_scores + 0.3 * popularity_scores)

    # Exclude the movie itself from recommendations
    combined_scores[movie_index] = -np.inf

    # Get indices of top similar movies
    top_indices = np.argpartition(combined_scores, -top_n)[-top_n:]

    # Get movie titles based on indices
    similar_movies = data.iloc[top_indices]['primary_title']

    return similar_movies.tolist()

In [21]:
movie_recommender_weighted('tt0499549', d, similarity_matrix2, genres_matrix2, popularity_matrix2,top_n=10)

['Independence Day',
 'Star Wars Episode IV A New Hope',
 'Star Wars Episode VI Return of the Jedi',
 'Dune',
 'Star Wars Episode V The Empire Strikes Back',
 'Stargate',
 'Rogue One A Star Wars Story',
 'Valerian and the City of a Thousand Planets',
 'Aliens',
 'Star Wars Episode VIII The Last Jedi']

In [22]:
movie_recommender_weighted('tt1375666', d, similarity_matrix2, genres_matrix2, popularity_matrix2,top_n=10)

['The Terminator',
 'The Fifth Element',
 'Total Recall',
 'The Matrix Reloaded',
 'Jurassic World Fallen Kingdom',
 'The Day After Tomorrow',
 'Armageddon',
 'The Matrix',
 'Waterworld',
 'Terminator Salvation']

In [23]:
movie_recommender_weighted('tt1109624', d, similarity_matrix2, genres_matrix2, popularity_matrix2,top_n=10)

['Diary of a Wimpy Kid',
 'Aladdin',
 'Peter Pan',
 'Mary Poppins',
 'Charlie and the Chocolate Factory',
 'Charlotte s Web',
 'Where the Wild Things Are',
 'Night at the Museum',
 'Paddington 2',
 'Mary Poppins Returns']

In [20]:
%%time
similarity_matrix2, genres_matrix2, popularity_matrix2 = calculate_similarity_matrices(d)

CPU times: user 5.99 s, sys: 2.16 s, total: 8.16 s
Wall time: 5.47 s


In [19]:
d = data[(data['average_rating']>6) & (data['vote_count']>10000)]

In [None]:
d

In [18]:
from Scrapping import Scraper

In [26]:
def calculate_similarity_matrices(data):
    
    # Calculate TF-IDF matrix for descriptions
    descriptions = data.apply(get_description, axis=1)
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7, max_features=1000)
    tfidf_matrix = vectorizer.fit_transform(descriptions)
    similarity_matrix1 = cosine_similarity(tfidf_matrix)

    # Calculate similarity matrix for genres
    genres = []
    for index, row in data.iterrows():
        genre = row['genres'].split(' ')
        genres.append(genre) 

    mlb = MultiLabelBinarizer()
    genres_mlb = mlb.fit_transform(genres)
    family_index = np.where(mlb.classes_ == "Family")[0]
    animation_index = np.where(mlb.classes_ == "Animation")[0]
    if animation_index.size > 0:
        genres_mlb[:, animation_index] *= 2

    if family_index.size > 0:
        genres_mlb[:, family_index] *= 2

    genres_matrix1 = cosine_similarity(genres_mlb)
    np.fill_diagonal(genres_matrix1, 0)

    scaler_release_year = StandardScaler()
    release_year_norm = scaler_release_year.fit_transform(data['release_date'].values.reshape(-1, 1))

    scaler_ratings = StandardScaler()
    ratings_norm = scaler_ratings.fit_transform(data['average_rating'].values.reshape(-1, 1))
    ratings_norm_scaled = ratings_norm * 2

    scaler_popularity = RobustScaler()
    popularity_norm = scaler_popularity.fit_transform(data['vote_count'].values.reshape(-1, 1))
    popularity_norm_scaled = popularity_norm * 2

    combined_features = np.hstack((release_year_norm, ratings_norm_scaled, popularity_norm_scaled))

    popularity_matrix1 = cosine_similarity(combined_features)

    return similarity_matrix1, genres_matrix1, popularity_matrix1

In [27]:
def final_recommender(data):
    title = input('Enter a movie name: ')
    url = f"https://api.themoviedb.org/3/search/movie?query={title}&include_adult=false&language=en-US&page=1"
    res = requests.get(url, headers=headers).json()
    weight = 0.2
    n = 3
    if res and 'results' in res and len(res['results']) > 0:
        release_year = res['results'][0]['release_date'].split('-')[0]
        print('The movie entered:')
        print(res['results'][0]['title'],'-',release_year)
        print('\nSearching database...')
        movie_id = res['results'][0]['id']
        url2 = f"https://api.themoviedb.org/3/movie/{movie_id}/external_ids"
        res2 = requests.get(url2, headers=headers).json()
        imdb_id = res2['imdb_id']
        
        if imdb_id in data['id'].values:
            if not ((data[data['id'] == imdb_id]['average_rating'] > 6) & 
                    (data[data['id'] == imdb_id]['vote_count'] > 5000)).any():
                    data = pd.concat([data[(data['average_rating'] > 6) & (data['vote_count'] > 5000)]
                                      , data[(data['id'] == imdb_id)]])
            else:
                data = data[(data['average_rating'] > 6) & (data['vote_count'] > 5000)]
            
            similarity, genre, popularity = calculate_similarity_matrices(data)
            recommended = movie_recommender_weighted(imdb_id, data, 
                                                     similarity, genre, popularity,
                                                     top_n=n, genre_weight=weight)

            print(f"\nMovies similar to '{res['results'][0]['title']}':")
            
            for i, title in enumerate(recommended, start=1):
                print(f"{i}. {title}")
                rec_id = data[data['primary_title'] == title]['id'].values[0]
                rec_res, poster, tmdb_id = fetch_movie_details([rec_id],headers)
                if not rec_res.empty: 
                    poster_url = 'https://image.tmdb.org/t/p/original'+ poster
                    display(Image(url=poster_url, width=200, height=300))
                lttx = 'https://letterboxd.com/tmdb/'+str(tmdb_id)
                imdb = 'https://www.imdb.com/title/'+str(rec_id)
                print('IMDB link:')
                print(imdb)
                print(f'Overview:\n\n{rec_res["overview"].values[0]}')
                print(f'\nMost popular reviews on Letterboxd about {title}:\n')
                reviews = Scraper.get_reviews_from_link(lttx, num_reviews=3)
                for review in reviews:
                    print(review)
                    print('\n')
                    time.sleep(0.5)
                    
        else:
            print(f"We couldn't find {title} in the database. Please try another movie. ")

    else: 
        print(f"We couldn't find {title} in the database. Please check if there is a spelling mistake.")

In [None]:
final_recommender(data)

Enter a movie name: barbie
The movie entered:
Barbie - 2023

Searching database...


In [None]:
data['average_rating'].describe()

In [9]:
def fetch_movie_details(series, headers):
    df_combined = pd.DataFrame()  
    for movie_id in series:
        url = f"https://api.themoviedb.org/3/movie/{movie_id}?&append_to_response=keywords,credits,external_ids"
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            movie_df, poster, tmdb_id = format_movie_details_tmdb(response)
            df_combined = pd.concat([df_combined, movie_df], ignore_index=True)  
        else:
            print(f"Failed to fetch details for movie with ID: {movie_id}")
                 
    data = df_combined[['id', 'primary_title', 'original_title', 'release_date', 'runtime',
                       'keywords', 'overview', 'production_countries', 'average_rating',
                       'vote_count', 'genres']].copy()
    return data, poster, tmdb_id

In [10]:
def format_movie_details_tmdb(response):
    data = response.json()
    relevant_data = {
        'id': data['external_ids']['imdb_id'],
        'primary_title': data['title'],
        'original_title': data['original_title'],
        'release_date': data['release_date'].split('-')[0],
        'runtime': data['runtime'],
        'keywords': ' '.join(keyword['name'] for keyword in data['keywords']['keywords']),
        'overview': data['overview'],
        'production_countries': ' '.join(country['name'].replace(' ', '_') 
                                         for country in data['production_countries']),
        'average_rating': data['vote_average'],
        'vote_count': data['vote_count'],
        'genres': ' '.join(genre['name'].replace(' ', '_') 
                                         for genre in data['genres']),
    }
    poster = data['poster_path']
    tmdb_id = data['id']

    df = pd.DataFrame([relevant_data])
    return df, poster, tmdb_id

In [11]:
data

Unnamed: 0,id,primary_title,original_title,release_date,runtime,keywords,overview,production_countries,average_rating,vote_count,genres,director,writers,cast
0,tt0029284,My Favorite Wife,My Favorite Wife,1940,88,jealousy judge shipwreck marriage bigamy confu...,Years after she was presumed dead in a shipwre...,United_States_of_America,7.3,11501,Comedy Romance,Garson_Kanin,Bella_Spewack Garson_Kanin Sam_Spewack John_Mc...,Cary_Grant Randolph_Scott Scotty_Beckett Donal...
1,tt0031359,Gaslight,Gaslight,1940,84,marriage murder driven mad rubies,Twenty years removed from Alice Barlow s murde...,United_Kingdom,7.3,5264,Mystery Thriller,Thorold_Dickinson,Bridget_Boland A.R._Rawlinson Patrick_Hamilton,Anton_Walbrook Frank_Pettingell Robert_Newton ...
2,tt0031976,The Stars Look Down,The Stars Look Down,1940,110,trade union miner social commentary labor union,Davey Fenwick leaves his mining village on a u...,United_Kingdom,7.0,1178,Drama,Carol_Reed,J.B._Williams A.J._Cronin A._Coppel,Michael_Redgrave Edward_Rigby Emlyn_Williams A...
3,tt0032179,21 Days Together,21 Days,1940,72,wrongful conviction,After Larry Darrent accidentally kills his lov...,United_Kingdom,6.1,1276,Crime Drama Romance,Basil_Dean,Basil_Dean Graham_Greene John_Galsworthy,Leslie_Banks Laurence_Olivier Francis_L._Sulli...
4,tt0032181,Abe Lincoln in Illinois,Abe Lincoln in Illinois,1940,110,biography historical figure abraham lincoln,Abe Lincoln in Illinois is a 1940 biographical...,United_States_of_America,7.3,2052,Biography Drama History,John_Cromwell,Grover_Jones Robert_E._Sherwood,Raymond_Massey Gene_Lockhart Minor_Watson Alan...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31959,tt9907782,The Cursed,Eight for Silver,2021,111,gypsy world war i gore flashback werewolf crea...,In the late 19th century a brutal land baron s...,United_States_of_America,6.2,16278,Fantasy Horror Mystery,Sean_Ellis,Sean_Ellis,Boyd_Holbrook Alistair_Petrie Nigel_Betts Stua...
31960,tt9908390,Le lion,Le lion,2020,95,espionage,A psychiatric hospital patient pretends to be ...,France,5.5,1407,Comedy,Ludovic_Colbeau-Justin,Alexandre_Coquelle Matthieu_Le_Naour,Dany_Boon Philippe_Katerine Samuel_Jouy Benoît...
31961,tt9911196,The Marriage Escape,De beentjes van Sint Hildegard,2020,103,,Jan has been married to Gedda for 35 years Ged...,Netherlands,7.4,3321,Comedy Drama,Johan_Nijenhuis,Herman_Finkers Maarten_Lebens Mirka_Zlatníková...,Herman_Finkers Ferdi_Stofmeel Stef_Assen Jan_R...
31962,tt9916270,Il talento del calabrone,Il talento del calabrone,2020,84,,Dj Steph is a young radio deejay on the rise w...,Italy,5.8,1480,Thriller,Giacomo_Cimini,Giacomo_Cimini Lorenzo_Collalti,Sergio_Castellitto Lorenzo_Richelmy David_Coco...


In [12]:
# def get_companies(data, separator=' '):
#     companies = []
#     for company in data:
#         if company['name']:
#             company = format_name(company['name'])
#             companies.append(company)
#     if companies:
#         return separator.join(companies)
#     else:
#         return ''


In [13]:
# def movie_recommender_weighted(movie_id, data, similarity_matrix, genre_matrix, production_matrix, top_n=3, genre_weight=0.3, production_weight=0.005):
    
#     movie_index = data.index[data['id'] == movie_id].tolist()[0]
#     similarity_scores = similarity_matrix[movie_index]
#     genre_scores = genre_matrix[movie_index]
#     production_scores = production_matrix[movie_index]

#     combined_scores = (1 - genre_weight - production_weight) * similarity_scores + genre_weight * genre_scores + production_weight * production_scores

#     indices = np.argsort(combined_scores)[::-1]
#     indices = indices[indices != movie_index]
    
#     top_indices = indices[:top_n]
#     similar_movies = data.iloc[top_indices]['title']
    
#     movies = similar_movies.tolist()
    
#     return movies


In [14]:
# def preprocess_data(data):
#     required_cols = ['genres', 'keywords', 'popularity', 'release_date', 'vote_average', 'cast', 'crew']
#     for col in required_cols:
#         if col not in data.columns:
#             raise ValueError(f"Column '{col}' is missing in the DataFrame.")

#     # Preprocess descriptions
#     descriptions = [get_description(row) for _, row in data.iterrows()]
#     vectorizer = TfidfVectorizer()
#     tfidf = vectorizer.fit_transform(descriptions)

#     # Normalize numerical features
#     scaler = MinMaxScaler()
#     popularity_norm = scaler.fit_transform(data['popularity'].values.reshape(-1, 1))
    
#     release_years = []
#     for date_str in data['release_date']:
#         if pd.isnull(date_str):
#             release_years.append(datetime.now().year)
#         else:
#             if isinstance(date_str, str):
#                 if '-' in date_str:
#                     date_str = date_str.split('-')[0]
#                 release_years.append(int(date_str))
#             elif isinstance(date_str, int):
#                 release_years.append(date_str)
#             else:
#                 raise ValueError("Invalid release date format.")
                
#     release_year_norm = scaler.fit_transform(np.array(release_years).reshape(-1, 1))
#     ratings_norm = scaler.fit_transform(data['vote_average'].values.reshape(-1, 1))

#     # Combine numerical features with TF-IDF vectors
#     combined = np.hstack((tfidf.toarray(), popularity_norm, release_year_norm, ratings_norm))
#     similarity_matrix = cosine_similarity(combined)

#     # Preprocess genres
#     genres = [get_genres(row) for _, row in data.iterrows()]
#     mlb = MultiLabelBinarizer()
#     genre_encoded = mlb.fit_transform(genres)
#     genre_matrix = cosine_similarity(genre_encoded)
#     np.fill_diagonal(genre_matrix, 0)

#     # Preprocess production details
#     productions_countries = []
#     countries = []

#     for index, row in data.iterrows():
#         production_countries =json.loads(row['production_countries'])
#         country = get_countries(production_countries)
#         countries.append(country)

#     unique_countries = set()
#     for country in countries:
#         if country:
#             unique_countries.update(country)

    
#     countries_mlb = MultiLabelBinarizer(classes=sorted(unique_countries))
#     countries_enc = countries_mlb.fit_transform(countries)
#     production_matrix = cosine_similarity(countries_enc)

#     return similarity_matrix, genre_matrix, production_matrix

In [15]:
# title = 'paddington'
# url = f"https://api.themoviedb.org/3/search/movie?query={title}&include_adult=false&language=en-US&page=1"
# res = requests.get(url, headers=headers).json()
# if res['results']:
#     release_year = res['results'][0]['release_date'].split('-')[0]
#     print('The movie entered: \n')
#     print(res['results'][0]['title'],'-',release_year)
#     print('Searching our database...')
#     movie_id = res['results'][0]['id']
#     similarity_matrix, genre_matrix, production_matrix = preprocess_data(data)
#     recommended = movie_recommender_weighted(movie_id, data, 
#                                              similarity_matrix, genre_matrix, production_matrix,
#                                              top_n=3, genre_weight=0.3, production_weight=0.2)
    
#     print(f"Movies similar to '{res['results'][0]['title']}':")
#     for title in recommended:
#         print(f"- {title}")
    

In [16]:
# firsttry = movie_recommender_weighted(data.iloc[842]['id'], data, similarity_matrix, genres_matrix)
# print(f"Movies similar to '{data.iloc[842]['title']}':")
# for title in firsttry:
#     print(f"- {title}")

In [17]:
# def movie_recommender(movie_id, data, similarity_matrix, top_n=5):
    
#     movie_index = data.index[data['id'] == movie_id].tolist()[0]
#     similarity_scores = similarity_matrix[movie_index]
    
#     # Get indices of movies sorted by similarity BUT REMOVE ITSELF
#     indices = np.argsort(similarity_scores)[::-1]
#     indices = indices[indices != movie_index]
    
#     top_indices = indices[:top_n]
#     similar_movies = data.iloc[top_indices]['title']
    
#     return similar_movies.tolist()

In [None]:
# firsttry = movie_recommender(data.iloc[842]['id'], data, similarity_matrix)
# print(f"Movies similar to '{data[data['id'] == data.iloc[842]['id']]['title'].values[0]}':")
# for title in firsttry:
#     print(f"- {title}")