In [15]:
import os
import pandas as pd
import sqlite3
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
from pickle import dump
from sklearn.metrics.pairwise import cosine_similarity


In [16]:
movies = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_movies.csv')
credits = pd.read_csv('https://raw.githubusercontent.com/4GeeksAcademy/k-nearest-neighbors-project-tutorial/main/tmdb_5000_credits.csv')

In [17]:
# Ensure the directory exists
os.makedirs('./data', exist_ok=True)

conn = sqlite3.connect('./data/movies_database.db')

movies.to_sql('movies', conn, if_exists='replace', index=False)
credits.to_sql('credits', conn, if_exists='replace', index=False)

4803

In [18]:
# Query includes AS title_ to deal with the duplication of the title column

query = ("""
        SELECT 
         movies.title AS title_,
         *
        FROM movies
        JOIN credits ON movies.title=credits.title
        """)

data = pd.read_sql_query(query, conn)


In [19]:
data.columns.tolist()

['title_',
 'budget',
 'genres',
 'homepage',
 'id',
 'keywords',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'vote_average',
 'vote_count',
 'movie_id',
 'title',
 'cast',
 'crew']

In [20]:
# Reducing dataframe to selected columns

data = data[['title_','overview','genres','keywords','cast','crew']]

# Renaming title column

data.rename(columns={'title_':'title'}, inplace=True)

In [21]:
data.duplicated().any()

np.False_

In [22]:
def load_json_safe(json_str, default_value = None):
    try:
        return json.loads(json_str)
    except (TypeError, json.JSONDecodeError):
        return default_value
    
data["genres"] = data["genres"].apply(lambda x: [item["name"] for item in json.loads(x)] if pd.notna(x) else None)
data["keywords"] = data["keywords"].apply(lambda x: [item["name"] for item in json.loads(x)] if pd.notna(x) else None)
data["cast"] = data["cast"].apply(lambda x: [item["name"] for item in json.loads(x)][:3] if pd.notna(x) else None)
data["crew"] = data["crew"].apply(lambda x: " ".join([crew_member['name'] for crew_member in load_json_safe(x) if crew_member['job'] == 'Director']))

In [23]:
data["overview"] = data["overview"].apply(lambda x: [x])
data["overview"] = data["overview"].apply(lambda x: [str(x)])
data["genres"] = data["genres"].apply(lambda x: [str(genre) for genre in x])
data["keywords"] = data["keywords"].apply(lambda x: [str(keyword) for keyword in x])
data["cast"] = data["cast"].apply(lambda x: [str(actor) for actor in x])
data["crew"] = data["crew"].apply(lambda x: [str(crew_member) for crew_member in x])
data["tags"] = data["overview"] + data["genres"] + data["keywords"] + data["cast"] + data["crew"]
data["tags"] = data["tags"].apply(lambda x: ",".join(x).replace(",", " "))

data.drop(columns = ["genres", "keywords", "cast", "crew", "overview"], inplace = True)

In [24]:
data.head()

Unnamed: 0,title,tags
0,Avatar,['In the 22nd century a paraplegic Marine is ...
1,Pirates of the Caribbean: At World's End,['Captain Barbossa long believed to be dead ...
2,Spectre,['A cryptic message from Bond’s past sends him...
3,The Dark Knight Rises,"[""Following the death of District Attorney Har..."
4,John Carter,"[""John Carter is a war-weary former military ..."


In [25]:
# Vectorizing data and getting distances

vec_model = CountVectorizer(stop_words='english')

vec_data = vec_model.fit_transform(data['tags']).toarray()

similarity = cosine_similarity(vec_data)

In [26]:
# Initializing and fitting model

model = NearestNeighbors()

model.fit(similarity)

In [27]:
# Recommendation function

def recommend(movie):
    movie_index = data[data["title"] == movie].index[0]
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse = True , key = lambda x: x[1])[1:6]
    
    for i in movie_list:
        print(data.iloc[i[0]].title)

recommend("Pirates of the Caribbean: At World's End")

Pirates of the Caribbean: Dead Man's Chest
Pirates of the Caribbean: The Curse of the Black Pearl
Pirates of the Caribbean: On Stranger Tides
The Imaginarium of Doctor Parnassus
Cyrus
