In [1]:
import sys
sys.path.append("..")

import pandas as pd
from typing import List
from pydata_engine_utils.cleaning import lower_case_and_strip_spaces, combine_genres_list
# Display configuration: cap the number of printed rows, but let wide frames
# and long cell values render in full.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)
# Single source of truth for the display width. The former
# `pd.options.display.width = 0` assignment was overridden by this call
# straight away, so it has been dropped.
pd.set_option('display.width', 1000)
# Fully-qualified option key instead of the pattern-matched short form.
pd.set_option('display.max_colwidth', 1000)

In [2]:
# Load the raw movie catalogue (path is relative to the notebook's working
# directory).
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like an
# index written out by an earlier `to_csv`; consider `index_col=0` after
# confirming nothing downstream relies on that column.
movies_df: pd.DataFrame = pd.read_csv('input/all_movies.csv')
# Seeded so the preview shows the same rows on every Restart & Run All.
movies_df.sample(20, random_state=42)

Unnamed: 0.1,Unnamed: 0,movieId,title,genres
481,481,722,"Haunted World of Edward D. Wood Jr., The (1996)",Documentary
14102,14102,99691,Odette Toulemonde (2006),Comedy|drama
35075,35075,178595,J'attends quelqu'un (2007),Comedy|drama
19525,19525,124681,Raffles (1939),Adventure|comedy|CRIME|drama|Romance|THRILLER
36635,36635,183335,The Mysterious Castle in the Carpathians (1981),Adventure|comedy|FANTASY
39254,39254,192273,Grace Jones: Bloodlight and Bami (2017),Documentary
23007,23007,137690,A Little Pond (2010),Action|drama|THRILLER
26422,26422,149576,Mistletoe Over Manhattan (2011),(no genres listed)
1364,1364,2105,Tron (1982),Action|adventure|SCI-FI
12645,12645,91163,Moonlight and Cactus (1944),Comedy|musical|WESTERN


In [3]:
# (rows, columns) of the raw frame, as a baseline before any cleaning.
movies_df.shape

(39677, 4)

# Cleaning

In [4]:
# Build the cleaned frame from the raw one without mutating `movies_df`:
# `assign` returns a new DataFrame whose `genres` column has been passed
# through the imported `lower_case_and_strip_spaces` helper, value by value.
movies_cleaned_df = movies_df.assign(
    genres=movies_df['genres'].apply(lower_case_and_strip_spaces)
)

In [5]:
def lower_case_and_strip_spaces_notebook(input: str) -> str:
    """Return ``input`` lower-cased, with leading and trailing whitespace removed.

    Args:
        input: Text to normalise, e.g. a pipe-separated genre string.

    Returns:
        The trimmed, lower-cased text.
    """
    trimmed = input.strip()
    return trimmed.lower()

In [6]:
# Example inputs, each paired with the value it should normalise to.

# Case 1: several pipe-separated genres in mixed case, padded with spaces.
input_1, expected_1 = " comedy|FANTASY|Action ", "comedy|fantasy|action"

# Case 2: a single upper-case genre, padded with spaces.
input_2, expected_2 = " FANTASY ", "fantasy"

# Case 3: a value that is already normalised and must come back unchanged.
input_3, expected_3 = "comedy", "comedy"

In [7]:
def test_lower_case_and_strip_spaces(input: str, expected: str):
    """Check that the notebook-local normaliser maps ``input`` to ``expected``.

    Args:
        input: Raw text handed to ``lower_case_and_strip_spaces_notebook``.
        expected: The normalised text that call must return.

    Raises:
        AssertionError: If the normalised value differs from ``expected``.
    """
    # NOTE(review): this exercises the notebook-local copy, not the imported
    # `lower_case_and_strip_spaces` that is applied to the data frame --
    # confirm the two are meant to behave identically.
    actual = lower_case_and_strip_spaces_notebook(input)
    assert actual == expected

In [8]:
# Case 1: mixed-case, pipe-separated genres with surrounding spaces.
test_lower_case_and_strip_spaces(input_1, expected_1)

In [9]:
# Case 2: single upper-case genre with surrounding spaces.
test_lower_case_and_strip_spaces(input_2, expected_2)

In [10]:
# Case 3: already-normalised input must be returned unchanged.
test_lower_case_and_strip_spaces(input_3, expected_3)

In [11]:
# Spot-check the normalised `genres` column; seeded so the same rows are shown
# on every Restart & Run All.
movies_cleaned_df.sample(5, random_state=42)

Unnamed: 0.1,Unnamed: 0,movieId,title,genres
19377,19377,124207,Dancing Co-Ed (1939),comedy|romance
4405,4405,6841,Article 99 (1992),comedy|drama
22729,22729,136742,Un'estate al mare (2008),(no genres listed)
12271,12271,88788,Sins of My Father (2009),documentary
20902,20902,130454,Michael Laudrup - en Fodboldspiller,(no genres listed)


In [12]:
# Discard rows whose `genres` value is the "(no genres listed)" placeholder.
movies_cleaned_df = movies_cleaned_df.loc[
    lambda frame: frame['genres'].ne('(no genres listed)')
]

In [13]:
# Rows whose title repeats an earlier row (the first occurrence is not flagged).
duplicated_by_title = movies_cleaned_df.duplicated(subset=['title'])
duplicated_by_title_df = movies_cleaned_df[duplicated_by_title]

# Rows whose (title, genres) pair repeats an earlier row.
duplicated_by_title_and_genre = movies_cleaned_df.duplicated(subset=['title', 'genres'])
duplicated_by_title_and_genre_df = movies_cleaned_df[duplicated_by_title_and_genre]

# Stack both selections and drop every row that then occurs more than once.
# Any row flagged on (title, genres) is also flagged on title alone, so what
# survives are the rows flagged on title only (assuming no two source rows are
# identical in every column -- TODO confirm).
newdf = pd.concat(
    [duplicated_by_title_and_genre_df, duplicated_by_title_df]
).drop_duplicates(keep=False)

# Distinct titles that appear more than once with differing genre strings.
records_duplicated_by_title_only = set(newdf['title'].tolist())
records_duplicated_by_title_only

{'20,000 Leagues Under the Sea (1997)',
 'Absolution (2015)',
 'Aftermath (2012)',
 'Aladdin (1992)',
 'Casanova (2005)',
 'Chaos (2005)',
 'Classmates (2016)',
 'Clockstoppers (2002)',
 'Confessions of a Dangerous Mind (2002)',
 'Ecstasy (2011)',
 'Frozen (2010)',
 'Gossip (2000)',
 'Hostage (2005)',
 'Interrogation (2016)',
 'Journey to the Center of the Earth (2008)',
 'Lagaan: Once Upon a Time in India (2001)',
 'Noise (2007)',
 'Rose (2011)',
 'Saturn 3 (1980)',
 'Slow Burn (2000)',
 'The Connection (2014)',
 'The Dream Team (2012)',
 'The Promise (2016)',
 'War of the Worlds (2005)',
 'Weekend (2011)'}

In [14]:
# Title used as the running example: it appears in the duplicated-by-title set
# displayed above.
ALADDIN = "Aladdin (1992)"

In [15]:
def get_aladdin_example(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``title`` equals the ``ALADDIN`` constant.

    Args:
        df: Any frame with a ``title`` column.

    Returns:
        The matching rows, with their original index labels.
    """
    is_aladdin = df['title'].eq(ALADDIN)
    return df[is_aladdin]

In [16]:
# Show the Aladdin rows in the cleaned frame via the shared helper instead of
# repeating its filter inline, so every Aladdin preview uses the same lookup.
get_aladdin_example(movies_cleaned_df)

Unnamed: 0.1,Unnamed: 0,movieId,title,genres
404,404,588,Aladdin (1992),adventure|animation|children|comedy|musical
16997,16997,114240,Aladdin (1992),adventure|animation|children|comedy|fantasy


In [17]:
# Collapse repeated titles into one row each, gathering every genre string
# recorded for a title into a list. Only `title` and `genres` survive.
movies_grouped_by_title_df = (
    movies_cleaned_df
    .groupby('title')['genres']
    .agg(list)
    .reset_index()
)

In [18]:
# After grouping, the Aladdin entries are merged into one row holding a list of genre strings.
get_aladdin_example(movies_grouped_by_title_df)

Unnamed: 0,title,genres
1322,Aladdin (1992),"[adventure|animation|children|comedy|musical, adventure|animation|children|comedy|fantasy]"


In [19]:
# Replace each title's list of genre strings with the result of the imported
# `combine_genres_list` helper, without mutating the grouped frame. The Aladdin
# preview that follows renders the result as a set of individual genre names.
movies_unique_genres_df = movies_grouped_by_title_df.assign(
    genres=movies_grouped_by_title_df['genres'].apply(combine_genres_list)
)

In [20]:
# Check the combined genres for the Aladdin example.
get_aladdin_example(movies_unique_genres_df)

Unnamed: 0,title,genres
1322,Aladdin (1992),"{musical, children, adventure, comedy, animation, fantasy}"


In [21]:
# Turn each title's collection of genres into one space-separated string,
# i.e. a small text "document" per movie for the vectoriser to consume.
movies_with_document_description_df = movies_unique_genres_df.assign(
    genres=movies_unique_genres_df['genres'].apply(' '.join)
)

In [22]:
# The Aladdin genres should now be a single space-separated string.
get_aladdin_example(movies_with_document_description_df)

Unnamed: 0,title,genres
1322,Aladdin (1992),musical children adventure comedy animation fantasy


In [23]:
# One row per (title index, genre) pair. `explode` returns a new object, so no
# defensive copy of the source frame is needed.
for_genres_list_df = movies_unique_genres_df['genres'].explode().reset_index()

# Vocabulary of distinct genre names. The per-title genres are displayed as
# sets, whose iteration order is not guaranteed, so sort for a stable
# vocabulary (and therefore stable TF-IDF column order) across kernel
# restarts. `dropna` guards against an empty collection exploding to NaN,
# which cannot be ordered against strings.
all_genres = sorted(for_genres_list_df['genres'].dropna().unique())

In [24]:
# Number of distinct titles (rows) that will be vectorised.
movies_with_document_description_df.shape

(35383, 2)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Each document is a space-separated list of genre names and the vocabulary is
# fixed to the known genres. The default token_pattern (r"(?u)\b\w\w+\b")
# splits hyphenated genres such as "sci-fi" into "sci" and "fi"; neither is in
# the vocabulary, so those genres would silently get an all-zero column.
# Tokenising on whitespace keeps every genre name intact.
tf = TfidfVectorizer(vocabulary=all_genres, token_pattern=r'(?u)\S+')
tfidf_matrix = tf.fit_transform(movies_with_document_description_df['genres'])

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity between every pair of movie documents. With a single
# argument, scikit-learn compares the rows of the matrix against themselves.
# NOTE(review): the result is a dense n x n array; at the ~35k rows reported
# above that is roughly 10 GB of float64 -- confirm this fits in memory.
cosine_sim = cosine_similarity(tfidf_matrix)

In [27]:
from pydata_engine_utils.recommendation import get_similar_movies

# Ask the project helper for recommendations for one seed title. The arguments
# are the seed title, the similarity matrix, the frame of titles and a count.
# NOTE(review): presumably the last argument is the number of titles to return
# (the displayed result has 20 entries) -- confirm against the helper.
similar_movies = get_similar_movies(
    'Toy Story (1995)',
    cosine_sim,
    movies_with_document_description_df,
    20,
)
similar_movies

['Antz (1998)',
 'Asterix and the Vikings (Astérix et les Vikings) (2006)',
 'Boxtrolls, The (2014)',
 'Brother Bear 2 (2006)',
 'DuckTales: The Movie - Treasure of the Lost Lamp (1990)',
 "Emperor's New Groove, The (2000)",
 'Home (2015)',
 'Moana (2016)',
 'Monsters, Inc. (2001)',
 "Olaf's Frozen Adventure (2017)",
 'Penguin Highway (2018)',
 'Puss in Book: Trapped in an Epic Tale (2017)',
 'Scooby-Doo! Mask of the Blue Falcon (2012)',
 'Shrek the Third (2007)',
 'Space Jam (1996)',
 'Tale of Despereaux, The (2008)',
 'Tangled: Before Ever After (2017)',
 'The Croods 2 (2017)',
 'The Dragon Spell (2016)',
 'The Good Dinosaur (2015)']