In [1]:
import sys
sys.path.append("..")

import pandas as pd
from typing import List
from pydata_engine_utils.cleaning import lower_case_and_strip_spaces, combine_genres_list
pd.options.display.width = 0
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 1000)

In [2]:
movies_df: pd.DataFrame = pd.read_csv('input/all_movies.csv')
movies_df.sample(20)

Unnamed: 0,movieId,title,genres
55780,188045,В. Давыдов и Голиаф (1985),Comedy
58056,193799,Lupin the Third: Daisuke Jigen's Gravestone (2014),Action|adventure|ANIMATION|comedy
35979,143511,Human (2015),Documentary
3934,4027,"O Brother, Where Art Thou? (2000)",Adventure|comedy|CRIME
29992,129411,The Confessions of Bernhard Goetz (1987),Crime|documentary
29325,127331,The Well (1951),Drama|film-noir|THRILLER
9052,26769,Crossing the Bridge (1992),Comedy|drama
14556,72621,"Man with No Shadow, The (Homme sans ombre, L') (2004)",Animation|drama
3074,3160,Magnolia (1999),Drama
28896,126026,The Flower (1971),(no genres listed)


# Cleaning

In [13]:
movies_cleaned_df = movies_df.copy()
movies_cleaned_df['genres'] = movies_cleaned_df['genres'].apply(lower_case_and_strip_spaces)

In [9]:
def lower_case_and_strip_spaces_notebook(input: str) -> str:
    return input.lower().strip()

In [4]:
input_1 = " comedy|FANTASY|Action "
expected_1 = "comedy|fantasy|action"

input_2 = " FANTASY "
expected_2 = "fantasy"

input_3 = "comedy"
expected_3 = "comedy"

In [5]:
def test_lower_case_and_strip_spaces(input: str, expected: str):
    assert lower_case_and_strip_spaces_notebook(input) == expected

In [10]:
test_lower_case_and_strip_spaces(input_1, expected_1)

In [11]:
test_lower_case_and_strip_spaces(input_2, expected_2)

In [12]:
test_lower_case_and_strip_spaces(input_3, expected_3)

In [15]:
movies_cleaned_df.sample(5)

Unnamed: 0,movieId,title,genres
14504,72334,Our Daily Bread (2005),documentary
52710,181473,Rendel (2017),action|crime|fantasy
30304,130269,The Hollow (2004),horror
44497,163496,The Tunnel (2016),drama
54389,185067,Special Delivery (2008),comedy


In [16]:
movies_cleaned_df = movies_cleaned_df.loc[movies_cleaned_df['genres'] != '(no genres listed)']

In [17]:
duplicated_by_title = movies_cleaned_df.duplicated(['title'])
duplicated_by_title_df = movies_cleaned_df.loc[duplicated_by_title]
duplicated_by_title_and_genre = movies_cleaned_df.duplicated(['title', 'genres'])
duplicated_by_title_and_genre_df = movies_cleaned_df.loc[duplicated_by_title_and_genre]
newdf=pd.concat([duplicated_by_title_and_genre_df,duplicated_by_title_df]).drop_duplicates(keep=False)
records_duplicated_by_title_only = set(newdf['title'])
records_duplicated_by_title_only

{'20,000 Leagues Under the Sea (1997)',
 'Absolution (2015)',
 'Aftermath (2012)',
 'Aladdin (1992)',
 'Blackout (2007)',
 'Cargo (2017)',
 'Casanova (2005)',
 'Chaos (2005)',
 'Classmates (2016)',
 'Clockstoppers (2002)',
 'Confessions of a Dangerous Mind (2002)',
 'Delirium (2014)',
 'Deranged (2012)',
 'Ecstasy (2011)',
 'Eden (2014)',
 'Emma (1996)',
 'Eros (2004)',
 'Forsaken (2016)',
 'Free Fall (2014)',
 'Frozen (2010)',
 'Good People (2014)',
 'Gossip (2000)',
 'Grace (2014)',
 'Hamlet (2000)',
 'Holiday (2014)',
 'Hostage (2005)',
 'Interrogation (2016)',
 'Journey to the Center of the Earth (2008)',
 'Lagaan: Once Upon a Time in India (2001)',
 'Men with Guns (1997)',
 'Noise (2007)',
 'Office (2015)',
 'Paradise (2013)',
 'Rose (2011)',
 'Saturn 3 (1980)',
 'Shelter (2015)',
 'Sing (2016)',
 'Slow Burn (2000)',
 'Stranded (2015)',
 'Tag (2015)',
 'The Boss (2016)',
 'The Break-In (2016)',
 'The Connection (2014)',
 'The Dream Team (2012)',
 'The Midnight Man (2016)',
 'The P

In [18]:
ALADDIN = 'Aladdin (1992)'

In [19]:
def get_aladdin_example(df: pd.DataFrame) -> pd.DataFrame:
    return df.loc[df['title'] == ALADDIN]

In [20]:
movies_cleaned_df.loc[movies_cleaned_df['title'] == ALADDIN]

Unnamed: 0,movieId,title,genres
582,588,Aladdin (1992),adventure|animation|children|comedy|musical
24657,114240,Aladdin (1992),adventure|animation|children|comedy|fantasy


In [21]:
movies_grouped_by_title_df = movies_cleaned_df.copy()
movies_grouped_by_title_df = movies_grouped_by_title_df.groupby('title').agg({'genres': lambda x: x.to_list()}).reset_index()

In [22]:
get_aladdin_example(movies_grouped_by_title_df)

Unnamed: 0,title,genres
2023,Aladdin (1992),"[adventure|animation|children|comedy|musical, adventure|animation|children|comedy|fantasy]"


In [23]:
movies_unique_genres_df = movies_grouped_by_title_df.copy()
movies_unique_genres_df['genres'] = movies_unique_genres_df['genres'].apply(combine_genres_list)

In [24]:
get_aladdin_example(movies_unique_genres_df)

Unnamed: 0,title,genres
2023,Aladdin (1992),"{children, comedy, adventure, musical, animation, fantasy}"


In [25]:
movies_with_document_description_df = movies_unique_genres_df.copy()
movies_with_document_description_df['genres'] = movies_with_document_description_df['genres'].apply(lambda x: ' '.join(x))

In [26]:
get_aladdin_example(movies_with_document_description_df)

Unnamed: 0,title,genres
2023,Aladdin (1992),children comedy adventure musical animation fantasy


In [27]:
for_genres_list_df = movies_unique_genres_df.copy()
for_genres_list_df = for_genres_list_df['genres'].explode().reset_index()
for_genres_list_df.genres.unique()
all_genres = list(for_genres_list_df.genres.unique())

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(vocabulary=all_genres)
tfidf_matrix = tf.fit_transform(movies_with_document_description_df['genres'])

In [29]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [30]:
from pydata_engine_utils.recommendation import get_similar_movies

similar_movies = get_similar_movies('Toy Story (1995)', cosine_sim, movies_with_document_description_df, 20)
similar_movies

['Antz (1998)',
 'Asterix and the Vikings (Astérix et les Vikings) (2006)',
 'Boxtrolls, The (2014)',
 'Brother Bear 2 (2006)',
 'DuckTales: The Movie - Treasure of the Lost Lamp (1990)',
 "Emperor's New Groove, The (2000)",
 'Home (2015)',
 'Moana (2016)',
 'Monsters, Inc. (2001)',
 "Olaf's Frozen Adventure (2017)",
 'Penguin Highway (2018)',
 'Puss in Book: Trapped in an Epic Tale (2017)',
 'Scooby-Doo! Mask of the Blue Falcon (2012)',
 'Shrek the Third (2007)',
 'Space Jam (1996)',
 'Tale of Despereaux, The (2008)',
 'Tangled: Before Ever After (2017)',
 'The Croods 2 (2017)',
 'The Dragon Spell (2016)',
 'The Good Dinosaur (2015)']