In [1]:
import sys
sys.path.append("..")

import pandas as pd
from typing import List
from utils.cleaning import lower_case_and_strip_spaces
from utils.cleaning import combine_genres_list
pd.options.display.width = 0
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('max_colwidth', 1000)

### Reading the Data and Initial Stats

In [2]:
movies_df: pd.DataFrame = pd.read_csv('input/all_movies.csv')
movies_df.sample(20)

Unnamed: 0,movieId,title,genres
5593,5691,Jason Goes to Hell: The Final Friday (1993),Action|horror
48146,171455,Fabricated City (2017),Action|crime
32852,136221,Hansel & Gretel: Warriors of Witchcraft (2013),Horror|thriller
25454,116803,Battle of the Warriors (2006),Action|drama|WAR
33149,136876,The Village Had No Walls (1995),(no genres listed)
11754,51894,"Last Command, The (1928)",Drama|war
26176,118972,Don't Blink (2014),Horror|mystery|SCI-FI
23234,109416,Bring It On: Fight to the Finish (2009),Comedy
18731,92611,"Girl from Jones Beach, The (1949)",Comedy
27851,123571,Jim Jefferies: Alcoholocaust (2010),Comedy


In [3]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|animation|CHILDREN|comedy|Fantasy
1,2,Jumanji (1995),Adventure|children|FANTASY
2,3,Grumpier Old Men (1995),Comedy|romance
3,4,Waiting to Exhale (1995),Comedy|drama|ROMANCE
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed)
58094,193878,Les tribulations d'une caissière (2011),Comedy
58095,193880,Her Name Was Mumu (2016),Drama
58096,193882,Flora (2017),Adventure|drama|HORROR|sci-fi


In [4]:
movies_df.shape[0]

58098

# Step 1: Cleaning

### 1. Cleaning up strings
### 2. Removing unwanted rows
### 3. Checking for duplicates

In [5]:
movies_cleaned_df = movies_df.copy()
movies_cleaned_df['genres'] = movies_cleaned_df['genres'].apply(lower_case_and_strip_spaces)

In [6]:
def lower_case_and_strip_spaces_notebook(input: str) -> str:
    return input.lower().strip()

In [7]:
input_1 = " comedy|FANTASY|Action "
expected_1 = "comedy|fantasy|action"

input_2 = " FANTASY "
expected_2 = "fantasy"

input_3 = "comedy"
expected_3 = "comedy"

In [8]:
def test_lower_case_and_strip_spaces(input: str, expected: str):
    assert lower_case_and_strip_spaces_notebook(input) == expected

In [9]:
test_lower_case_and_strip_spaces(input_1, expected_1)

In [10]:
test_lower_case_and_strip_spaces(input_2, expected_2)

In [11]:
test_lower_case_and_strip_spaces(input_3, expected_3)

In [12]:
movies_cleaned_df.sample(10)

Unnamed: 0,movieId,title,genres
26695,120907,Love is Eternal While It Lasts (2004),comedy|romance
52128,180073,The Fury (2016),comedy|drama
57214,191663,Apostasy (2017),drama
2834,2919,"Year of Living Dangerously, The (1982)",drama|romance|war
56206,189013,...First Do No Harm (1997),drama
57551,192479,Avicii: True Stories (2017),documentary
36568,144832,The Beloved Vagabond (1936),(no genres listed)
19673,96771,"Chemical Brothers: Don't Think, The (2012)",musical
50588,176651,Time Toys (2017),children|sci-fi
43730,161818,VeggieTales: Duke and the Great Pie War (2005),animation|children


In [13]:
movies_cleaned_df = movies_cleaned_df.loc[movies_cleaned_df['genres'] != '(no genres listed)']

In [14]:
movies_cleaned_df.shape[0]

53832

In [15]:
assert movies_cleaned_df.shape[0] < movies_df.shape[0]

In [16]:
movies_cleaned_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy
1,2,Jumanji (1995),adventure|children|fantasy
2,3,Grumpier Old Men (1995),comedy|romance
3,4,Waiting to Exhale (1995),comedy|drama|romance
4,5,Father of the Bride Part II (1995),comedy


# Checking for Duplicates

### We need to define what a "duplicate" is

In [17]:
duplicated_by_title = movies_cleaned_df.duplicated(['title'])

### The movies that are duplicated by Title

In [18]:
duplicated_by_title_df = movies_cleaned_df.loc[duplicated_by_title]
duplicated_by_title_df.shape

(66, 3)

### The movies that are duplicated by Title and Genre

In [19]:
duplicated_by_title_and_genre = movies_cleaned_df.duplicated(['title', 'genres'])

In [20]:
duplicated_by_title_and_genre_df = movies_cleaned_df.loc[duplicated_by_title_and_genre]
duplicated_by_title_and_genre_df.shape

(14, 3)

### The movies that are duplicate by Title but their Genres don't match

In [21]:
newdf=pd.concat([duplicated_by_title_and_genre_df,duplicated_by_title_df]).drop_duplicates(keep=False)
records_duplicated_by_title_only = set(newdf['title'])
records_duplicated_by_title_only

{'20,000 Leagues Under the Sea (1997)',
 'Absolution (2015)',
 'Aftermath (2012)',
 'Aladdin (1992)',
 'Blackout (2007)',
 'Cargo (2017)',
 'Casanova (2005)',
 'Chaos (2005)',
 'Classmates (2016)',
 'Clockstoppers (2002)',
 'Confessions of a Dangerous Mind (2002)',
 'Delirium (2014)',
 'Deranged (2012)',
 'Ecstasy (2011)',
 'Eden (2014)',
 'Emma (1996)',
 'Eros (2004)',
 'Forsaken (2016)',
 'Free Fall (2014)',
 'Frozen (2010)',
 'Good People (2014)',
 'Gossip (2000)',
 'Grace (2014)',
 'Hamlet (2000)',
 'Holiday (2014)',
 'Hostage (2005)',
 'Interrogation (2016)',
 'Journey to the Center of the Earth (2008)',
 'Lagaan: Once Upon a Time in India (2001)',
 'Men with Guns (1997)',
 'Noise (2007)',
 'Office (2015)',
 'Paradise (2013)',
 'Rose (2011)',
 'Saturn 3 (1980)',
 'Shelter (2015)',
 'Sing (2016)',
 'Slow Burn (2000)',
 'Stranded (2015)',
 'Tag (2015)',
 'The Boss (2016)',
 'The Break-In (2016)',
 'The Connection (2014)',
 'The Dream Team (2012)',
 'The Midnight Man (2016)',
 'The P

### Now we can locate an example using the titles in our list.

In [22]:
ALADDIN = 'Aladdin (1992)'

In [23]:
def get_aladdin_example(df: pd.DataFrame) -> pd.DataFrame:
    return df.loc[df['title'] == ALADDIN]

In [24]:
movies_cleaned_df.loc[movies_cleaned_df['title'] == ALADDIN]

Unnamed: 0,movieId,title,genres
582,588,Aladdin (1992),adventure|animation|children|comedy|musical
24657,114240,Aladdin (1992),adventure|animation|children|comedy|fantasy


# Step 2: Feature Preparation

### Feature: A descriptive attribute that can be used in our algorithms.

### Let's prepare our genres list. First we need to group by movie title.

In [25]:
movies_grouped_by_title_df = movies_cleaned_df.copy()
movies_grouped_by_title_df = movies_grouped_by_title_df.groupby('title').agg({'genres': lambda x: x.to_list()}).reset_index()

In [26]:
movies_grouped_by_title_df.sample(10)

Unnamed: 0,title,genres
34565,President (2006),[thriller]
18292,"Good Student, The (Mr. Gibb) (2006)",[comedy]
36346,Ride in the Whirlwind (1965),[western]
36935,Royal Tramp (1992),[action|comedy]
11135,Dandelion (2004),[drama]
15479,Fear City: A Family-Style Comedy (La cité de la peur) (1994),[comedy]
46878,The Secret Rules of Modern Living: Algorithms (2015),[documentary]
43151,The Big Sick (2017),[comedy|romance]
30134,My Afternoons with Margueritte (La tête en friche) (2010),[comedy]
39769,"So Young, So Bad (1950)",[drama]


In [45]:
get_aladdin_example(movies_grouped_by_title_df)

### Clean up the genres list

In [28]:
movies_unique_genres_df = movies_grouped_by_title_df.copy()
movies_unique_genres_df['genres'] = movies_unique_genres_df['genres'].apply(combine_genres_list)

In [29]:
movies_unique_genres_df.sample(10)

Unnamed: 0,title,genres
22295,"Intern, The (2000)",{comedy}
15678,"Fighting Temptations, The (2003)",{drama}
35129,Puzzle (1974),"{thriller, mystery}"
53189,Yevadu (2014),"{thriller, action}"
36134,Return from the River Kwai (1989),{war}
16972,Frozen City (Valkoinen kaupunki) (2006),{drama}
16765,Freetown (2015),"{thriller, drama, action}"
16383,Forbidden (1949),"{thriller, drama}"
35414,Rage in Heaven (1941),"{thriller, drama}"
542,44 Inch Chest (2009),"{crime, drama}"


In [30]:
get_aladdin_example(movies_unique_genres_df)

Unnamed: 0,title,genres
2023,Aladdin (1992),"{animation, comedy, fantasy, musical, children, adventure}"


### Let's turn our genres column into a space separated list of genres (as if they were words in a document)

In [31]:
movies_with_document_description_df = movies_unique_genres_df.copy()
movies_with_document_description_df['genres'] = movies_with_document_description_df['genres'].apply(lambda x: ' '.join(x))

In [32]:
movies_with_document_description_df.sample(10)

Unnamed: 0,title,genres
11314,Dark Touch (2013),horror
43569,The Circle (2014),drama documentary
32245,One Dark Night (1983),horror
1418,Abeltje (1998),adventure
29015,Minor Details (2009),mystery drama children
21086,Hurricane Season (2009),drama
45101,The Innocents (1987),drama
6289,Blondie (1938),comedy
6110,Blackie the Pirate (1971),adventure comedy action
19662,Heidi (1952),drama children


# 3. Building our Engine

### Collect Unique List of Genres

In [33]:
for_genres_list_df = movies_unique_genres_df.copy()
for_genres_list_df = for_genres_list_df['genres'].explode().reset_index()

In [34]:
for_genres_list_df.genres.unique()

array(['musical', 'drama', 'comedy', 'horror', 'mystery', 'thriller',
       'documentary', 'crime', 'western', 'animation', 'war', 'action',
       'fantasy', 'adventure', 'romance', 'children', 'sci-fi',
       'film-noir', 'imax'], dtype=object)

In [35]:
all_genres = list(for_genres_list_df.genres.unique())

In [36]:
all_genres

['musical',
 'drama',
 'comedy',
 'horror',
 'mystery',
 'thriller',
 'documentary',
 'crime',
 'western',
 'animation',
 'war',
 'action',
 'fantasy',
 'adventure',
 'romance',
 'children',
 'sci-fi',
 'film-noir',
 'imax']

In [37]:
len(all_genres)

19

In [38]:
get_aladdin_example(movies_with_document_description_df)

Unnamed: 0,title,genres
2023,Aladdin (1992),animation comedy fantasy musical children adventure


In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(vocabulary=all_genres)
tfidf_matrix = tf.fit_transform(movies_with_document_description_df['genres'])

In [40]:
pd.DataFrame(tfidf_matrix.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,1.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,0.0,0.630926,0.775843,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
2,0.0,0.000000,0.000000,1.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
3,0.0,0.292446,0.000000,0.530876,0.643541,0.467441,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
4,0.0,0.000000,0.000000,1.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53761,0.0,0.000000,0.442574,0.000000,0.000000,0.000000,0.0,0.670223,0.0,0.000000,0.000000,0.0,0.0,0.0,0.595759,0.000000,0.0,0.0,0.0
53762,0.0,0.379998,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.924987,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
53763,0.0,0.000000,0.487812,0.000000,0.872949,0.000000,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
53764,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.709951,0.000000,0.0,0.0,0.0,0.000000,0.704251,0.0,0.0,0.0


In [41]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
pd.DataFrame(cosine_sim)

In [49]:
cosine_sim.shape

(53766, 53766)

In [44]:
from utils.recommendation import get_similar_movies

similar_movies = get_similar_movies('Toy Story (1995)', cosine_sim, movies_with_document_description_df, 20)
similar_movies

['Antz (1998)',
 'Asterix and the Vikings (Astérix et les Vikings) (2006)',
 'Boxtrolls, The (2014)',
 'Brother Bear 2 (2006)',
 'DuckTales: The Movie - Treasure of the Lost Lamp (1990)',
 "Emperor's New Groove, The (2000)",
 'Home (2015)',
 'Moana (2016)',
 'Monsters, Inc. (2001)',
 "Olaf's Frozen Adventure (2017)",
 'Penguin Highway (2018)',
 'Puss in Book: Trapped in an Epic Tale (2017)',
 'Scooby-Doo! Mask of the Blue Falcon (2012)',
 'Shrek the Third (2007)',
 'Space Jam (1996)',
 'Tale of Despereaux, The (2008)',
 'Tangled: Before Ever After (2017)',
 'The Croods 2 (2017)',
 'The Dragon Spell (2016)',
 'The Good Dinosaur (2015)']

### THAT'S IT! :)