In [1]:
import pandas as pd
from typing import List

### Reading the Data and Initial Stats

In [2]:
movies_df: pd.DataFrame = pd.read_csv('input/all_movies.csv')
movies_df.sample(20)

Unnamed: 0,movieId,title,genres
55616,187699,Valo (2005),(no genres listed)
33739,138326,Italians (2009),Comedy
52926,181919,Cold Storage (1951),Animation|children|COMEDY
23454,110106,Naked Harbour (Vuosaari) (2012),Drama
33201,136980,Ed Gein: The Butcher of Plainfield (2007),Crime|drama|HORROR|mystery
28921,126076,How They Get There (1997),(no genres listed)
529,533,"Shadow, The (1994)",Action|adventure|FANTASY|mystery
21383,103162,"6 Month Rule (Six Month Rule, The) (2011)",Comedy
23847,111553,Interior. Leather Bar. (2013),Drama
49734,174809,The Glove (1979),Action|adventure|THRILLER


In [3]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|animation|CHILDREN|comedy|Fantasy
1,2,Jumanji (1995),Adventure|children|FANTASY
2,3,Grumpier Old Men (1995),Comedy|romance
3,4,Waiting to Exhale (1995),Comedy|drama|ROMANCE
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed)
58094,193878,Les tribulations d'une caissière (2011),Comedy
58095,193880,Her Name Was Mumu (2016),Drama
58096,193882,Flora (2017),Adventure|drama|HORROR|sci-fi


In [4]:
movies_df.shape

(58098, 3)

In [5]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  int64 
 1   title    58098 non-null  object
 2   genres   58098 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


## Step 1: Cleaning
You shouldn't necessarily assume that your data is good.  It could be very sparse and not have much there. There could be duplication, poorly recorded or empty values, or with large text there could be a lot of garbage in there if it was an open text field

In [18]:
from analysis_complete.analysis.utils.cleaning import lower_case_and_strip_spaces

movies_cleaned_df = movies_df.copy()
movies_cleaned_df['genres'] = movies_cleaned_df['genres'].apply(lower_case_and_strip_spaces)

In [12]:
def lower_case_and_strip_spaces_notebook(input: str) -> str:
    return input.lower().strip()

In [13]:
input_1 = " comedy|FANTASY|Action "
expected_1 = "comedy|fantasy|action"

input_2 = " FANTASY "
expected_2 = "fantasy"

input_3 = "comedy"
expected_3 = "comedy"

In [14]:
def test_lower_case_and_strip_spaces(input: str, expected: str):
    actual = lower_case_and_strip_spaces_notebook(input)
    assert actual == expected

In [15]:
test_lower_case_and_strip_spaces(input_1, expected_1)

In [16]:
test_lower_case_and_strip_spaces(input_2, expected_2)

In [17]:
test_lower_case_and_strip_spaces(input_3, expected_3)

In [19]:
movies_cleaned_df.sample(20)

Unnamed: 0,movieId,title,genres
39496,152085,Desierto (2016),drama
42644,159297,I'm the One You Want (2014),(no genres listed)
28097,124151,Vampire Dog (2012),children
23574,110459,Girls Against Boys (2012),crime|thriller
15667,79400,"Last Truck: Closing of a GM Plant, The (2009)",documentary
23392,109912,Blind Faith (1998),drama
30205,130016,"Lodger, The (1944)",crime|horror|mystery|thriller
9763,31682,"Nomi Song, The (2004)",documentary|musical
18019,89833,Bad Girl (1931),drama
40752,154844,Saturday October 5th (1969),(no genres listed)


In [20]:
movies_cleaned_df = movies_cleaned_df.loc[movies_cleaned_df['genres'] != '(no genres listed)']

In [21]:
movies_cleaned_df.shape

(53832, 3)

In [22]:
assert movies_cleaned_df.shape[0] < movies_df.shape[0]

In [23]:
movies_cleaned_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy
1,2,Jumanji (1995),adventure|children|fantasy
2,3,Grumpier Old Men (1995),comedy|romance
3,4,Waiting to Exhale (1995),comedy|drama|romance
4,5,Father of the Bride Part II (1995),comedy


####TODO NOTES

In [24]:
duplicated_by_title = movies_cleaned_df.duplicated(['title'])

In [25]:
duplicated_by_title_df = movies_cleaned_df.loc[duplicated_by_title]
duplicated_by_title_df

Unnamed: 0,movieId,title,genres
9142,26958,Emma (1996),romance
9157,26982,Men with Guns (1997),drama
13309,64997,War of the Worlds (2005),action|sci-fi
13395,65665,Hamlet (2000),drama
13614,67459,Chaos (2005),crime|drama|horror
...,...,...,...
56950,190881,The Boss (2016),documentary
57238,191713,Noise (2007),crime|drama|thriller
57269,191775,Berlin Calling (2008),comedy|drama
57361,192003,Journey to the Center of the Earth (2008),action|adventure|fantasy|sci-fi


Get a series with True of False that indicates if the title and genre were both duplciated.

In [26]:
duplicated_by_title_and_genre = movies_cleaned_df.duplicated(['title', 'genres'])

In [27]:
duplicated_by_title_and_genre_df = movies_cleaned_df.loc[duplicated_by_title_and_genre]
duplicated_by_title_and_genre_df

Unnamed: 0,movieId,title,genres
15902,80330,Offside (2006),comedy|drama
20835,101212,"Girl, The (2012)",drama
25046,115777,Beneath (2013),horror
27572,122940,Clear History (2013),comedy
29852,128991,Johnny Express (2014),animation|comedy|sci-fi
30226,130062,Darling (2007),drama
36172,143978,Home (2008),drama
38804,150310,Macbeth (2015),drama
44387,163246,Seven Years Bad Luck (1921),comedy
48620,172427,Little Man (2006),comedy


In [28]:
newdf=pd.concat([duplicated_by_title_and_genre_df,duplicated_by_title_df]).drop_duplicates(keep=False)
records_duplicated_by_title_only = set(newdf['title'])
records_duplicated_by_title_only

{'20,000 Leagues Under the Sea (1997)',
 'Absolution (2015)',
 'Aftermath (2012)',
 'Aladdin (1992)',
 'Blackout (2007)',
 'Cargo (2017)',
 'Casanova (2005)',
 'Chaos (2005)',
 'Classmates (2016)',
 'Clockstoppers (2002)',
 'Confessions of a Dangerous Mind (2002)',
 'Delirium (2014)',
 'Deranged (2012)',
 'Ecstasy (2011)',
 'Eden (2014)',
 'Emma (1996)',
 'Eros (2004)',
 'Forsaken (2016)',
 'Free Fall (2014)',
 'Frozen (2010)',
 'Good People (2014)',
 'Gossip (2000)',
 'Grace (2014)',
 'Hamlet (2000)',
 'Holiday (2014)',
 'Hostage (2005)',
 'Interrogation (2016)',
 'Journey to the Center of the Earth (2008)',
 'Lagaan: Once Upon a Time in India (2001)',
 'Men with Guns (1997)',
 'Noise (2007)',
 'Office (2015)',
 'Paradise (2013)',
 'Rose (2011)',
 'Saturn 3 (1980)',
 'Shelter (2015)',
 'Sing (2016)',
 'Slow Burn (2000)',
 'Stranded (2015)',
 'Tag (2015)',
 'The Boss (2016)',
 'The Break-In (2016)',
 'The Connection (2014)',
 'The Dream Team (2012)',
 'The Midnight Man (2016)',
 'The P

Now we can locate an example using the titles in our list.

In [29]:
ALADDIN = 'Aladdin (1992)'

In [30]:
def get_aladdin_example(df: pd.DataFrame) -> pd.DataFrame:
    return df.loc[df['title'] == ALADDIN]

In [31]:
movies_cleaned_df.loc[movies_cleaned_df['title'] == ALADDIN]

Unnamed: 0,movieId,title,genres
582,588,Aladdin (1992),adventure|animation|children|comedy|musical
24657,114240,Aladdin (1992),adventure|animation|children|comedy|fantasy


## Step 2: Feature Preparation

What is a feature?

A descriptive attribute that can be used in our algorithms.

Examples:
    - If we are trying to predict house prices, square footage could be a feature we use to predict the house price
    - In our case, as we try to find movies similar to our movie the "feature" we will be focusing on is the "genres" description
    - We need to "prepare" the columns data in such a way that we can compare one genre description to another and get some measure of similarity

Let's prepare our genres list. First we need to group by movie title.

In [32]:
movies_grouped_by_title_df = movies_cleaned_df.copy()
movies_grouped_by_title_df = movies_grouped_by_title_df.groupby('title').agg({'genres': lambda x: x.to_list()}).reset_index()

In [33]:
movies_grouped_by_title_df

Unnamed: 0,title,genres
0,"""Great Performances"" Cats (1998)",[musical]
1,#1 Cheerleader Camp (2010),[comedy|drama]
2,#Captured (2017),[horror]
3,#Horror (2015),[drama|horror|mystery|thriller]
4,#SCREAMERS (2016),[horror]
...,...,...
53761,ארבינקא (1967),[comedy|crime|romance]
53762,…And the Fifth Horseman Is Fear (1965),[drama|war]
53763,キサラギ (2007),[comedy|mystery]
53764,チェブラーシカ (2010),[animation|children]


In [25]:
get_aladdin_example(movies_grouped_by_title_df)

Unnamed: 0,title,genres
2023,Aladdin (1992),"[adventure|animation|children|comedy|musical, ..."


Clean up the genres lists.

In [34]:
from analysis.utils.cleaning import combine_genres_list

movies_unique_genres_df = movies_grouped_by_title_df.copy()
movies_unique_genres_df['genres'] = movies_unique_genres_df['genres'].apply(combine_genres_list)

In [37]:
movies_unique_genres_df.sample(20)

Unnamed: 0,title,genres
0,"""Great Performances"" Cats (1998)",{musical}
1,#1 Cheerleader Camp (2010),"{comedy, drama}"
2,#Captured (2017),{horror}
3,#Horror (2015),"{mystery, thriller, horror, drama}"
4,#SCREAMERS (2016),{horror}
...,...,...
53761,ארבינקא (1967),"{romance, comedy, crime}"
53762,…And the Fifth Horseman Is Fear (1965),"{war, drama}"
53763,キサラギ (2007),"{mystery, comedy}"
53764,チェブラーシカ (2010),"{animation, children}"


In [29]:
get_aladdin_example(movies_unique_genres_df)

Unnamed: 0,title,genres
2023,Aladdin (1992),"{children, musical, comedy, animation, adventu..."


Let's think about our recommendation engine now again.  Let's say that we want to recommend movies to by recommending the movies with the most similar genres list.

In order to to use TF IDF we need a list of all the "words" (genres) used in our corpus.  This is easy for us to do.  We can make a list of all the genres by:
1. Creating a column with a list of genres
2. Grouping by the genre
3. Aggregating the results
4. Transforming the resulting series into a list of genres

Collect Unique List of Genres

In [1]:
for_genres_list_df = movies_unique_genres_df.copy()
for_genres_list_df = for_genres_list_df['genres'].explode().reset_index()

NameError: name 'movies_unique_genres_df' is not defined

In [35]:
for_genres_list_df.genres.unique()

array(['musical', 'drama', 'comedy', 'horror', 'thriller', 'mystery',
       'documentary', 'crime', 'western', 'animation', 'war', 'action',
       'adventure', 'fantasy', 'romance', 'children', 'sci-fi',
       'film-noir', 'imax'], dtype=object)

In [36]:
all_genres = list(for_genres_list_df.genres.unique())

In [37]:
len(all_genres)

19

Let's turn our genres column into a space separated list of genres (as if they were words in a document)

In [38]:
movies_with_document_description_df = movies_unique_genres_df.copy()
movies_with_document_description_df['genres'] = movies_with_document_description_df['genres'].apply(lambda x: ' '.join(x))

In [39]:
movies_with_document_description_df

Unnamed: 0,title,genres
0,"""Great Performances"" Cats (1998)",musical
1,#1 Cheerleader Camp (2010),drama comedy
2,#Captured (2017),horror
3,#Horror (2015),horror thriller drama mystery
4,#SCREAMERS (2016),horror
...,...,...
53761,ארבינקא (1967),romance crime comedy
53762,…And the Fifth Horseman Is Fear (1965),war drama
53763,キサラギ (2007),mystery comedy
53764,チェブラーシカ (2010),children animation


In [40]:
get_aladdin_example(movies_with_document_description_df)

Unnamed: 0,title,genres
2023,Aladdin (1992),children musical comedy animation adventure fa...


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(vocabulary=all_genres)
tfidf_matrix = tf.fit_transform(movies_with_document_description_df['genres'])

NameError: name 'all_genres' is not defined

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [44]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.37846524, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.        , 0.37846524, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.        , 0.        ,
        1.        ]])

In [45]:
cosine_sim.shape

(53766, 53766)

In [46]:
from analysis.utils.recommendation import get_similar_movies

similar_movies = get_similar_movies('Toy Story (1995)', cosine_sim, movies_with_document_description_df, 30)
similar_movies

['Antz (1998)',
 'Asterix and the Vikings (Astérix et les Vikings) (2006)',
 'Boxtrolls, The (2014)',
 'Brother Bear 2 (2006)',
 'DuckTales: The Movie - Treasure of the Lost Lamp (1990)',
 "Emperor's New Groove, The (2000)",
 'Home (2015)',
 'Moana (2016)',
 'Monsters, Inc. (2001)',
 "Olaf's Frozen Adventure (2017)",
 'Penguin Highway (2018)',
 'Puss in Book: Trapped in an Epic Tale (2017)',
 'Scooby-Doo! Mask of the Blue Falcon (2012)',
 'Shrek the Third (2007)',
 'Space Jam (1996)',
 'Tale of Despereaux, The (2008)',
 'Tangled: Before Ever After (2017)',
 'The Croods 2 (2017)',
 'The Dragon Spell (2016)',
 'The Good Dinosaur (2015)',
 'The Magic Crystal (2011)',
 'Toy Story (1995)',
 'Toy Story 2 (1999)',
 'Toy Story Toons: Hawaiian Vacation (2011)',
 'Toy Story Toons: Small Fry (2011)',
 'Trolls Holiday (2017)',
 'Turbo (2013)',
 'Wild, The (2006)',
 'Inside Out (2015)',
 'Pokémon the Movie: I Choose You! (2017)']

THAT'S IT! :)