In [1]:
import pandas as pd
from typing import List

### Reading the Data and Initial Stats

In [4]:
# Load the raw movies table; the path is relative to the notebook's working directory.
movies_df: pd.DataFrame = pd.read_csv('input/all_movies.csv')
# Fixed seed so the 20-row preview is the same after Restart & Run All.
movies_df.sample(20, random_state=42)

Unnamed: 0,movieId,title,genres
49683,174681,"Oh, Hello: On Broadway (2017)",Comedy
25002,115622,Disaster L.A. (2014),Action|horror|SCI-FI
45614,165915,Outlaw: Kill! (1969),Action|crime|THRILLER
19414,95769,"Gospel of Judas, The (2006)",Documentary
31456,132936,Caged Women (1982),Thriller
41874,157577,Deiva Thirumagal (2011),Children|drama
39009,150874,Splinters (2009),Documentary|horror|THRILLER
16987,85401,Super (2010),Action|comedy|DRAMA
2268,2352,"Big Chill, The (1983)",Comedy|drama
12544,58432,What Just Happened (2008),Comedy|drama


In [5]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|animation|CHILDREN|comedy|Fantasy
1,2,Jumanji (1995),Adventure|children|FANTASY
2,3,Grumpier Old Men (1995),Comedy|romance
3,4,Waiting to Exhale (1995),Comedy|drama|ROMANCE
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed)
58094,193878,Les tribulations d'une caissière (2011),Comedy
58095,193880,Her Name Was Mumu (2016),Drama
58096,193882,Flora (2017),Adventure|drama|HORROR|sci-fi


In [6]:
movies_df.shape

(58098, 3)

In [7]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  int64 
 1   title    58098 non-null  object
 2   genres   58098 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


## Step 1: Cleaning
### Don't assume that your data is good. It could be very sparse, or contain duplicates and poorly recorded or empty values. Large open-text fields can also contain a lot of garbage.

In [8]:
from analysis_complete.analysis.utils.cleaning import lower_case_and_strip_spaces

# Build the cleaned frame without modifying movies_df: assign() returns a new
# DataFrame whose 'genres' column holds the helper's result for each value.
movies_cleaned_df = movies_df.assign(
    genres=movies_df['genres'].apply(lower_case_and_strip_spaces)
)

In [9]:
def lower_case_and_strip_spaces_notebook(input: str) -> str:
    """Return ``input`` lower-cased, with leading and trailing whitespace removed.

    Args:
        input: The text to normalise.

    Returns:
        The lower-cased text without surrounding whitespace; whitespace inside
        the text is left as it is.
    """
    lowered = input.lower()
    return lowered.strip()

In [10]:
# Fixture pairs for the cleaning helper: (raw input, expected cleaned value).
# Mixed case with surrounding spaces.
input_1, expected_1 = " comedy|FANTASY|Action ", "comedy|fantasy|action"

# A single upper-case genre padded with spaces.
input_2, expected_2 = " FANTASY ", "fantasy"

# Already lower-case with nothing to strip.
input_3, expected_3 = "comedy", "comedy"

In [11]:
def test_lower_case_and_strip_spaces(input: str, expected: str):
    """Check that ``lower_case_and_strip_spaces_notebook(input)`` equals ``expected``.

    Args:
        input: Raw text passed to the cleaning function.
        expected: The value the cleaning function should return for ``input``.

    Raises:
        AssertionError: If the returned value differs from ``expected``. The
            message shows the input, the returned value and the expected value.
    """
    actual = lower_case_and_strip_spaces_notebook(input)
    assert actual == expected, (
        f"lower_case_and_strip_spaces_notebook({input!r}) returned {actual!r}, "
        f"expected {expected!r}"
    )

In [12]:
test_lower_case_and_strip_spaces(input_1, expected_1)

In [13]:
test_lower_case_and_strip_spaces(input_2, expected_2)

In [14]:
test_lower_case_and_strip_spaces(input_3, expected_3)

In [16]:
movies_cleaned_df.sample(20)

Unnamed: 0,movieId,title,genres
38295,149094,The Testaments (2000),drama
12838,60356,Two Women (2000),drama
56146,188855,Orchestra Class (2017),comedy|drama
32714,135844,Blank: A Vinylmation Love Story (2014),animation
40416,154094,984: Prisoner of the Future (1982),drama|sci-fi
39766,152701,A Pistol for Django (1971),western
54908,186199,Ice Palace (1987),drama|mystery
17944,89535,"Havre, Le (2011)",comedy|drama
29825,128934,Tomcat Angels (1991),action|comedy
50771,177053,Alive and Kicking (2017),documentary


In [17]:
# Keep only the rows whose genres value is something other than the
# '(no genres listed)' placeholder.
movies_cleaned_df = movies_cleaned_df[movies_cleaned_df['genres'].ne('(no genres listed)')]

In [18]:
movies_cleaned_df.shape

(53832, 3)

In [21]:
assert movies_cleaned_df.shape[0] < movies_df.shape[0]

In [22]:
movies_cleaned_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy
1,2,Jumanji (1995),adventure|children|fantasy
2,3,Grumpier Old Men (1995),comedy|romance
3,4,Waiting to Exhale (1995),comedy|drama|romance
4,5,Father of the Bride Part II (1995),comedy


## Checking for Duplicates
### We need to define what a "duplicate" is

In [23]:
# Boolean mask over the rows: True where the title already occurred in an
# earlier row (pandas' default keep='first' leaves the first occurrence False).
duplicated_by_title = movies_cleaned_df.duplicated(subset=['title'])

In [24]:
duplicated_by_title_df = movies_cleaned_df.loc[duplicated_by_title]
duplicated_by_title_df

Unnamed: 0,movieId,title,genres
9142,26958,Emma (1996),romance
9157,26982,Men with Guns (1997),drama
13309,64997,War of the Worlds (2005),action|sci-fi
13395,65665,Hamlet (2000),drama
13614,67459,Chaos (2005),crime|drama|horror
...,...,...,...
56950,190881,The Boss (2016),documentary
57238,191713,Noise (2007),crime|drama|thriller
57269,191775,Berlin Calling (2008),comedy|drama
57361,192003,Journey to the Center of the Earth (2008),action|adventure|fantasy|sci-fi


In [25]:
# Boolean mask over the rows: True where the same (title, genres) pair already
# occurred in an earlier row (the first occurrence of each pair stays False).
duplicated_by_title_and_genre = movies_cleaned_df.duplicated(subset=['title', 'genres'])

In [26]:
duplicated_by_title_and_genre_df = movies_cleaned_df.loc[duplicated_by_title_and_genre]
duplicated_by_title_and_genre_df

Unnamed: 0,movieId,title,genres
15902,80330,Offside (2006),comedy|drama
20835,101212,"Girl, The (2012)",drama
25046,115777,Beneath (2013),horror
27572,122940,Clear History (2013),comedy
29852,128991,Johnny Express (2014),animation|comedy|sci-fi
30226,130062,Darling (2007),drama
36172,143978,Home (2008),drama
38804,150310,Macbeth (2015),drama
44387,163246,Seven Years Bad Luck (1921),comedy
48620,172427,Little Man (2006),comedy


In [27]:
# Stack both duplicate frames, then discard every row that occurs more than
# once (keep=False). Only rows that occur exactly once across the two frames
# survive; their titles are collected into a set.
newdf = (
    pd.concat([duplicated_by_title_and_genre_df, duplicated_by_title_df])
    .drop_duplicates(keep=False)
)
records_duplicated_by_title_only = set(newdf['title'])
records_duplicated_by_title_only

{'20,000 Leagues Under the Sea (1997)',
 'Absolution (2015)',
 'Aftermath (2012)',
 'Aladdin (1992)',
 'Blackout (2007)',
 'Cargo (2017)',
 'Casanova (2005)',
 'Chaos (2005)',
 'Classmates (2016)',
 'Clockstoppers (2002)',
 'Confessions of a Dangerous Mind (2002)',
 'Delirium (2014)',
 'Deranged (2012)',
 'Ecstasy (2011)',
 'Eden (2014)',
 'Emma (1996)',
 'Eros (2004)',
 'Forsaken (2016)',
 'Free Fall (2014)',
 'Frozen (2010)',
 'Good People (2014)',
 'Gossip (2000)',
 'Grace (2014)',
 'Hamlet (2000)',
 'Holiday (2014)',
 'Hostage (2005)',
 'Interrogation (2016)',
 'Journey to the Center of the Earth (2008)',
 'Lagaan: Once Upon a Time in India (2001)',
 'Men with Guns (1997)',
 'Noise (2007)',
 'Office (2015)',
 'Paradise (2013)',
 'Rose (2011)',
 'Saturn 3 (1980)',
 'Shelter (2015)',
 'Sing (2016)',
 'Slow Burn (2000)',
 'Stranded (2015)',
 'Tag (2015)',
 'The Boss (2016)',
 'The Break-In (2016)',
 'The Connection (2014)',
 'The Dream Team (2012)',
 'The Midnight Man (2016)',
 'The P

### Now we can locate an example using the titles in our list.

In [28]:
ALADDIN = 'Aladdin (1992)'

In [29]:
def get_aladdin_example(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``title`` column equals ``ALADDIN``.

    Args:
        df: A frame with a ``title`` column.

    Returns:
        The matching rows of ``df``; empty when no title matches.
    """
    is_aladdin = df['title'] == ALADDIN
    return df.loc[is_aladdin]

In [30]:
# Reuse the get_aladdin_example helper instead of repeating its filter inline.
get_aladdin_example(movies_cleaned_df)

Unnamed: 0,movieId,title,genres
582,588,Aladdin (1992),adventure|animation|children|comedy|musical
24657,114240,Aladdin (1992),adventure|animation|children|comedy|fantasy


## Step 2: Feature Preparation

What is a feature?

A descriptive attribute that can be used in our algorithms.

Examples:
    - If we are trying to predict house prices, square footage could be a feature we use to predict the house price
    - In our case, as we try to find movies similar to our movie the "feature" we will be focusing on is the "genres" description
    - We need to "prepare" the column's data in such a way that we can compare one genre description to another and get some measure of similarity

### Let's prepare our genres list. First we need to group by movie title.

In [31]:
# One row per title: the genres strings of all rows sharing a title are
# gathered into a list, and 'title' is restored as an ordinary column.
movies_grouped_by_title_df = (
    movies_cleaned_df
    .groupby('title')
    .agg({'genres': list})
    .reset_index()
)

In [None]:
movies_grouped_by_title_df

In [None]:
get_aladdin_example(movies_grouped_by_title_df)

Clean up the genres lists.

In [None]:
# NOTE(review): the earlier cleaning cell imports from
# `analysis_complete.analysis.utils.cleaning`, this one from
# `analysis.utils.cleaning` — confirm both package paths resolve.
from analysis.utils.cleaning import combine_genres_list

# Replace each title's list of genres strings with the helper's combined result,
# leaving movies_grouped_by_title_df itself unchanged.
movies_unique_genres_df = movies_grouped_by_title_df.assign(
    genres=movies_grouped_by_title_df['genres'].apply(combine_genres_list)
)

In [None]:
movies_unique_genres_df.sample(20)

In [None]:
get_aladdin_example(movies_unique_genres_df)

### Let's think about our recommendation engine again. Let's say that we want to recommend movies by finding the ones with the most similar genres list.

### Collect Unique List of Genres

In [None]:
# explode() gives every element of a list-like genres entry its own row;
# reset_index() then moves the original row index into a column.
for_genres_list_df = movies_unique_genres_df['genres'].explode().reset_index()

In [None]:
for_genres_list_df.genres.unique()

In [None]:
# Distinct genre values, in order of first appearance, as a plain Python list.
all_genres = for_genres_list_df['genres'].unique().tolist()

In [None]:
all_genres

In [None]:
len(all_genres)

### Let's turn our genres column into a space separated list of genres (as if they were words in a document)

In [None]:
# Turn each genres list into one space-separated string, so that every movie
# is described by a small "document" of genre words.
movies_with_document_description_df = movies_unique_genres_df.assign(
    genres=movies_unique_genres_df['genres'].apply(' '.join)
)

In [None]:
movies_with_document_description_df

In [None]:
get_aladdin_example(movies_with_document_description_df)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Each document is a space-joined genres list, so a token must be a whole
# whitespace-delimited word. The default token_pattern (r"(?u)\b\w\w+\b") splits
# hyphenated genres such as "sci-fi" into "sci" and "fi". Neither matches the
# fixed vocabulary, so those genres would contribute nothing to the matrix.
tf = TfidfVectorizer(vocabulary=all_genres, token_pattern=r'(?u)\S+')
tfidf_matrix = tf.fit_transform(movies_with_document_description_df['genres'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument, cosine_similarity compares every row of the matrix
# with every row of that same matrix.
# NOTE(review): the result has one row and one column per movie title, which
# may need a lot of memory for a table of this size — confirm it fits.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
cosine_sim

In [None]:
cosine_sim.shape

In [None]:
from analysis.utils.recommendation import get_similar_movies

# Query the helper for 'Toy Story (1995)', passing the similarity matrix, the
# frame of titles and genre documents, and the number 30.
# NOTE(review): 30 is presumably the number of results to return — confirm
# against get_similar_movies.
similar_movies = get_similar_movies(
    'Toy Story (1995)',
    cosine_sim,
    movies_with_document_description_df,
    30,
)
similar_movies

### THAT'S IT! :)