In [None]:
import pandas as pd
from typing import List

### Reading the Data and Initial Stats

In [None]:
# Load the raw movies table; the path is relative to the notebook's working directory.
movies_df: pd.DataFrame = pd.read_csv('input/all_movies.csv')
# Preview up to 20 random rows. The fixed seed keeps the preview identical across
# kernel restarts, and the cap avoids a ValueError on frames with fewer than 20 rows.
movies_df.sample(min(20, len(movies_df)), random_state=42)

In [None]:
# Render the raw movies frame as the cell output.
movies_df

In [None]:
# (number of rows, number of columns) of the raw frame.
movies_df.shape

In [None]:
# Column names, dtypes and non-null counts: a quick way to spot missing values.
movies_df.info()

## Step 1: Cleaning
### Don't assume that your data is good. It could be very sparse, with little usable content. There could be duplicates and poorly recorded or empty values, and an open free-text field could contain a lot of garbage.

In [None]:
from analysis_complete.analysis.utils.cleaning import lower_case_and_strip_spaces

# NOTE(review): the other project imports in this notebook use the
# `analysis.utils...` prefix; confirm that `analysis_complete.analysis...`
# resolves from the notebook's working directory.

# Build the cleaned frame as a new object so movies_df keeps the raw values.
# The helper is applied to every `genres` value (presumably lower-casing and
# trimming it, as its name suggests; verify against the project module).
movies_cleaned_df = movies_df.assign(
    genres=movies_df['genres'].apply(lower_case_and_strip_spaces)
)

In [None]:
def lower_case_and_strip_spaces_notebook(input: str) -> str:
    """Return `input` lower-cased, with leading and trailing whitespace removed.

    Args:
        input: Text to normalise (the name shadows the builtin only inside this function).

    Returns:
        The normalised string; inner characters such as '|' are left untouched.
    """
    lowered = input.lower()
    return lowered.strip()

In [None]:
# (raw input, expected cleaned value) pairs for the cleaning helper.

# Mixed case, surrounding spaces, several '|'-separated genres.
input_1, expected_1 = " comedy|FANTASY|Action ", "comedy|fantasy|action"

# A single upper-case genre padded with spaces.
input_2, expected_2 = " FANTASY ", "fantasy"

# Already clean: must come back unchanged.
input_3, expected_3 = "comedy", "comedy"

In [None]:
def test_lower_case_and_strip_spaces(input: str, expected: str) -> None:
    """Check that lower_case_and_strip_spaces_notebook maps `input` to `expected`.

    Args:
        input: Raw string handed to the cleaning function.
        expected: Value the cleaning function must return.

    Raises:
        AssertionError: If the cleaned value differs from `expected`; the
            message shows both values so a failing cell is self-explanatory.
    """
    actual = lower_case_and_strip_spaces_notebook(input)
    assert actual == expected, f"expected {expected!r}, got {actual!r}"

In [None]:
# Case 1: mixed case, surrounding spaces, several '|'-separated genres.
test_lower_case_and_strip_spaces(input_1, expected_1)

In [None]:
# Case 2: a single upper-case genre padded with spaces.
test_lower_case_and_strip_spaces(input_2, expected_2)

In [None]:
# Case 3: already-clean input must come back unchanged.
test_lower_case_and_strip_spaces(input_3, expected_3)

In [None]:
# Preview up to 20 random cleaned rows. The fixed seed makes the preview
# reproducible, and the cap avoids a ValueError on frames with fewer than 20 rows.
movies_cleaned_df.sample(min(20, len(movies_cleaned_df)), random_state=42)

In [None]:
# Drop the rows whose genres value is the literal placeholder string; every
# other row is kept. Re-running the cell is harmless because the filter is idempotent.
has_listed_genres = movies_cleaned_df['genres'] != '(no genres listed)'
movies_cleaned_df = movies_cleaned_df.loc[has_listed_genres]

In [None]:
# (number of rows, number of columns) of the cleaned frame at this point.
movies_cleaned_df.shape

In [None]:
# Sanity check: the cleaned frame must have strictly fewer rows than the raw one.
assert len(movies_cleaned_df) < len(movies_df)

In [None]:
# First five rows of the cleaned frame.
movies_cleaned_df.head(5)

## Checking for Duplicates
### We need to define what a "duplicate" is

In [None]:
# Boolean mask, one entry per row: True for every repeat of a title after its
# first occurrence (pandas' default keep='first').
duplicated_by_title = movies_cleaned_df.duplicated(subset='title')

In [None]:
# Keep only the rows that the title mask flags as repeats, then display them.
duplicated_by_title_df = movies_cleaned_df[duplicated_by_title]
duplicated_by_title_df

In [None]:
# Stricter definition of "duplicate": a row is flagged only when both its title
# and its genres string repeat an earlier row.
duplicated_by_title_and_genre = movies_cleaned_df.duplicated(subset=['title', 'genres'])

In [None]:
# Rows flagged by the (title, genres) mask, displayed for inspection.
duplicated_by_title_and_genre_df = movies_cleaned_df[duplicated_by_title_and_genre]
duplicated_by_title_and_genre_df

In [None]:
# Rows that repeat an earlier title but NOT an earlier (title, genres) pair,
# i.e. the same title recorded again with a different genres string.
# Combining the two boolean masks selects these rows by their position in
# movies_cleaned_df. Concatenating the two duplicate frames and dropping rows
# that appear twice compares rows by value instead, so it would also discard a
# title-only duplicate that happens to be value-identical to another duplicate row.
duplicated_by_title_only_df = movies_cleaned_df.loc[
    duplicated_by_title & ~duplicated_by_title_and_genre
]
records_duplicated_by_title_only = set(duplicated_by_title_only_df['title'])
records_duplicated_by_title_only

### Now we can locate an example using the titles in our list.

In [None]:
# Title used as the running example; compared against the `title` column.
ALADDIN = 'Aladdin (1992)'

In [None]:
def get_aladdin_example(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows of `df` whose `title` equals the ALADDIN constant.

    Args:
        df: Frame with a `title` column.

    Returns:
        The matching rows (possibly none), with all columns of `df`.
    """
    is_aladdin = df['title'] == ALADDIN
    return df.loc[is_aladdin]

In [None]:
# Use the helper instead of repeating its filter inline, so every Aladdin
# lookup in the notebook goes through the same code path.
get_aladdin_example(movies_cleaned_df)

## Step 2: Feature Preparation

What is a feature?

A descriptive attribute that can be used in our algorithms.

Examples:
    - If we are trying to predict house prices, square footage could be a feature we use to predict the house price
    - In our case, as we try to find movies similar to our movie the "feature" we will be focusing on is the "genres" description
    - We need to "prepare" the column's data in such a way that we can compare one genre description to another and get some measure of similarity

### Let's prepare our genres list. First we need to group by movie title.

In [None]:
# One row per title: collect every genres string recorded for that title into a
# Python list. groupby/agg returns a new frame, so movies_cleaned_df is untouched.
movies_grouped_by_title_df = (
    movies_cleaned_df
    .groupby('title')
    .agg({'genres': lambda genre_values: genre_values.to_list()})
    .reset_index()
)

In [None]:
# One row per title; `genres` holds the list of genres strings recorded for it.
movies_grouped_by_title_df

In [None]:
# The running example after grouping by title.
get_aladdin_example(movies_grouped_by_title_df)

Clean up the genres lists.

In [None]:
from analysis.utils.cleaning import combine_genres_list

# Apply the project helper to each title's list of genres strings.
# Presumably it merges them into one list of individual genres; confirm
# against analysis.utils.cleaning. Built as a new frame so the grouped frame
# is left unchanged.
movies_unique_genres_df = movies_grouped_by_title_df.assign(
    genres=movies_grouped_by_title_df['genres'].apply(combine_genres_list)
)

In [None]:
# Preview up to 20 random rows. The fixed seed makes the preview reproducible,
# and the cap avoids a ValueError on frames with fewer than 20 rows.
movies_unique_genres_df.sample(min(20, len(movies_unique_genres_df)), random_state=42)

In [None]:
# The running example after combine_genres_list was applied to its genres.
get_aladdin_example(movies_unique_genres_df)

### Let's think about our recommendation engine again. Let's say that we want to recommend movies by finding the ones with the most similar genres lists.

### Collect Unique List of Genres

In [None]:
# Turn each element of every genres list into its own row. reset_index() keeps
# the original row label as an `index` column next to `genres`.
exploded_genres = movies_unique_genres_df['genres'].explode()
for_genres_list_df = exploded_genres.reset_index()

In [None]:
# Distinct values of the exploded genres column, in order of first appearance.
for_genres_list_df.genres.unique()

In [None]:
# The distinct genres as a plain Python list; used as the vectorizer vocabulary.
unique_genres = for_genres_list_df['genres'].unique()
all_genres = list(unique_genres)

In [None]:
# Inspect the genre vocabulary.
all_genres

In [None]:
# Number of distinct genres.
len(all_genres)

### Let's turn our genres column into a space-separated list of genres (as if they were words in a document)

In [None]:
# Join each genres list into one space-separated string, so every movie becomes
# a small "document" whose words are its genres. Built as a new frame.
movies_with_document_description_df = movies_unique_genres_df.assign(
    genres=movies_unique_genres_df['genres'].apply(' '.join)
)

In [None]:
# `genres` is now a single space-separated string per title.
movies_with_document_description_df

In [None]:
# The running example with its genres joined into one string.
get_aladdin_example(movies_with_document_description_df)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Each document is a space-separated list of genres, so tokenize on whitespace
# only. The default token_pattern (r"(?u)\b\w\w+\b") also splits on punctuation:
# a hyphenated genre would be broken into pieces that never match its entry in
# `all_genres`, and would silently get zero weight in every document.
tf = TfidfVectorizer(vocabulary=all_genres, token_pattern=r'\S+')
# One row per movie, one column per genre in `all_genres`.
tfidf_matrix = tf.fit_transform(movies_with_document_description_df['genres'])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# With a single argument, cosine_similarity compares every row of the matrix
# with every row of the same matrix, giving a dense movies-by-movies array.
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Pairwise similarity matrix: entry [i, j] compares the genre documents of rows i and j.
cosine_sim

In [None]:
# Square: one row and one column per row of movies_with_document_description_df.
cosine_sim.shape

In [None]:
from analysis.utils.recommendation import get_similar_movies

# Ask the project helper for movies similar to 'Toy Story (1995)', passing the
# similarity matrix and the frame it was computed from.
# NOTE(review): 30 is presumably the number of recommendations to return, and
# row i of cosine_sim is assumed to correspond to row i of the frame; confirm
# against get_similar_movies.
similar_movies = get_similar_movies('Toy Story (1995)', cosine_sim, movies_with_document_description_df, 30)
similar_movies

### THAT'S IT! :)