### Basic Overview of the Dataset

In [2]:
# Import necessary libraries
import pandas as pd
import pickle

In [3]:
# Read the dataset
movies_df = pd.read_csv('top10K-TMDB-movies.csv')
movies_df

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811
...,...,...,...,...,...,...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,Fantasy",en,"The story follows the adventures of Aang, a yo...",98.322,2010-06-30,4.7,3347
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventure",en,The sharks take bite out of the East Coast whe...,12.490,2015-07-22,4.7,417
9997,13995,Captain America,"Action,Science Fiction,War",en,"During World War II, a brave, patriotic Americ...",18.333,1990-12-14,4.6,332
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama",en,A man named Farmer sets out to rescue his kidn...,15.159,2007-11-29,4.7,668


In [4]:
# Get the shape of the dataset
movies_df.shape

(10000, 9)

In [5]:
# Get all column names
list(movies_df.columns)

['id',
 'title',
 'genre',
 'original_language',
 'overview',
 'popularity',
 'release_date',
 'vote_average',
 'vote_count']

In [6]:
"""
Make a new pandas dataframe taking two columns: 'genre' and 'overview' which
will be used to similarize movies.
"""

tags = movies_df['genre'] + movies_df['overview']
filtered_df = movies_df[['id', 'title']]
filtered_df['tags'] = tags
filtered_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['tags'] = tags


Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."
...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,FantasyThe story follows the ..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventu..."
9997,13995,Captain America,"Action,Science Fiction,WarDuring World War II,..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,DramaA man named Farm..."


### Feature Extraction

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize a CountVectorizer object
vectorizer = CountVectorizer(
    max_features=10000,  # Maximum number of features (words) to consider
    stop_words='english'  # Language-specific stopwords to be removed from the text
)

# Transform the text data into a bag-of-words representation
vectorized = vectorizer.fit_transform(filtered_df['tags'].values.astype('U')).toarray()

# Return the vectorized text data
vectorized


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Cosine Similarity

In [13]:
# Importing the cosine_similarity function from the sklearn.metrics.pairwise module
from sklearn.metrics.pairwise import cosine_similarity

# Computing the cosine similarity matrix for the given vectors
similarity = cosine_similarity(vectorized)

# The 'similarity' variable now holds a matrix where each element (i, j) represents the cosine similarity 
# between the ith and jth vectors in the 'vectorized' matrix.
similarity


array([[1.        , 0.06253054, 0.05802589, ..., 0.07963978, 0.07597372,
        0.03798686],
       [0.06253054, 1.        , 0.08980265, ..., 0.        , 0.        ,
        0.        ],
       [0.05802589, 0.08980265, 1.        , ..., 0.02541643, 0.03636965,
        0.        ],
       ...,
       [0.07963978, 0.        , 0.02541643, ..., 1.        , 0.03327792,
        0.03327792],
       [0.07597372, 0.        , 0.03636965, ..., 0.03327792, 1.        ,
        0.04761905],
       [0.03798686, 0.        , 0.        , ..., 0.03327792, 0.04761905,
        1.        ]])

In [8]:
# Pick a row with a title 'The Godfather'
filtered_df[filtered_df['title'] == 'The Godfather']

Unnamed: 0,id,title,tags
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."


In [15]:
# Get the sorted cosine distance for the row
distance = sorted(list(enumerate(similarity[2])), reverse=True, key=lambda vector:vector[1])

# Print the titles of the top 5 most similar movies (excluding the input movie itself)
for i in distance[1:6]:
    print(filtered_df.iloc[i[0]].title)

The Godfather: Part II
Felon
House of Gucci
Gotti
The Outsider


In [11]:
def recommend(movies):
    """
    Recommends similar movies based on the input movie.

    Parameters:
    movies (str): The title of the movie for which recommendations are requested.

    Returns:
    None: Prints the titles of similar movies.
    """
    # Find the index of the input movie in the DataFrame
    index = filtered_df[filtered_df['title'] == movies].index[0]
    
    # Calculate the cosine similarity of the input movie with all other movies
    distance = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda vector:vector[1])
    
    # Print the titles of the top 5 most similar movies (excluding the input movie itself)
    for i in distance[1:6]:
        print(filtered_df.iloc[i[0]].title)


In [12]:
# Testing the function
recommend('Iron Man')

Mazinger Z: Infinity
Justice League Dark
Iron Man 3
The Colony
Marvel One-Shot: Item 47


### Save the object using Pickle

In [13]:
# Save the DataFrame containing movie data to a pickle file
pickle.dump(filtered_df, open('movie_data.pkl', 'wb'))

# Save the cosine similarity matrix to a pickle file
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [14]:
# Load the 'movie_data' pickle file
pickle.load(open('movie_data.pkl', 'rb'))

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,CrimeIn the continuing saga of the Corle..."
...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,FantasyThe story follows the ..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventu..."
9997,13995,Captain America,"Action,Science Fiction,WarDuring World War II,..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,DramaA man named Farm..."
