In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv(r'dataset\movie.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.shape

(27278, 3)

In [5]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [6]:
movies.duplicated().sum()

0

In [7]:
movies['metadata'] = movies['title'] + ' ' + movies['genres']

In [8]:
movies.head()

Unnamed: 0,movieId,title,genres,metadata
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story (1995) Adventure|Animation|Children|...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji (1995) Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men (1995) Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale (1995) Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II (1995) Comedy


# applying tfidf vectorizer

In [9]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

In [10]:
tfidf_vectorizer

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [11]:
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['metadata'])

In [12]:
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 151382 stored elements and shape (27278, 21711)>

In [13]:
tfidf_matrix.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
tfidf_matrix.shape

(27278, 21711)

# Computing Cosine Similarities B/w Movies

In [15]:
cosine_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [16]:
cosine_sim_matrix.shape

(27278, 27278)

In [17]:
cosine_sim_matrix

array([[1.        , 0.37051575, 0.13812966, ..., 0.08370916, 0.        ,
        0.16806805],
       [0.37051575, 1.        , 0.12184768, ..., 0.08760281, 0.        ,
        0.17588558],
       [0.13812966, 0.12184768, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.08370916, 0.08760281, 0.        , ..., 1.        , 0.        ,
        0.28054861],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.16806805, 0.17588558, 0.        , ..., 0.28054861, 0.        ,
        1.        ]])

In [18]:
# function to get movies title
def get_movie_name(movie_ids):
    print(movies[movies['movieId'].isin(movie_ids)][['movieId', 'title']])


get_movie_name([1])



   movieId             title
0        1  Toy Story (1995)


In [19]:
def recommand_movie(user_history, top_n = 10):
    # Recommend movies using content-based filtering.
    user_indices = movies[movies['movieId'].isin(user_history)].index.tolist()
    print(user_indices)

    sim_score = cosine_sim_matrix[user_indices].mean(axis=0)
    
    # print(sim_score)
    # print(sim_score.shape)
    # print(cosine_sim_matrix[user_indices])
    # print(cosine_sim_matrix[user_indices].mean(axis=0))
    sorted_indices = sim_score.argsort()[::-1]
    # print(sorted_indices)

    recommanded_indices = [i for i in sorted_indices if movies.iloc[i]['movieId'] not in user_history]
    # print(recommanded_indices)

    top_recommanded_movies = movies.iloc[recommanded_indices[:top_n]]
    # print(top_recommanded_movies)

    return top_recommanded_movies[['title']]['title'].tolist()
    return top_recommanded_movies[['movieId', 'title']]



In [20]:
recommand_movie([1, 3, 12])

[0, 2, 11]


['Two Much (1995)',
 'Go Now (1995)',
 'Now and Then (1995)',
 'Toy Story 2 (1999)',
 'Toy Story 3 (2010)',
 'Show, The (1995)',
 'Over My Dead Body (1995)',
 'Toy Story of Terror (2013)',
 'It Takes Two (1995)',
 'Boys on the Side (1995)']

In [21]:
import pickle

In [22]:
pickle.dump(tfidf_vectorizer, open('CB_tfidfVectorizer.pkl', 'wb'))

In [23]:
pickle.dump(movies, open('CB_movies.pkl', 'wb'))

In [24]:
pickle.dump(cosine_sim_matrix, open('CB_cosine_sim_matrix.pkl', 'wb'))

In [25]:
import pandas as pd

In [26]:
links = pd.read_csv("dataset/link.csv")

In [27]:
pickle.dump(links, open("links.pkl", "wb"))