# Build a recommendation system

## Content-based filtering

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
data = pd.read_csv("../datasets/movie_data/movies.csv",sep='\t', encoding='latin-1',usecols=['title','genres'])
data.head()

Unnamed: 0,title,genres
0,Toy Story (1995),Animation|Children's|Comedy
1,Jumanji (1995),Adventure|Children's|Fantasy
2,Grumpier Old Men (1995),Comedy|Romance
3,Waiting to Exhale (1995),Comedy|Drama
4,Father of the Bride Part II (1995),Comedy


## Preprocessing

In [3]:
# Genres
# data['genres'] = data['genres'].str.replace('|', ' ')
data['genres'] = data['genres'].apply(lambda genres: genres.replace('|',' ').replace('-',''))


In [4]:
data['genres'].head()

0     Animation Children's Comedy
1    Adventure Children's Fantasy
2                  Comedy Romance
3                    Comedy Drama
4                          Comedy
Name: genres, dtype: object

In [5]:
data['genres'].nunique()

301

In [6]:
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['genres'])


In [7]:
print(tfidf_matrix.shape)
print(tfidf_matrix.toarray()[0])
print(vectorizer.vocabulary_)

(3883, 18)
[0.         0.         0.72890105 0.59171433 0.34435072 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.        ]
{'animation': 2, 'children': 3, 'comedy': 4, 'adventure': 1, 'fantasy': 8, 'romance': 13, 'drama': 7, 'action': 0, 'crime': 5, 'thriller': 15, 'horror': 10, 'scifi': 14, 'documentary': 6, 'war': 16, 'musical': 11, 'mystery': 12, 'filmnoir': 9, 'western': 17}


In [8]:
# Convert to tfidf matrix to dataframe
tfidf_matrix_df = pd.DataFrame(tfidf_matrix.todense(), columns=vectorizer.get_feature_names_out(),index=data['title'])
tfidf_matrix_df

Unnamed: 0_level_0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,musical,mystery,romance,scifi,thriller,war,western
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Toy Story (1995),0.0,0.000000,0.728901,0.591714,0.344351,0.0,0.0,0.000000,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
Jumanji (1995),0.0,0.499814,0.000000,0.516339,0.000000,0.0,0.0,0.000000,0.6954,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
Grumpier Old Men (1995),0.0,0.000000,0.000000,0.000000,0.573172,0.0,0.0,0.000000,0.0000,0.0,0.0,0.0,0.0,0.819435,0.0,0.000000,0.0,0.0
Waiting to Exhale (1995),0.0,0.000000,0.000000,0.000000,0.755606,0.0,0.0,0.655026,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
Father of the Bride Part II (1995),0.0,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Meet the Parents (2000),0.0,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
Requiem for a Dream (2000),0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
Tigerland (2000),0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0
Two Family House (2000),0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.0000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0


In [None]:
consine_matrix = cosine_similarity(tfidf_matrix)
consine_matrix_df = pd.DataFrame(consine_matrix, columns=data.title, index=data['title'])
consine_matrix_df

title,Toy Story (1995),Jumanji (1995),Grumpier Old Men (1995),Waiting to Exhale (1995),Father of the Bride Part II (1995),Heat (1995),Sabrina (1995),Tom and Huck (1995),Sudden Death (1995),GoldenEye (1995),...,Bamboozled (2000),Bootmen (2000),Digimon: The Movie (2000),Get Carter (2000),Get Carter (1971),Meet the Parents (2000),Requiem for a Dream (2000),Tigerland (2000),Two Family House (2000),"Contender, The (2000)"
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Toy Story (1995),1.000000,0.305525,0.197372,0.260194,0.344351,0.000000,0.197372,0.425153,0.0,0.000000,...,0.344351,0.260194,0.801460,0.000000,0.000000,0.344351,0.000000,0.000000,0.000000,0.000000
Jumanji (1995),0.305525,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.718623,0.0,0.320890,...,0.000000,0.000000,0.538118,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
Grumpier Old Men (1995),0.197372,0.000000,1.000000,0.433093,0.573172,0.000000,1.000000,0.000000,0.0,0.000000,...,0.573172,0.433093,0.000000,0.000000,0.000000,0.573172,0.000000,0.000000,0.000000,0.000000
Waiting to Exhale (1995),0.260194,0.000000,0.433093,1.000000,0.755606,0.000000,0.433093,0.000000,0.0,0.000000,...,0.755606,1.000000,0.000000,0.262005,0.000000,0.755606,0.655026,0.655026,0.655026,0.343133
Father of the Bride Part II (1995),0.344351,0.000000,0.573172,0.755606,1.000000,0.000000,0.573172,0.000000,0.0,0.000000,...,1.000000,0.755606,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Meet the Parents (2000),0.344351,0.000000,0.573172,0.755606,1.000000,0.000000,0.573172,0.000000,0.0,0.000000,...,1.000000,0.755606,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
Requiem for a Dream (2000),0.000000,0.000000,0.000000,0.655026,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.655026,0.000000,0.399991,0.000000,0.000000,1.000000,1.000000,1.000000,0.523847
Tigerland (2000),0.000000,0.000000,0.000000,0.655026,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.655026,0.000000,0.399991,0.000000,0.000000,1.000000,1.000000,1.000000,0.523847
Two Family House (2000),0.000000,0.000000,0.000000,0.655026,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.655026,0.000000,0.399991,0.000000,0.000000,1.000000,1.000000,1.000000,0.523847


## Tips:
- Lưu trữ ma trận cosine similarity để mỗi khi gợi ý chỉ cần gọi ra, không cần tính lại

In [29]:
input_movie = "GoldenEye (1995)"
top_k = 20
# Lấy những cosine similarity lớn nhất của bô phim
result = consine_matrix_df[input_movie].sort_values(ascending=False).head(top_k)
result

title
Maximum Risk (1996)                      1.000000
Perfect Storm, The (2000)                1.000000
Anaconda (1997)                          1.000000
Surviving the Game (1994)                1.000000
Firestorm (1998)                         1.000000
Daylight (1996)                          1.000000
Con Air (1997)                           1.000000
Chain Reaction (1996)                    1.000000
GoldenEye (1995)                         1.000000
Rock, The (1996)                         1.000000
Clear and Present Danger (1994)          1.000000
Runaway Train (1985)                     0.948322
Bird on a Wire (1990)                    0.875544
Twister (1996)                           0.875544
Mummy, The (1999)                        0.854468
Deliverance (1972)                       0.841558
Edge, The (1997)                         0.841558
Escape from New York (1981)              0.839803
Lost World: Jurassic Park, The (1997)    0.839803
Abyss, The (1989)                        0.8

In [17]:
consine_matrix_df.loc[input_movie,:].shape

(3883,)