### Import libraries

In [1]:
import pandas as pd
import numpy as np

### Read data

In [2]:
# Load the movie metadata table that the keyword features are built from.
data = pd.read_csv('content_base_data.csv')

In [3]:
# Preview the first rows to check the columns used below (movieId, title, keyword, ...).
data.head()

Unnamed: 0,movieId,title,genres,poster_url,overview,keyword
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,https://image.tmdb.org/t/p/w500//uXDfjJbdP4ijW...,"Led by Woody, Andy's toys live happily in his ...",toy stori toy stori toy stori adventur anim ch...
1,2,Jumanji (1995),Adventure Children Fantasy,https://image.tmdb.org/t/p/w500//6aGn2X51bahFo...,When siblings Judy and Peter discover an encha...,jumanji jumanji jumanji adventur children fant...
2,3,Grumpier Old Men (1995),Comedy Romance,https://image.tmdb.org/t/p/w500//1FSXpj5e8l4KH...,A family wedding reignites the ancient feud be...,grumpier old men grumpier old men grumpier old...
3,4,Waiting to Exhale (1995),Comedy Drama Romance,https://image.tmdb.org/t/p/w500//4uw6HKq4vlhrS...,"Cheated on, mistreated and stepped on, the wom...",wait to exhal wait to exhal wait to exhal come...
4,5,Father of the Bride Part II (1995),Comedy,https://image.tmdb.org/t/p/w500//rj4LBtwQ0uGrp...,Just when George Banks has recovered from his ...,father of the bride part ii father of the brid...


In [4]:
# Coerce every keyword entry to text (a missing value becomes the string 'nan')
# before it is handed to the vectorizer.
data['keyword'] = data['keyword'].apply(str)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer: English stop words are dropped and the vocabulary
# is limited to at most 5000 terms.
cv = CountVectorizer(stop_words='english', max_features=5000)

In [126]:
# Fit the vocabulary on the keyword text and count term occurrences per movie.
keyword_counts = cv.fit_transform(data['keyword'])
# Densify the result so it can be inspected and saved as a plain NumPy array.
vectors = keyword_counts.toarray()
vectors.shape

(9742, 5000)

### Get the 5000 most common keywords to use as feature names and store them in the dictionary.csv file

In [135]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Fall back to the old method only
# on releases that predate the new one.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# One row per vocabulary term, in the same order as the columns of the count matrix.
dictionary = pd.DataFrame(list(feature_names), columns=['dictionary'])
dictionary.to_csv('dictionary.csv', index=False)

In [134]:
# Inspect the extracted vocabulary.
dictionary

['000',
 '10',
 '100',
 '101',
 '10th',
 '11',
 '12',
 '13',
 '13th',
 '14',
 '15',
 '16',
 '16th',
 '17',
 '18',
 '18th',
 '19',
 '1900',
 '1920',
 '1930',
 '1930s',
 '1940',
 '1944',
 '1945',
 '1950',
 '1950s',
 '1960',
 '1960s',
 '1962',
 '1965',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1975',
 '1976',
 '1979',
 '1980',
 '1980s',
 '1984',
 '1985',
 '1986',
 '1990',
 '1992',
 '1994',
 '1999',
 '19th',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '20th',
 '21',
 '21st',
 '23',
 '24',
 '25',
 '25th',
 '27',
 '28',
 '30',
 '300',
 '3000',
 '35',
 '39',
 '3d',
 '40',
 '42',
 '47',
 '48',
 '50',
 '500',
 '60',
 '6th',
 '70',
 '80',
 '90',
 'aaron',
 'abandon',
 'abbott',
 'abduct',
 'abil',
 'abl',
 'aboard',
 'abomin',
 'abort',
 'abov',
 'abroad',
 'absence',
 'absolut',
 'absorb',
 'absurd',
 'abus',
 'academi',
 'academy',
 'accept',
 'access',
 'accid',
 'accident',
 'acclaim',
 'accompani',
 'accomplish',
 'accord',
 'account',
 'accus',
 'ace',
 'achiev',
 'acquaint',
 'acquir

In [127]:
# Inspect the dense term-count matrix (one row per movie, one column per vocabulary term).
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Store the feature vectors in the matrix.npz file

In [128]:
from numpy import asarray, savez_compressed

# Write the count matrix to a compressed archive. Because the array is passed
# positionally, it is stored under the default key 'arr_0'.
savez_compressed('matrix.npz', vectors)

### Load data from matrix.npz file

In [7]:
from numpy import load

# Keep the loaded matrix under its own name. Assigning it to `data` would replace
# the movie DataFrame, and the recommendation cells below rely on
# `data['movieId']` / `data.loc[...]`, which do not exist on a NumPy array.
dict_data = load('matrix.npz')
vectors = dict_data['arr_0']  # positional arrays are saved as 'arr_0', 'arr_1', ...

In [9]:
# Inspect the term-count matrix under its own name; `data` is reserved for the movie DataFrame.
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [136]:
from sklearn.metrics.pairwise import cosine_similarity

### Calculate the similarity between all the vectors created above using the cosine_similarity function

In [137]:
# Pairwise cosine similarity between the keyword count vectors of all movies.
# The matrix is named `vectors`; `vector` is not defined anywhere and raised a NameError.
similarity = cosine_similarity(vectors)

In [138]:
# Sanity check: the result should be square, with one row and one column per movie.
similarity.shape

(9742, 9742)

In [139]:
# Persist the similarity matrix. NumPy appends the `.npz` extension, so this writes
# `similarity.npz`, with the array stored under the default key 'arr_0'.
np.savez_compressed('similarity', similarity)

### Calculate the similarity of the movie with id=1 to all other movies (get the 10 most similar movies)

In [145]:
# Row of the similarity matrix that belongs to the movie with movieId == 1.
query_row = data.loc[data['movieId'] == 1].index[0]

# Pair every row position with its similarity score, then rank from most to least similar.
scored_rows = list(enumerate(similarity[query_row]))
scored_rows.sort(key=lambda pair: pair[1], reverse=True)

# Keep the ten best-scoring entries (the query movie itself ranks first).
most_similars = scored_rows[:10]

In [146]:
# Look up the full movie record for each (row label, score) pair, keeping the ranked order.
result = [data.loc[row_label] for row_label, _score in most_similars]

In [147]:
# Print each recommended movie record, most similar first.
for movie in result:
    print(movie)

movieId                                                       1
title                                          Toy Story (1995)
genres              Adventure Animation Children Comedy Fantasy
poster_url    https://image.tmdb.org/t/p/w500//uXDfjJbdP4ijW...
overview      Led by Woody, Andy's toys live happily in his ...
keyword       toy stori toy stori toy stori adventur anim ch...
Name: 0, dtype: object
movieId                                                    3114
title                                        Toy Story 2 (1999)
genres              Adventure Animation Children Comedy Fantasy
poster_url    https://image.tmdb.org/t/p/w500//eVGu0zsezaSCu...
overview      Andy heads off to Cowboy Camp, leaving his toy...
keyword       toy stori 2 toy stori 2 toy stori 2 adventur a...
Name: 2355, dtype: object
movieId                                                   78499
title                                        Toy Story 3 (2010)
genres         Adventure Animation Children Comedy Fant