In [2]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [3]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [33]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [5]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [25]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [36]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)
print(X_train_counts)

  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 8)	1
  (1, 1)	1
  (1, 3)	1
  (1, 8)	1
  (2, 4)	1
  (2, 15)	1
  (3, 4)	1
  (3, 15)	1
  (3, 7)	1
  (4, 4)	1
  (5, 0)	1
  (5, 5)	1
  (5, 17)	1
  (6, 4)	1
  (6, 15)	1
  (7, 1)	1
  (7, 3)	1
  (8, 0)	1
  (9, 1)	1
  (9, 0)	1
  (9, 17)	1
  :	:
  (9731, 1)	1
  (9731, 4)	1
  (9731, 0)	1
  (9731, 16)	1
  (9732, 2)	1
  (9732, 4)	1
  (9732, 0)	1
  (9732, 16)	1
  (9733, 2)	1
  (9733, 7)	1
  (9734, 4)	1
  (9734, 7)	1
  (9735, 2)	1
  (9736, 6)	1
  (9737, 2)	1
  (9737, 4)	1
  (9737, 8)	1
  (9737, 0)	1
  (9738, 2)	1
  (9738, 4)	1
  (9738, 8)	1
  (9739, 7)	1
  (9740, 2)	1
  (9740, 0)	1
  (9741, 4)	1


In [35]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf)

  (0, 8)	0.482990142708577
  (0, 4)	0.26758647689140014
  (0, 3)	0.5048454681396087
  (0, 2)	0.5162254711770092
  (0, 1)	0.41684567364693936
  (1, 8)	0.5936619434123594
  (1, 3)	0.620525172745643
  (1, 1)	0.5123612074824268
  (2, 15)	0.8210088907493954
  (2, 4)	0.5709154064399099
  (3, 15)	0.726240982959826
  (3, 7)	0.46640480307738325
  (3, 4)	0.5050154397005037
  (4, 4)	1.0
  (5, 17)	0.5420423542868653
  (5, 5)	0.6359470441562756
  (5, 0)	0.5493281743985542
  (6, 15)	0.8210088907493954
  (6, 4)	0.5709154064399099
  (7, 3)	0.7711121633813997
  (7, 1)	0.6366993258087036
  (8, 0)	1.0
  (9, 17)	0.5457299419583337
  (9, 1)	0.6295217016667961
  (9, 0)	0.5530653284926608
  :	:
  (9731, 16)	0.5912169125932073
  (9731, 4)	0.35033318329028923
  (9731, 1)	0.5457483258721834
  (9731, 0)	0.47946635727353976
  (9732, 16)	0.5491808953132112
  (9732, 4)	0.32542420076143624
  (9732, 2)	0.6278054979536716
  (9732, 0)	0.4453758980017919
  (9733, 7)	0.43179372819853856
  (9733, 2)	0.9019723811117537
  (

In [9]:
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [10]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [11]:
res

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608]]),
 array([[6774, 9096, 5636, 6723, 3376, 7496, 9717]], dtype=int64))

In [21]:
res[1][0]

array([6774, 9096, 5636, 6723, 3376, 7496, 9717], dtype=int64)

In [23]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
5636,27368,Asterix & Obelix: Mission Cleopatra (Astérix &...,Adventure|Comedy|Fantasy
6723,58972,Nim's Island (2008),Adventure|Comedy|Fantasy
3376,4591,Erik the Viking (1989),Adventure|Comedy|Fantasy
7496,82854,Gulliver's Travels (2010),Adventure|Comedy|Fantasy
9717,188833,The Man Who Killed Don Quixote (2018),Adventure|Comedy|Fantasy


In [13]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [14]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [15]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [16]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [17]:
movies_with_tags[movies_with_tags.title == 'Toy Story (1995)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0


In [18]:
movies_with_tags.tag.unique()

array(['pixar', 'fun', 'fantasy', ..., 'star wars', 'gintama', 'remaster'],
      dtype=object)

In [19]:
movies_with_tags.tag.unique().shape

(1590,)

In [20]:
movies_with_tags.dropna(inplace=True)

In [21]:
movies_with_tags.title.unique().shape

(1572,)

In [22]:
tag_strings = []
movies = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=1572.0), HTML(value='')))




In [23]:
tag_strings[:5]

['artistic Funny humorous inspiring intelligent quirky romance ZooeyDeschanel',
 'lawyers',
 'creepy suspense',
 'Shakespearesortof',
 'dogs remake']

In [24]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [25]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [26]:
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='manhattan',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [27]:
for i in range(len(movies)):
    if 'Magnolia (1999)' == movies[i]:
        print(i)

822


In [28]:
tag_strings[822]

'L.A.'

In [41]:
test = change_string('pixar pixar fun')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [42]:
res

(array([[0., 0., 1., 1., 1., 1., 1., 1., 1., 1.]]),
 array([[661, 822, 947, 955, 954, 953, 951, 950, 949, 959]]))

In [43]:
for i in res[1][0]:
    print(movies[i])

In a Lonely Place (1950)
Magnolia (1999)
Neon Genesis Evangelion: Death & Rebirth (Shin seiki Evangelion Gekijô-ban: Shito shinsei) (1997)
Night and Day (1946)
Nicholas Nickleby (2002)
Niagara (1953)
Never Been Kissed (1999)
Network (1976)
Net, The (1995)
Night of the Hunter, The (1955)
