In [1]:
import pandas as pd
import re
import random
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pickle as pk

In [2]:
# Load the title metadata (comma-separated) and the ratings table (tab-separated).
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.tsv', delimiter='\t')

In [3]:
# Discard the two columns that are not used anywhere later in the notebook.
movies = movies.drop(columns=['originalTitle', 'endYear'])

In [4]:
# Capture the column labels of movies (after originalTitle/endYear were dropped).
cols = movies.columns

In [5]:
# Show the remaining column labels.
print(cols)

Index(['tconst', 'titleType', 'primaryTitle', 'isAdult', 'startYear',
       'runtimeMinutes', 'genres'],
      dtype='object')


In [6]:
# Print the distinct values found in each column of movies, one column per line of output.
for column_name in cols:
    distinct_values = movies[column_name].unique()
    print(distinct_values)

['tt0000502' 'tt0000574' 'tt0000591' ... 'tt9916706' 'tt9916730'
 'tt9916754']
['movie']
['Bohemios' 'The Story of the Kelly Gang' 'The Prodigal Son' ...
 'Dankyavar Danka' '6 Gunn' 'Chico Albuquerque - Revelações']
[0 1]
['1905' '1906' '1907' '1908' '1909' '1910' '1912' '1911' '1913' '1915'
 '1914' '1919' '1916' '1917' '1936' '1925' '1918' '1920' '1922' '1921'
 '1924' '1923' '1928' '2019' '2021' '1926' '1927' '1929' '2000' '1993'
 '1935' '1930' '1942' '1932' '1931' '1934' '1939' '1937' '1933' '1950'
 '1938' '1951' '1946' '1996' '1940' '1944' '1947' '1941' '1952' '1970'
 '1957' '1943' '1948' '1945' '2001' '1949' '1953' '1954' '1965' '1983'
 '1980' '1973' '1961' '1955' '1962' '1958' '1956' '1977' '1964' '1960'
 '1959' '1967' '1968' '1963' '1971' '1969' '1972' '1966' '1976' '1990'
 '1979' '1981' '2020' '1975' '1978' '1989' '1974' '1986' '1995' '1987'
 '1985' '2018' '1984' '1982' '1988' '1991' '\\N' '1994' '1992' '2005'
 '2004' '1998' '2016' '2002' '2017' '1997' '1999' '2006' '2008' '2009

In [7]:
# Display the ratings table loaded from ratings.tsv (pandas truncates the rendered rows).
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1888
1,tt0000002,5.9,250
2,tt0000003,6.5,1677
3,tt0000004,5.8,163
4,tt0000005,6.2,2498
...,...,...,...
1253545,tt9916690,6.5,6
1253546,tt9916720,5.3,223
1253547,tt9916730,8.4,6
1253548,tt9916766,6.7,20


In [8]:
# Relabel the '\N' placeholder in the genres column as the explicit label 'unknown'.
movies['genres'] = movies['genres'].replace(to_replace='\\N', value='unknown')

In [9]:
# Build {genre label -> list of row positions carrying that genre}.
# A row whose genre list contains 'unknown' is filed under 'unknown' only;
# every other row is filed under each of its comma-separated genres.
all_genre = {}
for row_pos, genre_list in enumerate(movies['genres'].str.split(',')):
    labels = ['unknown'] if 'unknown' in genre_list else genre_list
    for label in labels:
        # setdefault keeps first-seen key order, which later fixes the one-hot column order.
        all_genre.setdefault(label, []).append(row_pos)

In [10]:
# Work on a copy of movies; reset_index() also materialises the previous index
# as an 'index' column on the copy.
movies_oh = movies.copy().reset_index()

# One indicator column per genre: start at 0 everywhere, then flag the member rows.
for genre, member_rows in all_genre.items():
    movies_oh[genre] = 0
    movies_oh.loc[member_rows, genre] = 1

In [11]:
# Remove the 'index' column that reset_index() added.
movies_oh = movies_oh.drop(columns=['index'])

In [12]:
# Treat the '\N' placeholder in runtimeMinutes as a runtime of 0.
movies_oh['runtimeMinutes'] = movies_oh['runtimeMinutes'].replace(to_replace='\\N', value=0)

In [13]:
# Inspect dtypes and non-null counts after adding the genre indicator columns.
movies_oh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613013 entries, 0 to 613012
Data columns (total 36 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          613013 non-null  object
 1   titleType       613013 non-null  object
 2   primaryTitle    613013 non-null  object
 3   isAdult         613013 non-null  int64 
 4   startYear       613013 non-null  object
 5   runtimeMinutes  613013 non-null  object
 6   genres          613013 non-null  object
 7   unknown         613013 non-null  int64 
 8   Action          613013 non-null  int64 
 9   Adventure       613013 non-null  int64 
 10  Biography       613013 non-null  int64 
 11  Drama           613013 non-null  int64 
 12  Fantasy         613013 non-null  int64 
 13  Comedy          613013 non-null  int64 
 14  War             613013 non-null  int64 
 15  Documentary     613013 non-null  int64 
 16  Crime           613013 non-null  int64 
 17  Romance         613013 non-nu

In [14]:
# Convert runtimeMinutes from object to the platform's default integer dtype.
runtime_as_int = movies_oh['runtimeMinutes'].astype(int)
movies_oh = movies_oh.assign(runtimeMinutes=runtime_as_int)

In [15]:
# Assign every distinct startYear value an integer code.
# NOTE(review): codes follow the order of first appearance in the column, not
# chronological order, and the '\N' placeholder receives a code like any other
# value — confirm this is the intended encoding for a distance-based model.
distinct_years = movies_oh['startYear'].unique()
year_dict = dict(zip(distinct_years, range(len(distinct_years))))

In [16]:
# Replace each startYear value by its integer code from year_dict.
movies_oh = movies_oh.assign(startYear=movies_oh['startYear'].map(year_dict))

In [17]:
# Re-check dtypes now that runtimeMinutes and startYear have been converted to integers.
movies_oh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613013 entries, 0 to 613012
Data columns (total 36 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          613013 non-null  object
 1   titleType       613013 non-null  object
 2   primaryTitle    613013 non-null  object
 3   isAdult         613013 non-null  int64 
 4   startYear       613013 non-null  int64 
 5   runtimeMinutes  613013 non-null  int32 
 6   genres          613013 non-null  object
 7   unknown         613013 non-null  int64 
 8   Action          613013 non-null  int64 
 9   Adventure       613013 non-null  int64 
 10  Biography       613013 non-null  int64 
 11  Drama           613013 non-null  int64 
 12  Fantasy         613013 non-null  int64 
 13  Comedy          613013 non-null  int64 
 14  War             613013 non-null  int64 
 15  Documentary     613013 non-null  int64 
 16  Crime           613013 non-null  int64 
 17  Romance         613013 non-nu

In [18]:
# Feature matrix: every non-object column of movies_oh.
train = movies_oh.select_dtypes(exclude=['object'])

In [19]:
from sklearn.neighbors import NearestNeighbors

# Brute-force cosine nearest-neighbour index over the numeric feature matrix;
# each query returns 100 neighbours and uses all available cores.
knn = NearestNeighbors(
    n_neighbors=100,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
knn.fit(train)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=100)

In [20]:
# Query the index with the feature row labelled 424548, shaped as a single-sample 2-D array.
query_vector = train.loc[424548].to_numpy().reshape(1, -1)
dist, n = knn.kneighbors(query_vector)

In [21]:
# Titles of the returned neighbours. n[0] holds row positions from kneighbors;
# they double as .loc labels because movies_oh carries a default RangeIndex.
movies_oh['primaryTitle'].loc[n[0]]

424548          Avengers: Age of Ultron
217380                        Star Trek
186071                         Iron Man
525735                         Moonfall
224548           The Amazing Spider-Man
                      ...              
294464        Liverpool vs Leeds United
280393                     Tiger Killer
235749                         Immortal
300629           Southampton vs Everton
302635    Leicester City vs Aston Villa
Name: primaryTitle, Length: 100, dtype: object

In [22]:
# Attach the rating columns to every title; titles without a ratings row are kept.
movies_oh_rating = movies_oh.merge(ratings, how='left', on='tconst')

In [None]:
# Fill every missing value left by the left merge with 0.
movies_oh_rating = movies_oh_rating.fillna(0)

In [None]:
# Feature matrix for the second model: every non-object column of the merged frame.
train_rating = movies_oh_rating.select_dtypes(exclude=['object'])

In [None]:
# Display the numeric feature frame derived from the merged movies/ratings table.
train_rating

In [None]:
# Second cosine nearest-neighbour index, fitted on the features that include the rating columns.
knn2 = NearestNeighbors(
    n_neighbors=100,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
knn2.fit(train_rating)

In [None]:
# Query the second index with the feature row labelled 424548 as a single-sample 2-D array.
query_vector2 = train_rating.loc[424548].to_numpy().reshape(1, -1)
dist2, n2 = knn2.kneighbors(query_vector2)

In [None]:
# Titles of the neighbours returned by knn2, looked up in the frame it was fitted from.
movies_oh_rating['primaryTitle'].loc[n2[0]]

In [None]:
# Slim frame: identifying columns plus isAdult/startYear, left-joined with the ratings table.
rating_only = movies_oh[['tconst', 'primaryTitle', 'isAdult', 'startYear']].merge(
    ratings, how='left', on='tconst'
)

In [None]:
# Fill missing values from the left merge with 0, then keep only the non-object columns as features.
rating_only = rating_only.fillna(value=0)
train_rating_only = rating_only.select_dtypes(exclude=['object'])

In [None]:
# Display the numeric columns kept from rating_only.
train_rating_only

In [None]:
from sklearn.neighbors import NearestNeighbors

# Third cosine nearest-neighbour index, fitted on the numeric columns of rating_only.
knn3 = NearestNeighbors(
    n_neighbors=100,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
knn3.fit(train_rating_only)

In [None]:
# Query the third index with the feature row labelled 424548 as a single-sample 2-D array.
query_vector3 = train_rating_only.loc[424548].to_numpy().reshape(1, -1)
dist3, n3 = knn3.kneighbors(query_vector3)

In [None]:
# Titles of the neighbours returned by knn3.
# knn3 was fitted on train_rating_only, which is derived from rating_only, so the
# neighbour positions are resolved against rating_only (not movies_oh_rating).
# This keeps the lookup tied to the data the model was fitted on and removes the
# dependency on an unrelated frame.
rating_only.loc[n3[0], 'primaryTitle']

In [None]:
# Persist the first two fitted indexes to disk, one pickle file per model.
# NOTE(review): knn3 is not saved here — confirm that is intentional.
for pickle_path, fitted_model in (('knn.pickle', knn), ('knn2.pickle', knn2)):
    with open(pickle_path, 'wb') as handle:
        pk.dump(fitted_model, handle)