In [1]:
import pickle as pk
import random
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the title metadata (decoded as latin-1) and the tab-separated ratings.
# NOTE(review): some titles printed further down look mis-decoded, so the
# latin-1 choice may not match the file's real encoding -- TODO confirm.
movies = pd.read_csv('movies.csv', encoding='latin-1')
ratings = pd.read_table('ratings.tsv')

In [3]:
# Discard the two columns that are not used anywhere in the rest of the notebook.
movies = movies.drop(columns=['originalTitle', 'endYear'])

In [4]:
# Keep the remaining column labels so the following cells can iterate over them.
cols = movies.columns

In [5]:
# Show which columns survived the drop above.
print(cols)

Index(['tconst', 'titleType', 'primaryTitle', 'isAdult', 'startYear',
       'runtimeMinutes', 'genres'],
      dtype='object')


In [6]:
# Print the distinct values of every column to spot placeholders such as '\N'.
for column_name in cols:
    distinct_values = movies[column_name].unique()
    print(distinct_values)

['tt0000502' 'tt0000574' 'tt0000591' ... 'tt9916706' 'tt9916730'
 'tt9916754']
['movie']
['Bohemios' 'The Story of the Kelly Gang' 'The Prodigal Son' ...
 'Dankyavar Danka' '6 Gunn' 'Chico Albuquerque - Revelações']
[0 1]
['1905' '1906' '1907' '1908' '1909' '1910' '1912' '1911' '1913' '1915'
 '1914' '1919' '1916' '1917' '1936' '1925' '1918' '1920' '1922' '1921'
 '1924' '1923' '1928' '2019' '2021' '1926' '1927' '1929' '2000' '1993'
 '1935' '1930' '1942' '1932' '1931' '1934' '1939' '1937' '1933' '1950'
 '1938' '1951' '1946' '1996' '1940' '1944' '1947' '1941' '1952' '1970'
 '1957' '1943' '1948' '1945' '2001' '1949' '1953' '1954' '1965' '1983'
 '1980' '1973' '1961' '1955' '1962' '1958' '1956' '1977' '1964' '1960'
 '1959' '1967' '1968' '1963' '1971' '1969' '1972' '1966' '1976' '1990'
 '1979' '1981' '2020' '1975' '1978' '1989' '1974' '1986' '1995' '1987'
 '1985' '2018' '1984' '1982' '1988' '1991' '\\N' '1994' '1992' '2005'
 '2004' '1998' '2016' '2002' '2017' '1997' '1999' '2006' '2008' '2009

In [7]:
# Display the ratings table (pandas truncates the middle rows).
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1888
1,tt0000002,5.9,250
2,tt0000003,6.5,1677
3,tt0000004,5.8,163
4,tt0000005,6.2,2498
...,...,...,...
1253545,tt9916690,6.5,6
1253546,tt9916720,5.3,223
1253547,tt9916730,8.4,6
1253548,tt9916766,6.7,20


In [6]:
# Turn the '\N' placeholder in `genres` into the explicit label 'unknown'.
movies = movies.assign(genres=movies['genres'].replace({'\\N': 'unknown'}))

In [7]:
# Remove every row that has a missing value in any column.
movies = movies.dropna(axis=0, how='any')

In [8]:
# Build {genre label -> list of row positions} from the comma-separated
# `genres` column. Positions come from enumerate(), i.e. they count rows from
# zero rather than using the frame's index labels.
all_genre = {}
for row_pos, genre_list in enumerate(movies['genres'].str.split(',')):
    # A row that carries the 'unknown' placeholder is filed under that label only.
    labels = ['unknown'] if 'unknown' in genre_list else genre_list
    for label in labels:
        all_genre.setdefault(label, []).append(row_pos)

In [9]:
# Work on a copy with a fresh 0..n-1 index so that the row positions stored in
# `all_genre` can be used directly as .loc labels. reset_index() keeps the old
# index as an `index` column.
movies_oh = movies.copy().reset_index()

# One indicator column per genre: start at 0, then flag the member rows with 1.
for genre, row_positions in all_genre.items():
    movies_oh[genre] = 0
    movies_oh.loc[row_positions, genre] = 1

In [10]:
# Remove the `index` column that reset_index() added.
movies_oh = movies_oh.drop(columns=['index'])

In [11]:
# Replace the '\N' placeholder in `runtimeMinutes` with 0.
movies_oh['runtimeMinutes'] = movies_oh['runtimeMinutes'].replace({'\\N': 0})

In [12]:
# Inspect dtypes and non-null counts after adding the genre indicator columns.
movies_oh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613001 entries, 0 to 613000
Data columns (total 36 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          613001 non-null  object 
 1   titleType       613001 non-null  object 
 2   primaryTitle    613001 non-null  object 
 3   isAdult         613001 non-null  float64
 4   startYear       613001 non-null  object 
 5   runtimeMinutes  613001 non-null  object 
 6   genres          613001 non-null  object 
 7   unknown         613001 non-null  int64  
 8   Action          613001 non-null  int64  
 9   Adventure       613001 non-null  int64  
 10  Biography       613001 non-null  int64  
 11  Drama           613001 non-null  int64  
 12  Fantasy         613001 non-null  int64  
 13  Comedy          613001 non-null  int64  
 14  War             613001 non-null  int64  
 15  Documentary     613001 non-null  int64  
 16  Crime           613001 non-null  int64  
 17  Romance   

In [13]:
# Cast `runtimeMinutes` from object to integer now that '\N' has been replaced.
movies_oh['runtimeMinutes'] = movies_oh['runtimeMinutes'].astype(int)

In [14]:
# Give every distinct `startYear` value an integer code in order of first
# appearance (so the codes are not chronological, and '\N' gets a code too).
unique_years = movies_oh['startYear'].unique()
year_dict = dict(zip(unique_years, range(len(unique_years))))

In [15]:
# Swap each `startYear` value for its integer code from `year_dict`.
movies_oh = movies_oh.assign(startYear=movies_oh['startYear'].map(year_dict))

In [33]:
# Re-check dtypes after the runtime cast and the year encoding.
# NOTE(review): the recorded output below still lists an `index` column, which
# suggests this cell was last run out of order -- re-run from a fresh kernel.
movies_oh.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613001 entries, 0 to 613000
Data columns (total 37 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   index           613001 non-null  int64  
 1   tconst          613001 non-null  object 
 2   titleType       613001 non-null  object 
 3   primaryTitle    613001 non-null  object 
 4   isAdult         613001 non-null  float64
 5   startYear       613001 non-null  int64  
 6   runtimeMinutes  613001 non-null  int32  
 7   genres          613001 non-null  object 
 8   unknown         613001 non-null  int64  
 9   Action          613001 non-null  int64  
 10  Adventure       613001 non-null  int64  
 11  Biography       613001 non-null  int64  
 12  Drama           613001 non-null  int64  
 13  Fantasy         613001 non-null  int64  
 14  Comedy          613001 non-null  int64  
 15  War             613001 non-null  int64  
 16  Documentary     613001 non-null  int64  
 17  Crime     

In [16]:
# Feature matrix: every non-object column of the one-hot frame.
train = movies_oh.select_dtypes(exclude=['object'])

In [16]:
# Fit a brute-force, cosine-distance neighbour index on the numeric features;
# queries return 100 neighbours by default. NearestNeighbors is imported in the
# notebook's import cell rather than here.
# NOTE(review): the features are not scaled, so large-valued columns such as
# runtimeMinutes probably dominate the cosine distance -- TODO confirm intent.
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=100, n_jobs=-1)
knn.fit(train)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=100)

In [38]:
# Query the neighbours of row 424536 (shown as 'Avengers: Age of Ultron' in the
# output of the next cell). The row is passed as a one-row DataFrame, not a bare
# NumPy array, so it carries the same column names the model was fitted with.
dist, n = knn.kneighbors(train.loc[[424536]])

In [39]:
# Look up the titles of the returned neighbour rows.
movies_oh.loc[n[0], 'primaryTitle']

424536                         Avengers: Age of Ultron
217368                                       Star Trek
186059                                        Iron Man
525723                                        Moonfall
224536                          The Amazing Spider-Man
                              ...                     
291390               RB Leipzig vs Paris Saint-Germain
280381                                    Tiger Killer
271032    Tottenham Hotspur vs Wolverhampton Wanderers
235737                                        Immortal
302623                   Leicester City vs Aston Villa
Name: primaryTitle, Length: 100, dtype: object

In [46]:
# Same lookup as a plain array. This reads from `movies_oh` because `titles`
# is only created in a later cell, so using it here fails on a top-to-bottom run.
movies_oh.loc[n[0], 'primaryTitle'].values

array(['Avengers: Age of Ultron', 'Star Trek', 'Iron Man', 'Moonfall',
       'The Amazing Spider-Man', 'War of the Worlds',
       'Star Trek VI: The Undiscovered Country', 'Spider-Man',
       'Independence Day: Resurgence', 'The Predator', 'Rampage',
       'Detras del paraÃ\xadso', 'The Terminator',
       'Alerta, alta tensiÃ³n', 'Nuk Rob Dum',
       'No Retreat, No Surrender 2', 'The Matrix Revolutions',
       '37 Plots of Kung Fu', 'Cheng fa', "Let's Get Harry", 'Sky Riders',
       'Hong fen dong jiang hu', 'Hanuman and the 5 Kamen Riders',
       'The Mighty One', 'To Subdue the Evil', 'Precognition',
       'Revenge of the Iron-Fist Maiden', 'Prelude to Infusco', 'Jil',
       'Return to Role Playing', 'Most Welcome', 'Never Quite the Same',
       'Return Of Gadar - Ek Desh Premi',
       'Hari ng selda: Anak ni Baby Ama 2', 'Uchathula Shiva',
       'Killer Punjabi', 'Shrestha Bangali', 'Soy como el Tigre',
       'Chung kuo ren', 'Conquest of the Planet of the Apes', 'Br

In [24]:
# Titles whose name contains the substring 'Spider'.
spider_mask = movies_oh['primaryTitle'].str.contains('Spider')
a = movies_oh.loc[spider_mask, 'primaryTitle']

In [25]:
# Wrap the selection in a DataFrame so it can be filtered by column name below.
a = pd.DataFrame(a)

In [31]:
# Show the 'Spider' matches.
a['primaryTitle']

1802                              The Gilded Spider
2220                         The Spider and the Fly
2221                                     The Spider
4683        The Spiders - Episode 1: The Golden Sea
5469      The Spiders - Episode 2: The Diamond Ship
                            ...                    
605715                           Spider and the Fly
607533                                       Spider
609096                                Paper Spiders
610248                       Spider-Man 4: Fan Film
610574                                   Spiderhead
Name: primaryTitle, Length: 212, dtype: object

In [27]:
# Narrow the 'Spider' matches to those that also contain 'Man'.
a.loc[a['primaryTitle'].str.contains('Man'), 'primaryTitle']

59

In [19]:
# One-column lookup frame of titles, indexed like `movies_oh`.
titles = movies_oh['primaryTitle'].to_frame()
# Index label of the first row titled exactly 'The Pelican'.
titles.index[titles['primaryTitle'] == 'The Pelican'][0]

51405

In [25]:
# Attach rating columns to every movie; a left join keeps movies with no rating.
movies_oh_rating = movies_oh.merge(ratings, how='left', on='tconst')

In [26]:
# Movies with no rating get 0 in the rating columns. The frame is rebound
# instead of mutated in place, matching how `rating_only` is filled later.
movies_oh_rating = movies_oh_rating.fillna(0)

In [27]:
# Feature matrix for the second model: every non-object column, ratings included.
train_rating = movies_oh_rating.select_dtypes(exclude=['object'])

In [51]:
# Display the feature matrix (pandas truncates the middle rows and columns).
train_rating

Unnamed: 0,isAdult,startYear,runtimeMinutes,unknown,Action,Adventure,Biography,Drama,Fantasy,Comedy,...,Musical,Film-Noir,News,Short,Adult,Reality-TV,Talk-Show,Game-Show,averageRating,numVotes
0,0.0,0,100,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4.5,14.0
1,0.0,1,70,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,6.0,772.0
2,0.0,2,90,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,4.5,18.0
3,0.0,2,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,4.5,23.0
4,0.0,3,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,3.9,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612996,0.0,121,57,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
612997,0.0,111,100,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
612998,0.0,117,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0.0,0.0
612999,0.0,104,116,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,8.4,6.0


In [28]:
# Second neighbour index: same brute-force cosine setup, fitted on the
# features that include the rating columns.
knn2 = NearestNeighbors(
    n_neighbors=100,
    algorithm='brute',
    metric='cosine',
    n_jobs=-1,
)
knn2.fit(train_rating)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=100)

In [41]:
# Query the rating-aware model for row 424536. The row is passed as a one-row
# DataFrame, not a bare NumPy array, so it carries the same column names the
# model was fitted with.
dist2, n2 = knn2.kneighbors(train_rating.loc[[424536]])

In [47]:
# Titles of the neighbours returned by the rating-aware model, as an array.
movies_oh_rating.loc[n2[0], 'primaryTitle'].to_numpy()

array(['Avengers: Age of Ultron', 'The Martian', 'Doctor Strange',
       'Arrival', 'The Incredibles', 'City of God',
       'Captain America: The Winter Soldier',
       'Star Wars: Episode I - The Phantom Menace', 'Life Is Beautiful',
       'Iron Man 3', 'Iron Man 2', 'Monty Python and the Holy Grail',
       'Suicide Squad', 'Sin City', 'Thor: Ragnarok',
       'Harry Potter and the Deathly Hallows: Part 2', 'Gran Torino',
       'Gone Girl', 'Ratatouille', 'Captain America: The First Avenger',
       'District 9', 'Home Alone', 'Logan', 'The Hunger Games',
       'Kill Bill: Vol. 2', 'Dunkirk',
       'Star Wars: Episode VII - The Force Awakens',
       'Star Wars: Episode III - Revenge of the Sith',
       'The Social Network', "Pan's Labyrinth", 'Silver Linings Playbook',
       'Spider-Man', 'Man of Steel', 'Full Metal Jacket', 'The Pianist',
       'Blade Runner', "The King's Speech", 'Black Panther', 'Black Swan',
       '300', 'Million Dollar Baby', 'The Big Lebowski',
    

In [32]:
# Third feature set: only the id, title, adult flag and year, joined to ratings.
base_columns = ['tconst', 'primaryTitle', 'isAdult', 'startYear']
rating_only = movies_oh[base_columns].merge(ratings, how='left', on='tconst')

In [33]:
# Movies with no rating get 0; the model then uses only the non-object columns.
rating_only = rating_only.fillna(value=0)
train_rating_only = rating_only.select_dtypes(exclude=['object'])

In [None]:
# Display the reduced feature matrix.
train_rating_only

In [34]:
# Third neighbour index: same brute-force cosine setup, fitted on the reduced
# feature set. NearestNeighbors is already imported in the notebook's import
# cell, so the duplicate import that used to sit here is gone.
knn3 = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=100, n_jobs=-1)
knn3.fit(train_rating_only)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=100)

In [43]:
# Query the reduced-feature model for row 424536. The row is passed as a
# one-row DataFrame, not a bare NumPy array, so it carries the same column
# names the model was fitted with.
dist3, n3 = knn3.kneighbors(train_rating_only.loc[[424536]])

In [48]:
# Titles of the neighbours returned by the reduced-feature model, as an array.
# The lookup goes through `movies_oh_rating`, not `rating_only`.
movies_oh_rating.loc[n3[0], 'primaryTitle'].to_numpy()

array(['Avengers: Age of Ultron', 'Thor', 'Gravity',
       'The Hobbit: An Unexpected Journey',
       'Captain America: The First Avenger', 'The Martian',
       'Doctor Strange', 'Iron Man 2', 'The Hangover', 'Iron Man 3',
       'The Intouchables', 'Gran Torino', 'Logan',
       'Batman v Superman: Dawn of Justice', 'Thor: Ragnarok',
       'Star Wars: Episode II - Attack of the Clones', '12 Monkeys',
       'Million Dollar Baby', 'The Incredibles', 'I Am Legend',
       'Home Alone', 'Harry Potter and the Deathly Hallows: Part 2',
       'Jaws', 'Toy Story 3', 'City of God', 'Black Swan',
       'Suicide Squad', 'Heat', 'Arrival',
       "Pirates of the Caribbean: Dead Man's Chest", 'Rocky', 'Radhe',
       'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb',
       'The Fast and the Furious', 'Kill Bill: Vol. 2',
       'Star Wars: Episode I - The Phantom Menace', 'Man of Steel',
       'Captain America: Civil War',
       'Captain America: The Winter Soldier',

In [55]:
# Small scratch mapping used by the next cell (note: rebinds the name `a`).
a = dict(zip((1, 2, 7), (5, 6, 8)))

In [56]:
# Smallest value stored in the scratch dict. The previous `a[] = ...` form
# was a syntax error.
min(a.values())

5

In [49]:
# Persist the two rating-based models and the title lookup frame.
# Note the file names lag the variable names by one: 'knn.pickle' stores
# `knn2` and 'knn2.pickle' stores `knn3`.
for pickle_path, payload in (
    ('knn.pickle', knn2),
    ('knn2.pickle', knn3),
    ('titles.pickle', titles),
):
    with open(pickle_path, 'wb') as f:
        pk.dump(payload, f)

In [53]:

# Persist both feature matrices, one pickle file each.
for pickle_path, payload in (
    ('train_rating.pickle', train_rating),
    ('train_rating_only.pickle', train_rating_only),
):
    with open(pickle_path, 'wb') as f:
        pk.dump(payload, f)