# movrec
movie recommender system

1. read and process movie_titles.csv

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the raw title lines first, then parse them once the file is closed.
# NOTE(review): no encoding is passed to open(), so decoding relies on the
# platform default -- confirm that matches how movie_titles.csv is encoded.
with open('movies/movie_titles.csv') as f:
    raw_lines = f.readlines()

# Split on the first two commas only, so commas inside a movie name stay
# part of the third field.
movies = [line.replace('\n', '').split(',', 2) for line in raw_lines]

# Keep only the id and the name; the year column is not used afterwards.
df_movies = pd.DataFrame(movies, columns=['movie_id', 'year', 'name']).drop(columns=['year'])
df_movies

Unnamed: 0,movie_id,name
0,1,Dinosaur Planet
1,2,Isle of Man TT 2004 Review
2,3,Character
3,4,Paula Abdul's Get Up & Dance
4,5,The Rise and Fall of ECW
...,...,...
17765,17766,Where the Wild Things Are and Other Maurice Se...
17766,17767,Fidel Castro: American Experience
17767,17768,Epoch
17768,17769,The Company


In [4]:
# Count the distinct movie ids (missing values, if any, are not counted).
num_movies = df_movies['movie_id'].nunique(dropna=True)
print('number of movies in the dataset:', num_movies)

number of movies in the dataset: 17770


2. read and process training sets

In [5]:
# First movie id expected in each combined_data_<n>.txt file. Kept for
# reference and backward compatibility: read_training_set reads the ids from
# the "<movie_id>:" header lines of the file itself and does not rely on it.
first_id = [1, 4500, 9211, 13368]


def read_training_set(num_file):
    """Load ``movies/combined_data_<num_file>.txt`` as a flat ratings table.

    The file is expected to contain header lines of the form ``<movie_id>:``,
    each followed by ``user_id,rating,time_stamp`` lines for that movie.
    Every rating row is labelled with the id parsed from the closest header
    line above it.

    Parameters
    ----------
    num_file : int
        Number used to build the file name ``combined_data_<num_file>.txt``.

    Returns
    -------
    pandas.DataFrame
        Columns ``user_id`` (string dtype), ``rating`` and ``movie_id`` (int).
        Header lines are removed; the remaining rows keep their original line
        positions as index. Rows that appear before the first header line get
        ``movie_id`` 0.

    Raises
    ------
    ValueError
        If a line whose first field contains ``:`` is not ``<integer>:``.
    """
    file_path = 'movies/combined_data_' + str(num_file) + '.txt'
    df = pd.read_csv(
        file_path,
        names=['user_id', 'rating', 'time_stamp'],
        usecols=['user_id', 'rating'],
        # The first column mixes "123:" headers with numeric user ids, so read
        # it as text directly instead of relying on type inference.
        dtype={'user_id': 'string'},
    )

    # Header lines are the ones whose first field carries a colon.
    with_ids = df['user_id'].str.contains(':')
    header_rows = df[with_ids]
    header_pos = header_rows.index.to_numpy()
    header_ids = np.array(
        [int(label.rstrip(':')) for label in header_rows['user_id']],
        dtype=int,
    )

    movie_ids = np.zeros(len(df), dtype=int)
    if len(header_pos) > 0:
        # Each header owns every line up to the next header (or end of file);
        # repeat its id over that whole run in one vectorised step.
        run_ends = np.append(header_pos[1:], len(df))
        movie_ids[header_pos[0]:] = np.repeat(header_ids, run_ends - header_pos)

    df['movie_id'] = movie_ids
    # Drop the header lines themselves; only rating rows are returned.
    return df[~with_ids]

In [6]:
# Ratings parsed from combined_data_1.txt.
train1 = read_training_set(num_file=1)
train1

Unnamed: 0,user_id,rating,movie_id
1,1488844,3.0,1
2,822109,5.0,1
3,885013,4.0,1
4,30878,4.0,1
5,823519,3.0,1
...,...,...,...
24058258,2591364,2.0,4499
24058259,1791000,2.0,4499
24058260,512536,5.0,4499
24058261,988963,3.0,4499


In [7]:
# Ratings parsed from combined_data_2.txt.
train2 = read_training_set(num_file=2)
train2

Unnamed: 0,user_id,rating,movie_id
1,2532865,4.0,4500
2,573364,3.0,4500
3,1696725,3.0,4500
4,1253431,3.0,4500
5,1265574,2.0,4500
...,...,...,...
26982297,2420260,1.0,9210
26982298,761176,3.0,9210
26982299,459277,3.0,9210
26982300,2407365,4.0,9210


In [8]:
# Ratings parsed from combined_data_3.txt.
train3 = read_training_set(num_file=3)
train3

Unnamed: 0,user_id,rating,movie_id
1,1277134,1.0,9211
2,2435457,2.0,9211
3,2338545,3.0,9211
4,2218269,1.0,9211
5,441153,4.0,9211
...,...,...,...
22605781,2339129,4.0,13367
22605782,59005,4.0,13367
22605783,1789683,5.0,13367
22605784,1878798,1.0,13367


In [9]:
# Ratings parsed from combined_data_4.txt.
train4 = read_training_set(num_file=4)
train4

Unnamed: 0,user_id,rating,movie_id
1,2385003,4.0,13368
2,659432,3.0,13368
3,751812,2.0,13368
4,2625420,2.0,13368
5,1650301,1.0,13368
...,...,...,...
26851921,1790158,4.0,17770
26851922,1608708,3.0,17770
26851923,234275,1.0,17770
26851924,255278,4.0,17770


In [10]:
# Stack the four partial tables and renumber the rows 0..n-1 in one step.
train = pd.concat([train1, train2, train3, train4], ignore_index=True)
train

Unnamed: 0,user_id,rating,movie_id
0,1488844,3.0,1
1,822109,5.0,1
2,885013,4.0,1
3,30878,4.0,1
4,823519,3.0,1
...,...,...,...
100480502,1790158,4.0,17770
100480503,1608708,3.0,17770
100480504,234275,1.0,17770
100480505,255278,4.0,17770


In [10]:
# Optional shuffle of the training rows, currently disabled by wrapping the
# code in a string literal; the cell therefore only evaluates to that string.
'''from sklearn.utils import shuffle
train = shuffle(train).reset_index(drop=True)
train'''

'from sklearn.utils import shuffle\ntrain = shuffle(train).reset_index(drop=True)\ntrain'

In [11]:
# Count the distinct user ids (missing values, if any, are not counted).
num_users = train['user_id'].nunique(dropna=True)
print('number of users:', num_users)

number of users: 480189


In [12]:
# User ids are stored as text; convert them to integers, then shift by one
# to obtain zero-based positions.
user_ids = train['user_id'].astype(int).to_numpy()
user_indices = user_ids - 1

In [13]:
# Movie ids as an integer array, shifted by one to obtain zero-based positions.
movie_ids = train['movie_id'].to_numpy().astype(int)
movie_indices = movie_ids - 1

3. apply recommender system algorithm   
  3.1. build user-interaction matrix (U)  
  3.2. k-nearest neighbors of U

In [14]:
from scipy.sparse import csr_matrix

# Sparse ratings matrix: one row per movie index, one column per user index,
# with the rating as the stored value.
# NOTE(review): float16 is presumably chosen to save memory -- confirm that
# scipy.sparse and the downstream estimator handle this dtype as intended.
data = train['rating'].to_numpy()
rows = movie_indices
cols = user_indices
U = csr_matrix((data, (rows, cols)), dtype=np.float16)

In [15]:
from sklearn.neighbors import NearestNeighbors

# Brute-force cosine-distance neighbour search over the rows of U.
num_neighbors = 5
knn = NearestNeighbors(
    n_neighbors=num_neighbors,
    metric='cosine',
    algorithm='brute',
)
knn.fit(U)

knn model prediction

In [29]:
query_index = 15000

# Ask for one more neighbour than configured: the printing cell that follows
# treats the first hit as the query row itself.
neighbor_count = num_neighbors + 1
query_row = U[query_index, :]
distances, indices = knn.kneighbors(query_row, n_neighbors=neighbor_count)

# kneighbors returns one row per query; collapse both results to 1-D copies.
distances, indices = distances.flatten(), indices.flatten()

(array([4.32986980e-15, 2.88156149e-01, 5.79782569e-01, 5.95385087e-01,
        6.00363819e-01, 6.04678002e-01]),
 array([15000,  6124,  8694,  4747,   429, 16113], dtype=int64))

In [30]:
# Position 0 is reported as the movie being queried; the remaining positions
# are listed as ranked recommendations together with their distance.
movie_names = df_movies['name']
for rank, (movie_idx, dist) in enumerate(zip(indices, distances)):
    title = movie_names.iloc[movie_idx]
    if rank == 0:
        print("Recommendation for {0}:\n".format(title))
    else:
        print("{0}: {1}, with distance of {2}:".format(rank, title, dist))

Recommendation for Ai Yori Aoshi:

1: Ai Yori Aoshi: Enishi, with distance of 0.2881561490196226:
2: Mahoromatic: Automatic Maiden, with distance of 0.579782568862439:
3: Mahoromatic 2: Something More Beautiful, with distance of 0.5953850870755895:
4: Chobits, with distance of 0.6003638186433184:
5: Love Hina Again: The Movie, with distance of 0.6046780020525651:
