## **Movie Recommendation System**

In [3]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import random
import os


def seed_everything(seed=1234):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [140]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
links = pd.read_csv('ml-latest-small/links.csv')

In [6]:
ratings['ones'] = 1
history = ratings.pivot_table(index='userId', columns='movieId', values='ones', fill_value=0)

In [7]:
history.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
data = ratings.merge(movies, how='left', on=['movieId'])

In [9]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,ones,title,genres,imdbId
0,1,1,4.0,964982703,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709
1,1,3,4.0,964981247,1,Grumpier Old Men (1995),Comedy|Romance,113228
2,1,6,4.0,964982224,1,Heat (1995),Action|Crime|Thriller,113277
3,1,47,5.0,964983815,1,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,114369
4,1,50,5.0,964982931,1,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,114814


### **Movies Rated By User (Watching List)**
- **Show for the user his watching list order by data, so last watched movie will appear in the start of the list**

In [124]:
def user_rated_movies(user_id):
    items_our_user_rated = sorted(data[data.userId==user_id][['movieId', 'title', 'genres','rating']].values, key=lambda x: x[3])
    return items_our_user_rated

In [125]:
user_rated_movies(9)[:5]

[array([5902, 'Adaptation (2002)', 'Comedy|Drama|Romance', 5.0],
       dtype=object),
 array([3173, 'Any Given Sunday (1999)', 'Drama', 2.0], dtype=object),
 array([5481, 'Austin Powers in Goldmember (2002)', 'Comedy', 5.0],
       dtype=object),
 array([1270, 'Back to the Future (1985)', 'Adventure|Comedy|Sci-Fi', 5.0],
       dtype=object),
 array([2011, 'Back to the Future Part II (1989)',
        'Adventure|Comedy|Sci-Fi', 3.0], dtype=object)]

### **Recommended Movies (Recommendation List)**
- **We used DeepFM model from  DeepCTR-Torch to predict the rating that users will give for movie the didn't watch or rate, and accourding to this rating we will know recommend that movie with high prediction rating for user in the beginning of the recommendation list**

##### **References**
- https://www.kaggle.com/code/leejunseok97/deepfm-movie-len-pytorch 
- https://deepctr-torch.readthedocs.io/en/latest/deepctr_torch.models.deepfm.html 

In [13]:
def not_watched(user_id):
    items_our_user_can_rate = data[~data.userId.isin(user_rated_movies(user_id))].movieId.values
    return items_our_user_can_rate

In [155]:
import pickle

saved_model = "model.pkl"
with open(saved_model, 'rb') as file:
    DeepFM_model = pickle.load(file)

def split(x):
    key2index= {}
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            key2index[key] = len(key2index) + 1
    return list(map(lambda x: key2index[x], key_ans))


def user_recommends(user_id):
    obs = {}
    
    not_watched_list = data[~data.userId.isin(user_rated_movies(user_id))][data.userId == user_id].values
    for movie_info in not_watched_list:

        obs['userId'] = pd.Series(0, LabelEncoder().fit_transform(np.array([user_id])))
        obs['movieId'] = pd.Series(0, LabelEncoder().fit_transform(np.array([movie_info[1]])))
        obs['genres'] = np.array(split(movie_info[6])).reshape(1, -1)
        data.loc[(data.userId == user_id) & (data.movieId == movie_info[1]), 'prediction'] = DeepFM_model.predict(obs)[0][0]
    
    return sorted(data[data.userId == user_id].values, key=lambda x: x[-1], reverse=True)

---------------------------------------------------------------------

### **Movie Info**
- **When the movie is selected, It's inforamtion will be diplayed (Image, )**

In [22]:
def movie_info(movie_id):
    try:
        return movies[movies.movieId == movie_id].values[0]
    except:
        return None

In [23]:
movie_info(10)

array([10, 'GoldenEye (1995)', 'Action|Adventure|Thriller'], dtype=object)

------------------------------------

### **Most Similar Movies**
- **When movie selected, We display the most similar movies to the selected movie**
- **We used Adjusted Cosine Similarity To calculate the most similar movies to the selected movies**

In [24]:
def adjusted_cosine_sim(vec_a, vec_b):

    a_avg = np.average(vec_a)
    b_avg = np.average(vec_b)
    
    sim_score = np.dot(vec_a - a_avg, vec_b - b_avg) / (np.linalg.norm(vec_a - a_avg) * np.linalg.norm(vec_b - b_avg))

    return sim_score

In [43]:
def most_sim(movie_id):
    sim_movies = []
    all_info = []
    utility_matrix = history
    for j in data.movieId.unique():
        sim_movies.append((j, adjusted_cosine_sim(utility_matrix.loc[:, movie_id], utility_matrix.loc[:, j])))
    
    sim_movies = sorted(sim_movies, key=lambda x: x[1], reverse=True)
    
    for mov in sim_movies:
        ret = movies[movies.movieId == mov[0]]
        if ret is None:
            all_info.append(mov)
        else:
            all_info.append(ret)
    return all_info

In [44]:
most_sim_movies = most_sim(1)[:20]

In [None]:
most_sim_movies[1].values[0]

array([3114, 'Toy Story 2 (1999)',
       'Adventure|Animation|Children|Comedy|Fantasy'], dtype=object)

-------------------------

### **Images Scraping**
- **We used selenium to scrap the movies' images**

In [56]:
from selenium import webdriver
import pandas as pd

In [52]:
links[links.movieId == 1]

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0


In [None]:
from selenium import webdriver
import pandas as pd
import urllib
driver = webdriver.Chrome()
for imdb_id, movie_id in zip(links.imdbId.unique()[3474:], links.movieId.unique()[3474:]):
    i = 7 - len(str(imdb_id))
    imdb = "0" * i + str(imdb_id)
    driver.get(f"https://www.imdb.com/title/tt{imdb}/")
    url = driver.find_element("xpath", "//img[@class='ipc-image']")
    url = url.get_attribute('src')
    urllib.request.urlretrieve(str(url),f"./Images/{movie_id}.jpg")
driver.quit()

In [163]:
def get_image(movie_id):
    try:
        return f"Images/{movie_id}.jpg"
    except:
        return "img.jpg"

In [164]:
get_image(4736)

'Images/4736.jpg'