# Recommender systems
- The objective is to recommend top 5 similar moves to the one we will consider

In [1]:
# Importing the basic libraries
"""
Cut-off year: 2020
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Importing the data

data = pd.read_csv('imdb_top_1000.csv')

In [3]:
data.head(3)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444


In [4]:
data['Series_Title'] = data['Series_Title'].str.lower()

In [5]:
# Basic inspection of the data

data.head(3)

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,the shawshank redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,the godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,the dark knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444


In [6]:
100*data.isnull().sum()/data.shape[0]

Poster_Link       0.0
Series_Title      0.0
Released_Year     0.0
Certificate      10.1
Runtime           0.0
Genre             0.0
IMDB_Rating       0.0
Overview          0.0
Meta_score       15.7
Director          0.0
Star1             0.0
Star2             0.0
Star3             0.0
Star4             0.0
No_of_Votes       0.0
Gross            16.9
dtype: float64

In [7]:
# Selecting the most relevant fields based on my understaning

cols = ['Series_Title', 'Genre', 'IMDB_Rating', 'Overview']
filtered_data = data[cols]

In [9]:
filtered_data.head()

Unnamed: 0,Series_Title,Genre,IMDB_Rating,Overview
0,the shawshank redemption,Drama,9.3,Two imprisoned men bond over a number of years...
1,the godfather,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...
2,the dark knight,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...
3,the godfather: part ii,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...
4,12 angry men,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...


In [10]:
# Genre

vectorize = CountVectorizer()
genre = vectorize.fit_transform(filtered_data['Genre'])
genre = pd.DataFrame(genre.toarray(), columns=vectorize.get_feature_names())

In [11]:
genre

Unnamed: 0,action,adventure,animation,biography,comedy,crime,drama,family,fantasy,fi,...,music,musical,mystery,noir,romance,sci,sport,thriller,war,western
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,1,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
996,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
997,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
998,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [12]:
# Overview

text_bert = SentenceTransformer('distilbert-base-nli-mean-tokens')
embeddings = text_bert.encode(filtered_data['Overview'], show_progress_bar=True)

2023-03-04 13:47:16.165 INFO    sentence_transformers.SentenceTransformer: Load pretrained SentenceTransformer: distilbert-base-nli-mean-tokens
2023-03-04 13:47:18.262 INFO    sentence_transformers.SentenceTransformer: Use pytorch device: cpu


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

In [13]:
# Final data

final_data = pd.concat([filtered_data['Series_Title'], pd.DataFrame(embeddings), genre], axis = 1)

In [14]:
final_data.head()

Unnamed: 0,Series_Title,0,1,2,3,4,5,6,7,8,...,music,musical,mystery,noir,romance,sci,sport,thriller,war,western
0,the shawshank redemption,-0.082307,-0.454634,1.367879,-0.779558,0.174771,-0.604249,0.23113,-0.606564,0.010987,...,0,0,0,0,0,0,0,0,0,0
1,the godfather,-1.020697,-1.051921,0.163983,-1.064686,0.254996,-0.841295,0.063894,-0.731421,0.441186,...,0,0,0,0,0,0,0,0,0,0
2,the dark knight,-0.909217,-0.519028,0.496283,-0.862778,-0.879727,-0.459598,-0.079323,-0.656419,0.608221,...,0,0,0,0,0,0,0,0,0,0
3,the godfather: part ii,-0.553091,-0.606065,-0.263538,-2.065413,0.10661,-0.200945,0.303857,-0.427347,-0.092097,...,0,0,0,0,0,0,0,0,0,0
4,12 angry men,-0.21158,-0.223869,0.814861,-0.198401,-0.739711,-0.256405,0.210274,-0.757677,0.993326,...,0,0,0,0,0,0,0,0,0,0


In [15]:
sim = cosine_similarity(final_data.iloc[:, 1:])

In [16]:
sim

array([[1.        , 0.55099273, 0.35107367, ..., 0.43602519, 0.38917322,
        0.39115755],
       [0.55099273, 1.        , 0.50341246, ..., 0.58922288, 0.46418841,
        0.66466949],
       [0.35107367, 0.50341246, 1.        , ..., 0.49103009, 0.42506804,
        0.69154531],
       ...,
       [0.43602519, 0.58922288, 0.49103009, ..., 1.        , 0.62514104,
        0.56816541],
       [0.38917322, 0.46418841, 0.42506804, ..., 0.62514104, 1.        ,
        0.48206382],
       [0.39115755, 0.66466949, 0.69154531, ..., 0.56816541, 0.48206382,
        1.        ]])

In [17]:
sim = pd.DataFrame(sim, columns = list(final_data['Series_Title']))

In [18]:
sim

Unnamed: 0,the shawshank redemption,the godfather,the dark knight,the godfather: part ii,12 angry men,the lord of the rings: the return of the king,pulp fiction,schindler's list,inception,fight club,...,giù la testa,kelly's heroes,the jungle book,blowup,a hard day's night,breakfast at tiffany's,giant,from here to eternity,lifeboat,the 39 steps
0,1.000000,0.550993,0.351074,0.422941,0.464949,0.408991,0.516575,0.354920,0.384113,0.319785,...,0.306995,0.363586,0.413917,0.488935,0.496330,0.373068,0.417010,0.436025,0.389173,0.391158
1,0.550993,1.000000,0.503412,0.787955,0.519569,0.493224,0.593899,0.614816,0.662258,0.523168,...,0.571160,0.522780,0.536339,0.560425,0.604634,0.560936,0.463103,0.589223,0.464188,0.664669
2,0.351074,0.503412,1.000000,0.560756,0.488269,0.562562,0.592834,0.542004,0.671931,0.624719,...,0.658307,0.604470,0.596764,0.517447,0.559856,0.444428,0.290646,0.491030,0.425068,0.691545
3,0.422941,0.787955,0.560756,1.000000,0.411164,0.457117,0.694926,0.601915,0.583501,0.588561,...,0.598265,0.519318,0.499737,0.469995,0.610808,0.596789,0.459332,0.539739,0.425305,0.569712
4,0.464949,0.519569,0.488269,0.411164,1.000000,0.491723,0.459924,0.448772,0.558442,0.466276,...,0.434773,0.440157,0.505383,0.525041,0.516316,0.410094,0.309656,0.528852,0.404672,0.611162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.373068,0.560936,0.444428,0.596789,0.410094,0.248858,0.503081,0.526750,0.516277,0.526606,...,0.402814,0.462259,0.456100,0.566143,0.614558,1.000000,0.291012,0.627293,0.394228,0.548888
996,0.417010,0.463103,0.290646,0.459332,0.309656,0.368946,0.567687,0.396041,0.320285,0.353440,...,0.566120,0.370757,0.370657,0.461072,0.421263,0.291012,1.000000,0.418257,0.359436,0.315029
997,0.436025,0.589223,0.491030,0.539739,0.528852,0.436906,0.595189,0.570849,0.567647,0.541245,...,0.588254,0.517403,0.462313,0.597169,0.673412,0.627293,0.418257,1.000000,0.625141,0.568165
998,0.389173,0.464188,0.425068,0.425305,0.404672,0.477580,0.565273,0.567945,0.516572,0.514620,...,0.552595,0.573768,0.343801,0.528133,0.473091,0.394228,0.359436,0.625141,1.000000,0.482064


In [19]:
sim.index = list(final_data['Series_Title'])

In [20]:
sim.head()

Unnamed: 0,the shawshank redemption,the godfather,the dark knight,the godfather: part ii,12 angry men,the lord of the rings: the return of the king,pulp fiction,schindler's list,inception,fight club,...,giù la testa,kelly's heroes,the jungle book,blowup,a hard day's night,breakfast at tiffany's,giant,from here to eternity,lifeboat,the 39 steps
the shawshank redemption,1.0,0.550993,0.351074,0.422941,0.464949,0.408991,0.516575,0.35492,0.384113,0.319785,...,0.306995,0.363586,0.413917,0.488935,0.49633,0.373068,0.41701,0.436025,0.389173,0.391158
the godfather,0.550993,1.0,0.503412,0.787955,0.519569,0.493224,0.593899,0.614816,0.662258,0.523168,...,0.57116,0.52278,0.536339,0.560425,0.604634,0.560936,0.463103,0.589223,0.464188,0.664669
the dark knight,0.351074,0.503412,1.0,0.560756,0.488269,0.562562,0.592834,0.542004,0.671931,0.624719,...,0.658307,0.60447,0.596764,0.517447,0.559856,0.444428,0.290646,0.49103,0.425068,0.691545
the godfather: part ii,0.422941,0.787955,0.560756,1.0,0.411164,0.457117,0.694926,0.601915,0.583501,0.588561,...,0.598265,0.519318,0.499737,0.469995,0.610808,0.596789,0.459332,0.539739,0.425305,0.569712
12 angry men,0.464949,0.519569,0.488269,0.411164,1.0,0.491723,0.459924,0.448772,0.558442,0.466276,...,0.434773,0.440157,0.505383,0.525041,0.516316,0.410094,0.309656,0.528852,0.404672,0.611162


In [23]:
def recommendations(movie_name):
    """
    This function will provide the 5 movie recommendations given a movie that is watched by the user.
    """
    movie_name = movie_name.lower()
    recommendations = sim.loc[movie_name].sort_values(ascending=False)[1:6]
    recommendations = recommendations.reset_index()
    recommendations.columns = ['Series_Title', 'score']
    recommendations = pd.merge(recommendations, data[['Series_Title', 'Genre', 'IMDB_Rating', 'Director']], on='Series_Title')
    #print(data[data['Series_Title'] == movie_name][['Series_Title', 'Genre', 'IMDB_Rating', 'Director']])
    return recommendations.iloc[:5]

In [24]:
recommendations('The Godfather')

Unnamed: 0,Series_Title,score,Genre,IMDB_Rating,Director
0,haider,0.806302,"Action, Crime, Drama",8.1,Vishal Bhardwaj
1,the pursuit of happyness,0.804626,"Biography, Drama",8.0,Gabriele Muccino
2,the godfather: part iii,0.800571,"Crime, Drama",7.6,Francis Ford Coppola
3,manchester by the sea,0.788229,Drama,7.8,Kenneth Lonergan
4,the godfather: part ii,0.787955,"Crime, Drama",9.0,Francis Ford Coppola


In [23]:
#recommendations('The Lion King')

In [24]:
#recommendations('The Dark Knight')

In [25]:
#recommendations('Avatar')

In [26]:
#recommendations('The Terminator')

In [27]:
#recommendations('The Matrix')


In [28]:
def main():
    st.title("Recommendation System for Movies")
    html_temp = """
    <div style="background-color:tomato;padding:10px">
    <h2 style="color:white;text-align:center;">Recommendation System App</h2>
    </div>
    """

    st.markdown(html_temp, unsafe_allow_html=True)
    text_input = st.text_input("Enter the name of a Movie", "Type here")
    result = ""
    if st.button("Find Movies"):
        result = recommendations(text_input)
    st.success("The Movies are {}".format(list(result)))

In [29]:
if __name__ == '__main__':
    main()

2023-01-31 17:58:25.175 
  command:

    streamlit run c:\Users\mirmm\anaconda3_\lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
