# **Knowledge-Based Recommender System on IMDB movies extensive dataset**

In [None]:
import pandas as pd

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the Colab user, then hand the resulting default credentials to PyDrive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# Module-level Drive client; load_file_from_drive reads it as a global.
drive = GoogleDrive(gauth)

In [None]:
def load_file_from_drive(id, name):
    """Download a Google Drive file into the local working directory.

    Args:
        id: Drive file id handed to ``drive.CreateFile``.
        name: Local filename the file content is written to.
    """
    # `drive` is the module-level GoogleDrive client created after authentication.
    remote_file = drive.CreateFile({'id': id})
    remote_file.GetContentFile(name)

### **Dataset Overview**

Complete information about the dataset is available on [Kaggle](https://www.kaggle.com/stefanoleone992/imdb-extensive-dataset).

The movies dataset includes 85,855 movies with 22 attributes containing movie description, average rating, number of votes, genre, etc.


In [None]:
# Fetch the CSV from Drive, then parse it in a single pass.
load_file_from_drive('1ENzzJX8kE0dkvy92aTHdzSDMn_CIVixA', 'imdb_movies.csv')
# low_memory=False makes pandas infer each column's dtype from the whole file instead of
# chunk by chunk, which avoids the DtypeWarning raised for mixed-type columns
# (e.g. 'year' contains the text 'TV Movie 2019').
df = pd.read_csv('imdb_movies.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,Alexander Black,Alexander Black Photoplays,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0


In [None]:
# List the available columns before choosing which ones to keep.
df.columns

Index(['imdb_title_id', 'title', 'original_title', 'year', 'date_published',
       'genre', 'duration', 'country', 'language', 'director', 'writer',
       'production_company', 'actors', 'description', 'avg_vote', 'votes',
       'budget', 'usa_gross_income', 'worlwide_gross_income', 'metascore',
       'reviews_from_users', 'reviews_from_critics'],
      dtype='object')

Since we don't need all of the columns, we keep only the ones we are going to use.

In [None]:
# Keep only the columns used by the recommenders. The explicit copy() detaches the result
# from the original frame so that later column assignments do not trigger
# SettingWithCopyWarning.
df = df[['title','genre','duration','year','country','language','director','avg_vote','votes','description']].copy()

In [None]:
# Confirm that only the selected columns remain.
df.head()

Unnamed: 0,title,genre,duration,year,country,language,director,avg_vote,votes,description
0,Miss Jerry,Romance,45,1894,USA,,Alexander Black,5.9,154,The adventures of a female reporter in the 1890s.
1,The Story of the Kelly Gang,"Biography, Crime, Drama",70,1906,Australia,,Charles Tait,6.1,589,True story of notorious Australian outlaw Ned ...
2,Den sorte drøm,Drama,53,1911,"Germany, Denmark",,Urban Gad,5.8,188,Two men of high rank are both wooing the beaut...
3,Cleopatra,"Drama, History",100,1912,USA,English,Charles L. Gaskill,5.2,446,The fabled queen of Egypt's affair with Roman ...
4,L'Inferno,"Adventure, Drama, Fantasy",68,1911,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",7.0,2237,Loosely adapted from Dante's Divine Comedy and...


We don't want NaN values in the text columns, so we first check how many values are missing in each column and then replace the missing ones with empty strings.

In [None]:
# Boolean mask of missing cells, then the per-column count of missing values.
df_missing = df.isna()
num_missing = df_missing.sum()
# Fraction of missing values per column (last expression, so it is displayed).
num_missing / len(df)

title          0.000000
genre          0.000000
duration       0.000000
year           0.000000
country        0.000745
language       0.009702
director       0.001013
avg_vote       0.000000
votes          0.000000
description    0.024635
dtype: float64

In [None]:
# Text columns that contain missing entries: store '' instead of NaN.
for text_column in ('language', 'director', 'country', 'description'):
    df[text_column] = df[text_column].fillna('')

Then we need to make sure the type of each column is correct (especially the numeric columns).

In [None]:
# 'year' holds a few textual values such as 'TV Movie 2019'; keep only the 4-digit year
# instead of special-casing one literal.
df['year'] = df['year'].astype(str).str.extract(r'(\d{4})', expand=False).astype(int)

# Columns that parse as numbers become numeric; everything else becomes pandas' string dtype.
# Only conversion failures are caught, so unrelated errors are no longer swallowed.
for column in df.columns:
    try:
        df[column] = pd.to_numeric(df[column])
    except (ValueError, TypeError):
        df[column] = df[column].astype("string")

# Lower-case the genres for case-insensitive matching; .str.lower() keeps the string dtype
# (a Python-level apply would fall back to object).
df['genre'] = df['genre'].str.lower()

In [None]:
# Check the column dtypes after the conversion.
df.dtypes

title           string
genre           object
duration         int64
year             int64
country         string
language        string
director        string
avg_vote       float64
votes            int64
description     string
dtype: object

## **Recommendation based on user preferences via filters**

To make a recommendation, we first filter the movies that satisfy the user's preferences, then score those movies with a metric and output them in decreasing order of score.

The metric is the numeric quantity based on which you rank movies. A movie is considered to be better than another movie if it has a higher metric score than the other movie.

Here we use IMDB's weighted rating formula as our metric, which is as follows:

$(\frac{v}{v+m}\times R)+(\frac{m}{v+m}\times C)$

where 
*    v is the number of votes for the movie
*    m is the minimum number of votes required for the movie to be in the chart (the prerequisite)
*    R is the mean rating of the movie
*    C is the mean rating of all the movies in the dataset


In [None]:
def imdb_weighted_rating(movie, c, m):
    """Return IMDB's weighted rating ``v/(v+m) * R + m/(v+m) * C`` for ``movie``.

    Args:
        movie: Mapping-like record providing ``'votes'`` (v) and ``'avg_vote'`` (R).
        c: Mean rating used as the prior (C).
        m: Minimum number of votes required to be listed (m).

    Returns:
        The weighted rating: the more votes a movie has relative to ``m``,
        the closer the result is to its own average vote.
    """
    vote_count = movie['votes']
    mean_vote = movie['avg_vote']
    total = vote_count + m
    own_part = vote_count / total * mean_vote
    prior_part = m / total * c
    return own_part + prior_part

In [None]:
def get_recemmendations(movies, genre='',min_dur=0,max_dur=0,min_year=0,max_year=0, language='',director='',recomm_count=10, percent=0.8):
    """Filter ``movies`` by the given preferences and rank the rest by IMDB weighted rating.

    Args:
        movies: DataFrame with the columns ``title``, ``genre``, ``duration``, ``year``,
            ``language``, ``director``, ``avg_vote`` and ``votes``.
        genre: Wanted genres, either a comma-separated string or an iterable of names.
            A movie is kept if it has at least one of them. Empty means "any".
        min_dur, max_dur: Inclusive duration bounds; 0 disables the bound.
        min_year, max_year: Inclusive release-year bounds; 0 disables the bound.
        language: A movie is kept if this is one of its languages. '' means "any".
        director: A movie is kept if this is its director or one of its co-directors.
            '' means "any".
        recomm_count: Maximum number of rows returned.
        percent: Quantile of ``votes`` (within the filtered movies) that a movie must
            reach to be ranked.

    Returns:
        A new DataFrame with an added ``rating`` column, sorted by ``rating`` in
        descending order, titles title-cased. It is empty (but still has a ``rating``
        column) when no movie satisfies the filters.

    Genre, language and director are compared case-insensitively.
    """
    def _tokens(cell):
        # Split a comma-separated cell into a set of trimmed, lower-cased entries.
        if not isinstance(cell, str):
            return set()
        return {part.strip().lower() for part in cell.split(',') if part.strip()}

    # A plain string used to be turned into a set of characters and never matched;
    # accept both a string and an iterable of genre names.
    if not genre:
        wanted_genres = set()
    elif isinstance(genre, str):
        wanted_genres = _tokens(genre)
    else:
        wanted_genres = set().union(*(_tokens(g) for g in genre))
    if wanted_genres:
        keep = movies['genre'].map(lambda cell: not wanted_genres.isdisjoint(_tokens(cell)))
        movies = movies[keep.astype(bool)]

    if min_dur != 0:
        movies = movies[movies['duration'] >= min_dur]
    if max_dur != 0:
        movies = movies[movies['duration'] <= max_dur]
    if min_year != 0:
        movies = movies[movies['year'] >= min_year]
    if max_year != 0:
        movies = movies[movies['year'] <= max_year]

    wanted_language = language.strip().lower() if language else ''
    if wanted_language:
        keep = movies['language'].map(lambda cell: wanted_language in _tokens(cell))
        movies = movies[keep.astype(bool)]

    wanted_director = director.strip().lower() if director else ''
    if wanted_director:
        def _directed_by(cell):
            # Match the whole field or any single co-director.
            if not isinstance(cell, str):
                return False
            return cell.strip().lower() == wanted_director or wanted_director in _tokens(cell)
        keep = movies['director'].map(_directed_by)
        movies = movies[keep.astype(bool)]

    if not movies.empty:
        all_votes_avg = movies['avg_vote'].mean()
        min_req_vote = movies['votes'].quantile(percent)
        movies = movies.loc[movies['votes'] >= min_req_vote]

    # Copy after slicing so the column assignments below never write into a view.
    result = movies.copy()
    if result.empty:
        # A row-wise apply on an empty frame returns a frame, not a column, so the
        # 'rating' assignment would fail; return an empty, correctly shaped result instead.
        result['rating'] = result['avg_vote'].astype(float)
        return result

    result['rating'] = result.apply(imdb_weighted_rating, c=all_votes_avg, m=min_req_vote, axis=1)
    result = result.sort_values('rating', ascending=False).iloc[:recomm_count].copy()

    result['title'] = result['title'].apply(lambda x: x.title())

    return result

In [None]:
#@title Enter your preferences
#@markdown in the form below, enter your preferences (leave them as default if it doesn't matter)

#@markdown (you can enter multiple genres separated by comma)
genre = "comedy, romance" #@param {type:"string"}
min_duration =  0#@param {type:"integer"}
max_duration =  120#@param {type:"integer"}
min_year =  1950#@param {type:"integer"}
max_year =0  #@param {type:"integer"}
language = "" #@param {type:"string"}
director = "" #@param {type:"string"}
recommendation_count = 10 #@param {type:"integer"}

# Turn the comma-separated genre text into a list of trimmed, lower-cased genre names.
if genre:
    genre = [g.lower().strip() for g in genre.split(',')]

In [None]:
# Rank the catalogue with the preferences entered in the form.
# - recommendation_count from the form is now honoured (it was hard-coded to 10).
# - language/director are passed as typed: those columns keep their original
#   capitalisation, so lower-casing the inputs made a match impossible.
get_recemmendations(df, genre=genre, min_dur=min_duration, max_dur=max_duration,
                    min_year=min_year, max_year=max_year, language=language.strip(),
                    director=director.strip(), recomm_count=recommendation_count)

Unnamed: 0,title,genre,duration,year,country,language,director,avg_vote,votes,description,rating
38406,Hababam Sinifi,"comedy, drama",87,1975,Turkey,Turkish,Ertem Egilmez,9.3,36269,"Lazy, uneducated students share a very close b...",9.024274
83783,Dil Bechara,"comedy, drama, romance",101,2020,India,Hindi,Mukesh Chhabra,8.8,101686,The emotional journey of two hopelessly in lov...,8.711264
38407,Hababam Sinifi Sinifta Kaldi,comedy,91,1976,Turkey,Turkish,Ertem Egilmez,9.0,21178,A young and beautiful female teacher starts wo...,8.591576
38490,Tosun Pasa,"comedy, history",90,1976,Turkey,Turkish,Kartal Tibet,9.0,20968,Late 19th century in Alexandria. Two tradition...,8.588016
30454,La Vita È Bella,"comedy, drama, romance",116,1997,Italy,"Italian, German, English",Roberto Benigni,8.6,605648,When an open-minded Jewish librarian and his s...,8.585748
21811,Ritorno Al Futuro,"adventure, comedy, sci-fi",116,1985,USA,English,Robert Zemeckis,8.5,1027330,"Marty McFly, a 17-year-old high school student...",8.491883
60741,Quasi Amici - Intouchables,"biography, comedy, drama",112,2011,France,"French, English","Olivier Nakache, Éric Toledano",8.5,736691,After he becomes a quadriplegic from a paragli...,8.488695
38483,Süt Kardesler,"comedy, family, mystery",80,1976,Turkey,Turkish,Ertem Egilmez,8.9,17971,"Saban, Ramazan and Bayram are sailors in an Ot...",8.444277
20502,Maratonci Trce Pocasni Krug,"comedy, drama",92,1982,Yugoslavia,Serbian,Slobodan Sijan,9.0,14088,The Topalovic family has been in the burial bu...,8.423314
11445,Il Dottor Stranamore - Ovvero: Come Ho Imparat...,comedy,95,1964,"UK, USA","English, Russian",Stanley Kubrick,8.4,441115,An insane general triggers a path to nuclear h...,8.381875


## **Content Based Recommendation**

In [None]:
# Work on a random subset of at most 20,000 movies (presumably to keep the pairwise
# similarity matrix built later at a manageable size). A fixed random_state makes the
# subset, and therefore every result derived from it, reproducible across runs.
df_small = df.sample(n=min(20000, len(df)), random_state=42)
df_small = df_small.reset_index(drop=True)

In [None]:
# Peek at the sampled subset; the index was reset, so it starts at 0 again.
df_small.head()

Unnamed: 0,title,genre,duration,year,country,language,director,avg_vote,votes,description
0,Ragazze contro,drama,112,1998,"France, USA","French, English",Susan Skoog,6.4,963,A teen faces her impending adulthood in the ca...
1,Rivista di stelle,"comedy, musical",93,1947,USA,English,George Marshall,6.4,328,Almost everyone under contract to Paramount Pi...
2,Karate baka ichidai,"action, biography, drama",91,1977,Japan,Japanese,Kazuhiko Yamaguchi,6.7,207,This movie is based on the true life story of ...
3,Gioco di donna,"drama, romance, war",132,2004,"UK, Canada","English, German, French, Spanish",John Duigan,6.6,14488,From the roaring 1920s to the ruinous Spanish ...
4,Morto tra una settimana... O ti ridiamo i soldi,"comedy, drama",90,2018,UK,English,Tom Edmunds,6.2,4742,After his ninth unsuccessful attempt on his ow...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Normalise the text column first: no missing values, lower-case only.
df_small['description'] = df_small['description'].fillna('').apply(str.lower)

# TF-IDF vectoriser that drops English stop words and caps the vocabulary at 5000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Learn the vocabulary and build the document-term matrix in one step.
tfidf_matrix = tfidf.fit_transform(df_small['description'])

# Display (number of movies, number of terms).
tfidf_matrix.shape

(20000, 5000)

In [None]:
from sklearn.metrics.pairwise import linear_kernel

# Compute the cosine similarity matrix
# NOTE(review): linear_kernel is a plain dot product; it equals cosine similarity only when
# the TF-IDF rows are L2-normalised (TfidfVectorizer's default norm) — confirm if the
# vectoriser settings are changed.
# The result has one row and one column per movie, so its size grows quadratically with
# the number of sampled movies.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Construct a reverse map from movie title to row position in df_small.
indices = pd.Series(df_small.index, index=df_small['title'])
# Series.drop_duplicates() compares the values (the row positions, which are already
# unique), so it never removed repeated titles. Deduplicate on the index instead and keep
# the first occurrence, so that indices[title] is always a single position.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Function that takes in movie title as input and outputs most similar movies
def get_content_based_recommendations(title, cosine_sim=cosine_sim,recomm_count=10):
    """Return the titles of the movies whose descriptions are most similar to ``title``.

    Args:
        title: Movie title; must be a key of the module-level ``indices`` map.
        cosine_sim: Square similarity matrix aligned with the rows of ``df_small``.
        recomm_count: Number of titles to return.

    Returns:
        A Series of up to ``recomm_count`` titles from ``df_small``, most similar first.
        The queried movie itself is never included.

    Raises:
        KeyError: If ``title`` is not present in ``indices``.
    """
    # Get the row position of the movie that matches the title
    idx = indices[title]
    # A title that occurs several times yields a Series of positions; use the first one.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Pair every other movie with its similarity to the queried one. The query is removed
    # explicitly: it is not guaranteed to sort first (ties with identical descriptions, or
    # an empty description whose similarity is 0 with everything, itself included).
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]

    # Sort the movies based on the similarity scores, best match first
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Row positions of the recomm_count most similar movies
    movie_indices = [i for i, _ in sim_scores[:recomm_count]]

    return df_small['title'].iloc[movie_indices]


In [None]:
# Use a title taken from df_small itself: a title missing from the dataset would raise an error.
ex = df_small['title'].iloc[100]
print(f'Recommendations based on movie : {ex}')
get_content_based_recommendations(ex)

Recommendations based on movie : Cloud


2384                        His First Flame
3850                      The Four Feathers
2630                              Unashamed
17308        Harry Styles: Behind the Album
7543                             Holy Lands
17240                      California Dolls
7549                     Tre piccole parole
17792    Animali fantastici e dove trovarli
1747      Harry Potter e il calice di fuoco
4085                     Breaking & Exiting
Name: title, dtype: string