# Content-based Recommender

Sistemi za preporuku bazirani na sadrzaju rade na osnovu informacija o filmovima, kao sto su zanr filma, oznake (etiketa) koja asocira na odredjeni film, glumacka ekipa itd.

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

import warnings; warnings.simplefilter('ignore')

In [2]:
movies_df = pd.read_csv('dataset/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
tags_df = pd.read_csv('dataset/tags.csv')
tags_df.shape

(3683, 4)

In [4]:
tags_df['title']=movies_df.title[movies_df['movieId'].isin(tags_df['movieId'])]
tags_df=tags_df.dropna()
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp,title
0,2,60756,funny,1445714994,Toy Story (1995)
1,2,60756,Highly quotable,1445714996,Jumanji (1995)
2,2,60756,will ferrell,1445714992,Grumpier Old Men (1995)
4,2,89774,MMA,1445715200,Father of the Bride Part II (1995)
6,2,106782,drugs,1445715054,Sabrina (1995)


In [5]:
tags_df.shape # dimenzija podataka o oznakama, nakon izostavljanja nedostajucih podataka

(883, 5)

### Izbor filmova za preporuku na osnovu oznaka o filmovima

In [6]:
tf = TfidfVectorizer(analyzer='word', stop_words='english')
tfidf_matrix = tf.fit_transform(tags_df['tag'])

In [7]:
print('Neke rijeci koje se pojavljuju medju oznakama:\n', tf.get_feature_names()[:50])

Neke rijeci koje se pojavljuju medju oznakama:
 ['06', '1970s', '1980s', '250', '70mm', '80', 'aardman', 'absorbing', 'abstract', 'abuse', 'achronological', 'action', 'actors', 'actress', 'adam', 'addiction', 'adolescence', 'ads', 'adultery', 'adventure', 'afghanistan', 'africa', 'aging', 'aids', 'al', 'alan', 'alba', 'alcoholism', 'alcott', 'aliens', 'alternate', 'america', 'american', 'androids', 'andy', 'animal', 'animation', 'anime', 'anne', 'apocalypse', 'apocalyptic', 'appealing', 'archaeology', 'arnold', 'arthouse', 'arthur', 'arts', 'assassin', 'assassination', 'assassins']


In [8]:
tfidf_matrix.shape # u 883 oznake koje opisuju filmove javlja se 660 razlicitih rijeci

(883, 660)

In [9]:
cosine_sim = cosine_similarity(tfidf_matrix)

In [10]:
tags_df = tags_df.reset_index()
titles = tags_df['title']
indices = pd.Series(tags_df.index, index=tags_df['title'])

### Prikaz 20 filmova za preporuku na osnovu slicnosti zadatom filmu po opisu (oznaci)

In [11]:
def get_recommendations(title): # zadaje se naslov filma
    idx = indices[title] # indeks zadatog filma
    sim_scores = list(enumerate(cosine_sim[idx])) # slicnost sa ostalim filmovima
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) # sortiranje na osnovu slicnosti opadajuce
    sim_scores = sim_scores[1:21] # na 0. poziciji je slicnost sa samim sobom
    movie_indices = [i[0] for i in sim_scores]
    return pd.DataFrame(titles.iloc[movie_indices])

In [12]:
get_recommendations('Toy Story (1995)')

Unnamed: 0,title
67,"Little Princess, A (1995)"
70,Miracle on 34th Street (1994)
76,Léon: The Professional (a.k.a. The Professiona...
158,"World of Apu, The (Apur Sansar) (1959)"
297,Apocalypse Now (1979)
818,Divided We Fall (Musíme si pomáhat) (2000)
1,Jumanji (1995)
2,Grumpier Old Men (1995)
3,Father of the Bride Part II (1995)
4,Sabrina (1995)


### Prikaz 20 filmova za preporuku na osnovu slicnosti zadatom filmu po zanru

In [13]:
movies_df = movies_df.join(movies_df.genres.str.get_dummies("|")) # kodiranje kolone koja sadrzi podatke o zanru

In [14]:
# izracunavanje slicnosti
cos_sim = cosine_similarity(movies_df.iloc[:,3:])

In [15]:
cos_sim.shape

(9742, 9742)

In [16]:
movies_df = movies_df.reset_index()
titles = movies_df['title']
indices = pd.Series(movies_df.index, index=movies_df['title'])

In [17]:
def get_recommendations(title): # zadaj seo naslov filma
    
    idx = indices[title] # indeks zadatog filma
    
    sim_scores = list(enumerate(cos_sim[idx])) # lista slicnosti odabrabog filma sa ostalim filmovima 
    # lista za svaki element sadrzi 2 vrijednosti: 1. je indeks, 2. je slicnost  
    '''
    sortiranje elemenata liste opadajuce po slicnosti
    lambda x: x[1] => vraca element na 1. poziciji
    '''
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True) 
    
    sim_scores = sim_scores[1:21] # 30 najslicnijih filmova 
    movie_indices = [i[0] for i in sim_scores]
    return pd.DataFrame(titles.iloc[movie_indices])

In [18]:
get_recommendations('Toy Story (1995)')

Unnamed: 0,title
1706,Antz (1998)
2355,Toy Story 2 (1999)
2809,"Adventures of Rocky and Bullwinkle, The (2000)"
3000,"Emperor's New Groove, The (2000)"
3568,"Monsters, Inc. (2001)"
6194,"Wild, The (2006)"
6486,Shrek the Third (2007)
6948,"Tale of Despereaux, The (2008)"
7760,Asterix and the Vikings (Astérix et les Viking...
8219,Turbo (2013)


In [19]:
get_recommendations('It Takes Two (1995)')

Unnamed: 0,title
49,"Big Green, The (1995)"
78,Dunston Checks In (1996)
214,Heavyweights (Heavy Weights) (1995)
332,Richie Rich (1994)
497,"Little Rascals, The (1994)"
504,Home Alone (1990)
629,Harriet the Spy (1996)
652,House Arrest (1996)
669,First Kid (1996)
763,D3: The Mighty Ducks (1996)
