In [19]:
from sentence_transformers import SentenceTransformer

In [21]:
from ast import literal_eval

In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
md = pd.read_csv('./data2/movies_metadata.csv') # dataset do tmdb com os filmes que existem tanto nele, quanto no imdb

#### Preprocess

In [None]:
md.drop(['adult', 'homepage', 'imdb_id', 'poster_path', 'video'] , axis=1, inplace=True)
md[md['vote_average'].isna() == True]
md.drop(md[md['vote_average'].isna() == True].index, axis=0, inplace=True)
md['belongs_to_collection'] = md['belongs_to_collection'].notna().astype('int')
md['genres'] = md['genres'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['production_companies'] = md['production_companies'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['production_countries'] = md['production_countries'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
md['spoken_languages'] = md['spoken_languages'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

#### Feature Engineer

In [None]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').dt.year.convert_dtypes()
md = md[md['status'] == 'Released']
del md['status']

In [None]:
m = round(md['vote_count'].quantile(0.95))  # dica: pegar so filmes com vote_count quantile > 95 (so filmes mainstream)
C = round(md['vote_average'].mean(), 1)     # isso fara nosso sistema de rec recomendar só filme mainstream tbm
    
df = md[(md['vote_count'] >= m)][['id', 'title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres', 'overview']]

df.shape

In [None]:
# features para ajudar > recomendacoes baseadas em keywords e elenco (creditos) daquele filme
credits = pd.read_csv('./data2/credits.csv')
keywords = pd.read_csv('./data2/keywords.csv')

credits.drop_duplicates(inplace = True)
keywords.drop_duplicates(inplace = True)

In [None]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
df['id'] = df['id'].astype('int')

In [None]:
df = df.merge(credits, on='id')
df = df.merge(keywords, on='id')

In [None]:
df['cast'] = df['cast'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
df['cast'] = df['cast'].apply(lambda x: x[:5] if len(x) >=5 else x)

In [None]:
df['keywords'] = df['keywords'].apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [None]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

df['director'] = df['crew'].apply(literal_eval).apply(get_director)

In [None]:
# Removing space between names in order to differentiate between different actors and directors

df['cast'] = df['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])
df['director'] = df['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")))

In [None]:
# Changing the datatype to list to enable concatenation
df['director'] = df['director'].apply(lambda x: [i for i in x.split()])

# Creating a new column by concatenating all the relavant metadata info
df['metadata'] = df['genres'] + df['keywords'] + df['director'] + df['cast']

# Taking all the items in the metadata and joining it into a single string
df['metadata'] = df['metadata'].apply(lambda x: ' '.join(x))

In [None]:
df.to_excel("check.xlsx")

In [None]:
df = pd.read_excel("check.xlsx")

In [None]:
filter = pd.read_excel("filtro.xlsx")

In [None]:
df = df[df['id'].isin(filter['tmdbId'])]

In [None]:
df.to_excel("check_filtrado.xlsx")

In [25]:
df = pd.read_excel("check.xlsx")

In [None]:
model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
meta_data = np.array(df['metadata'])
embeddings = model.encode(meta_data, show_progress_bar=True)

In [None]:
model.get_sentence_features

In [None]:
# Almost 1/4th of our data (531/2255) would get truncated as it exceeds the max_seq_length of 384. But it shouldn't affect the performance too much.

i = [len(i) for i in meta_data if len(i) > 384]
len(i)

In [None]:
embeddings.shape

In [None]:
cos_sim_data = pd.DataFrame(cosine_similarity(embeddings))

In [None]:
cos_sim_data.to_excel("testin.xlsx")

In [27]:
cos_sim_data = pd.read_excel("recomendacao_baseada_conteudo.xlsx", index_col=None)

In [28]:
cos_sim_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2135,2136,2137,2138,2139,2140,2141,2142,2143,2144
0,1.000000,0.443138,0.306202,0.349099,0.336794,0.411648,0.430683,0.209855,0.568433,0.514153,...,0.663295,0.458868,0.449840,0.657198,0.348617,0.397352,0.313726,0.229971,0.309340,0.235547
1,0.443138,1.000000,0.146537,0.282225,0.233747,0.210121,0.358467,0.317471,0.381289,0.227965,...,0.346468,0.301343,0.401625,0.402120,0.309591,0.248554,0.330236,0.180014,0.256059,0.322622
2,0.306202,0.146537,1.000000,0.558415,0.544924,0.627537,0.456840,0.550343,0.343313,0.653832,...,0.274767,0.711004,0.380176,0.313746,0.253683,0.597950,0.545131,0.464617,0.515190,0.405568
3,0.349099,0.282225,0.558415,1.000000,0.465062,0.522111,0.383814,0.531676,0.329460,0.512575,...,0.266467,0.636415,0.335411,0.319669,0.351736,0.499941,0.543272,0.476438,0.568454,0.384297
4,0.336794,0.233747,0.544924,0.465062,1.000000,0.550559,0.432289,0.501119,0.436745,0.593858,...,0.268820,0.630556,0.385940,0.302412,0.239045,0.291623,0.507804,0.262136,0.442659,0.323082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140,0.397352,0.248554,0.597950,0.499941,0.291623,0.466398,0.483404,0.468744,0.322272,0.584982,...,0.450613,0.513206,0.427414,0.466847,0.367954,1.000000,0.600093,0.469436,0.496777,0.352970
2141,0.313726,0.330236,0.545131,0.543272,0.507804,0.453732,0.488862,0.681880,0.437936,0.531326,...,0.303572,0.523830,0.456676,0.361716,0.437428,0.600093,1.000000,0.514685,0.534820,0.516677
2142,0.229971,0.180014,0.464617,0.476438,0.262136,0.395150,0.454032,0.405310,0.290575,0.360317,...,0.236410,0.362932,0.305096,0.277857,0.314155,0.469436,0.514685,1.000000,0.509326,0.335404
2143,0.309340,0.256059,0.515190,0.568454,0.442659,0.391020,0.398346,0.399412,0.387681,0.475763,...,0.293892,0.491145,0.370874,0.268330,0.458761,0.496777,0.534820,0.509326,1.000000,0.347214


In [29]:
df = df.reset_index()
df['title'] = df['title'] + ' (' + df['year'].astype(str) + ')'
data = df[['title','genres','year']]
indices = pd.Series(df.index, index=df['title'])

In [30]:
def get_recommendations(title, N = 30):
    idx = indices[title]
    #print(f"opa {idx}")
    sim_scores = list(enumerate(cos_sim_data[idx]))
    #print(sim_scores)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    #print(sim_scores)
    sim_scores = sim_scores[1:N+1]
    #print(sim_scores)
    movie_indices = [i[0] for i in sim_scores]
    #print(f"movie_indices: {movie_indices}")
    sim_scores = pd.DataFrame(sim_scores, columns = ['index', 'similarity_score']) 
    final_data = data.iloc[movie_indices]
    final_data = final_data.merge(sim_scores, left_index = True, right_on ='index')
    final_data['similarity_score'] = round(final_data['similarity_score']*100,2)
    del final_data['index']
    return final_data

In [40]:
get_recommendations('Toy Story (1995)', 50)

Unnamed: 0,title,genres,year,similarity_score
0,Babel (2006),['Drama'],2006,80.59
1,The Tree of Life (2011),"['Drama', 'Fantasy']",2011,78.85
2,"Monsters, Inc. (2001)","['Animation', 'Comedy', 'Family']",2001,78.22
3,Blue Valentine (2010),"['Drama', 'Romance']",2010,77.43
4,12 Years a Slave (2013),"['Drama', 'History']",2013,76.6
5,Tammy (2014),['Comedy'],2014,76.35
6,Kung Fury (2015),"['Action', 'Comedy', 'Science Fiction', 'Fanta...",2015,76.12
7,Stuart Little 2 (2002),"['Family', 'Adventure', 'Animation', 'Comedy']",2002,74.99
8,Bride Wars (2009),['Comedy'],2009,74.63
9,Quantum of Solace (2008),"['Adventure', 'Action', 'Thriller', 'Crime']",2008,74.51
