# Modelo de ML

In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
'''
Cargo el archivo deperudo en el ETL, solamente las columnas de id y titulo
'''
movies_ml = pd.read_csv("data/movies_depurado.csv", sep=",", usecols=['title', 'id'])
movies_ml.head()

Unnamed: 0,id,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [3]:
''' 
controlo el tamaño
'''
movies_ml.shape

(45332, 2)

In [4]:
''' 
La cantidad de nulos
'''
movies_ml.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45332 entries, 0 to 45331
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      45332 non-null  int64 
 1   title   45332 non-null  object
dtypes: int64(1), object(1)
memory usage: 708.4+ KB


In [7]:
''' 
Elimino nulos
'''
movies_ml.dropna(inplace=True)
movies_ml.shape

(45332, 2)

In [6]:
''' 
Guardo por si necesito reiniciar el kernel
'''
movies_ml.to_csv(r'data/recomendacion_data.csv', index=False)

In [2]:
data = pd.read_csv("data/recomendacion_data.csv", sep=",")
data.head()

Unnamed: 0,id,title
0,862,Toy Story
1,8844,Jumanji
2,15602,Grumpier Old Men
3,31357,Waiting to Exhale
4,11862,Father of the Bride Part II


In [3]:
# se importa TfidfVectorizer

vec=TfidfVectorizer()

In [None]:

''' 
Todo el dataset es muy extenso para calcular la matriz de coseno, 
Obtengo un sample del 70% y lo guardo
'''
sample_data = data.sample(frac=0.7)
sample_data.to_csv(r'data/rec_sample_data.csv', index=False)

In [None]:
sample_data = pd.read_csv("data/rec_sample_data.csv", sep=",")

In [4]:

''' 
se crea la matriz de vectores
'''
vecs=vec.fit_transform(sample_data["title"].apply(lambda x: np.str_(x)))

In [5]:
vecs.shape

(31732, 18475)

In [6]:
''' 
Se calcula la matriz de similitud de coseno
'''
similarity =cosine_similarity(vecs) 

In [7]:
similarity.shape[0]

31732

In [None]:
''' 
Se ordenan por score y se elimina el primero que es el mismo
'''
scores=list(enumerate(similarity[1]))
sorted_scores=sorted(scores,key=lambda x:x[1],reverse=True)
sorted_scores=sorted_scores[1:]
movies=[sample_data.iloc[movie[0]]["title"] for movie in sorted_scores]
print(movies)


In [9]:
'''
funcion que devuelve la los titulo ordenados por similitud
'''
def recommend(id):    
    movie_idx=sample_data[sample_data["id"] == id].index.values[0]
    if movie_idx < similarity.shape[0]:
        scores=list(enumerate(similarity[movie_idx]))
        sorted_scores=sorted(scores,key=lambda x:x[1],reverse=True)
        sorted_scores=sorted_scores[1:6]
        movies=[sample_data.iloc[movie[0]]["title"] for movie in sorted_scores]
        return movies
    else:
        return []

In [10]:
''' 
Funcion que devueve los n mas similares
'''
def recommend_n(movie_list,n):
    first=[]
    count=0
    for movie in movie_list:
        if count > n-1:
            break
        count+=1
        first.append(movie)
    return first

In [11]:
print(recommend_n(recommend(282084),5))

['Doug Stanhope: No Place Like Home', "There's No Tomorrow", 'No Place on Earth', 'Road to Utopia', 'Plastic Utopia']


In [12]:
sample_data.head()

Unnamed: 0,id,title
0,44357,Seasons of the Year
1,43337,Inauguration of the Pleasure Dome
2,282084,There's No Place Like Utopia
3,17824,Private Resort
4,59115,Quarantine 2: Terminal


In [12]:
similarity[23576]

array([0., 0., 0., ..., 1., 1., 1.])

In [13]:
''' 
Para el servicio se creara una BD con los similares
'''
recom = []
for indice, fila in sample_data.iterrows():
    recom.append({'movie_id':fila.id,'recom':recommend(fila.id)})
    
df_recom = pd.DataFrame(recom)

df_recom.head()
 


Unnamed: 0,movie_id,recom
0,44357,"[Seasons, The Four Seasons, Three Seasons, Man..."
1,43337,"[Bio-Dome, A Day's Pleasure, The Pleasure Gard..."
2,282084,"[Doug Stanhope: No Place Like Home, There's No..."
3,17824,"[Last Resort, Private, Hot Resort, Wrong Turn ..."
4,59115,"[Terminal, The Terminal Man, Terminal Island, ..."


In [14]:
df_recom.shape

(31732, 2)

In [15]:
#guardar como csv
df_recom.to_csv(r'data/recomendacion_db.csv', index=False, sep=";")

In [16]:
''' 
pruebo la BD de recomendacion
'''
id_movie = 43337
df_recom2 = pd.read_csv("data/recomendacion_db.csv", sep=";",usecols=['movie_id','recom'])

# Filtrar el DataFrame por el movie_id especificado

df_recom2 = df_recom2[df_recom2["movie_id"] == id_movie]["recom"].values[0].strip('[]').replace('"',"'")
print({"recommend":df_recom2})


{'recommend': "'Bio-Dome', 'A Day's Pleasure', 'The Pleasure Garden', 'Pleasure Factory', 'Pleasure Cruise'"}
