# **Este es un software de recomendación de películas**





In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer


**Importamos los datos**

In [7]:
# Import the data: upload the dataset file through the Google Colab helper.
# NOTE(review): google.colab is only importable inside Colab; the recorded output
# shows the uploaded file being saved as movie-data.csv.
from google.colab import files
uploaded = files.upload()

Saving movie-data.csv to movie-data.csv


**Analizamos la tabla y agregamos la columna de id (0,1,2...)**

In [8]:
# Load the movie table from the uploaded CSV file.
df = pd.read_csv('movie-data.csv')
# Add a sequential id column (0, 1, 2, ...). The range is sized from the frame
# itself instead of a hard-coded 1000, so the cell keeps working (rather than
# raising a length-mismatch ValueError) when the dataset has a different row count.
df['Movie_id'] = range(len(df))
# Preview the first rows to check the table and the new column.
df.head(3)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2


In [9]:
# Get the number of movies (rows) and the number of columns in the data set
df.shape

(1000, 13)

In [10]:
# List of the important columns that will be used to build the recommendation features
columns = ['Actors', 'Director', 'Genre', 'Title']


In [11]:
# Show the first rows of the important columns only
df[columns].head(3)

Unnamed: 0,Actors,Director,Genre,Title
0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn,"Action,Adventure,Sci-Fi",Guardians of the Galaxy
1,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott,"Adventure,Mystery,Sci-Fi",Prometheus
2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan,"Horror,Thriller",Split


In [12]:
# Check whether any value is missing in the important columns (True if at least one is null)
df[columns].isnull().values.any()

False

In [13]:
# Function that combines the values of the important columns into a single string per movie
def get_imortant_features(data):
  """Combine each row's Actors, Director, Genre and Title into one string.

  Args:
    data: DataFrame with string columns 'Actors', 'Director', 'Genre' and 'Title'.

  Returns:
    A list with one space-separated string per row, in row order.

  Raises:
    KeyError: if one of the four columns is missing.
    TypeError: if a cell is not a string (for example a missing value).
  """
  # Walk the column values positionally instead of looking rows up by label
  # (data['Actors'][i]), which raised KeyError whenever the index was not
  # exactly 0..n-1, e.g. after filtering, sorting or re-indexing the frame.
  rows = zip(data['Actors'], data['Director'], data['Genre'], data['Title'])
  return [' '.join(parts) for parts in rows]

In [14]:
# Create a column that holds the combined feature string of each movie
df['important_features'] = get_imortant_features(df)
# Show the first rows, now including the new column
df.head(3)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id,important_features
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1,"Noomi Rapace, Logan Marshall-Green, Michael Fa..."
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar..."


In [40]:
# Convert the combined text of every movie into a matrix of token counts
cm = CountVectorizer().fit_transform(df['important_features'])


In [16]:
# Compute the cosine similarity matrix from the count matrix
cs = cosine_similarity(cm)
# Print the cosine similarity matrix
print(cs)

[[1.         0.1767767  0.06085806 ... 0.0571662  0.06537205 0.        ]
 [0.1767767  1.         0.         ... 0.         0.06933752 0.        ]
 [0.06085806 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.0571662  0.         0.         ... 1.         0.06726728 0.        ]
 [0.06537205 0.06933752 0.         ... 0.06726728 1.         0.07161149]
 [0.         0.         0.         ... 0.         0.07161149 1.        ]]


In [17]:
# Get the shape of the cosine similarity matrix
cs.shape

(1000, 1000)

In [61]:
# Title of the movie the user wants recommendations for
title = 'Guardians of the Galaxy'
# Find the id of that movie. Select the Movie_id values of the rows whose Title
# matches; a title that is not in the data set now fails with a descriptive
# error instead of an opaque IndexError from .values[0].
matches = df.loc[df.Title == title, 'Movie_id']
if matches.empty:
  raise ValueError('Movie title not found in the data set: ' + repr(title))
# .values[0] takes the scalar id of the first match rather than a Series
movie_id = matches.values[0]
print(movie_id)

0


In [62]:
# Build a list pairing each position in the selected movie's similarity row with
# its score: [(movie_id, similarity_score), (...)]
scores = list(enumerate(cs[movie_id]))
print(scores)

[(0, 1.0000000000000002), (1, 0.1767766952966369), (2, 0.06085806194501846), (3, 0.0), (4, 0.12171612389003691), (5, 0.1767766952966369), (6, 0.0), (7, 0.0), (8, 0.2858309752375148), (9, 0.18898223650461363), (10, 0.052704627669472995), (11, 0.0), (12, 0.23570226039551587), (13, 0.06299407883487121), (14, 0.06085806194501846), (15, 0.2286647801900118), (16, 0.0), (17, 0.06085806194501846), (18, 0.0), (19, 0.12171612389003691), (20, 0.06085806194501846), (21, 0.06085806194501846), (22, 0.05892556509887897), (23, 0.06085806194501846), (24, 0.2286647801900118), (25, 0.0), (26, 0.18898223650461363), (27, 0.0), (28, 0.0), (29, 0.12171612389003691), (30, 0.12598815766974242), (31, 0.0), (32, 0.2946278254943948), (33, 0.12171612389003691), (34, 0.21629522817435004), (35, 0.27036903521793754), (36, 0.18257418583505536), (37, 0.1143323900950059), (38, 0.2946278254943948), (39, 0.0), (40, 0.06085806194501846), (41, 0.0), (42, 0.05892556509887897), (43, 0.05407380704358751), (44, 0.0), (45, 0.210

In [63]:
# Rank every other movie by similarity to the selected one, most similar first.
# The selected movie is excluded by id instead of by dropping the first sorted
# entry: sorted() is stable, so a movie whose score ties with the selected
# movie's own and that has a lower id would sort ahead of it, and slicing [1:]
# would then discard that movie while keeping the selected movie in its own
# recommendations.
sorted_scores = sorted(
    (pair for pair in scores if pair[0] != movie_id),
    key = lambda x:x[1],
    reverse = True,
)

In [64]:
# Print the sorted (movie_id, similarity_score) pairs
print(sorted_scores)

[(48, 0.4001633653325207), (362, 0.4001633653325207), (257, 0.3771236166328254), (94, 0.3600411499115478), (85, 0.34299717028501775), (388, 0.34299717028501775), (162, 0.32444284226152503), (195, 0.32444284226152503), (216, 0.32444284226152503), (710, 0.32444284226152503), (566, 0.31622776601683794), (822, 0.31622776601683794), (87, 0.314970394174356), (558, 0.3086066999241839), (140, 0.3042903097250923), (728, 0.3042903097250923), (944, 0.3042903097250923), (32, 0.2946278254943948), (38, 0.2946278254943948), (76, 0.2946278254943948), (205, 0.2946278254943948), (924, 0.2946278254943948), (8, 0.2858309752375148), (200, 0.2858309752375148), (396, 0.2858309752375148), (712, 0.2858309752375148), (852, 0.2858309752375148), (176, 0.28284271247461906), (126, 0.27777777777777785), (253, 0.27777777777777785), (316, 0.27777777777777785), (325, 0.27777777777777785), (384, 0.27777777777777785), (35, 0.27036903521793754), (60, 0.27036903521793754), (429, 0.27036903521793754), (408, 0.25717224993681

In [60]:
# Print the first 7 entries of the ranked list as numbered recommendations
print(" the 7 most recomoended movies to ",title, 'are:')
for position, (candidate_id, _similarity) in enumerate(sorted_scores[:7], start=1):
  # Resolve the candidate's id back to its title
  candidate_title = df.loc[df.Movie_id == candidate_id, 'Title'].values[0]
  print(position, candidate_title)

 the 7 most recomoended movies to  Split are:
1 Morgan
2 The Conjuring
3 The Visit
4 Victor Frankenstein
5 The VVitch: A New-England Folktale
6 Insidious
7 Mama
