In [1]:
import pandas as pd
pd.options.display.float_format = '{:.2f}'.format
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
import sys
from itertools import combinations, groupby
from collections import Counter
from IPython.display import display
from efficient_apriori import apriori

## Dataset de Películas:

In [2]:
Peliculas = pd.read_csv("C:/Users/Macarena/Desktop/Nueva carpeta/movies.csv")
Peliculas.sample(5)

Unnamed: 0,movieId,title,genres
10069,33226,Zatoichi Meets the One-Armed Swordsman (Shin z...,Action|Adventure|Drama
21516,104372,"Cry in the Night, A (1956)",Crime|Drama|Film-Noir
24037,114028,Pride (2014),Comedy|Drama
23512,111913,Lilting (2014),Drama
23832,113220,"Dog, The (2013)",Documentary


Separamos los datos que vienen en una única columna pero podrían requerirse analizar por separado, como son el Año del Título.

In [3]:
Peliculas_temp1 = []
Peliculas_temp2 = []
for row in Peliculas["title"]:
    temp1 = len(row)
    Peliculas_temp1.append(row[:temp1 - 7])
    Peliculas_temp2.append(row[temp1 - 5:temp1 - 1])
Peliculas["title"] = Peliculas_temp1
Peliculas["year"] = Peliculas_temp2
Peliculas.sample(5)

Unnamed: 0,movieId,title,genres,year
11973,53999,Captivity,Crime|Thriller,2007
10317,34583,Prime Cut,Action|Crime|Drama,1972
16509,83381,Seven Thieves,Crime|Drama,1960
2962,3049,How I Won the War,Comedy|War,1967
5096,5193,"Jazz Singer, The",Musical,1980


En caso del Género, existen valores que en si no representan un género real y decidimos reemplazarlos por Null, estos son (no genres listed) e IMAX. Además los valores los convertimos en columnas binarias, identificando si aplica o no al género en cuestión.

In [4]:
lb_2 = MultiLabelBinarizer()
Peliculas=Peliculas.join(pd.DataFrame(lb_2.fit_transform(Peliculas.genres.str.split("|")), columns=lb_2.classes_, index=Peliculas.index))
pd.concat([Peliculas["genres"],Peliculas[Peliculas.columns[3:]]],axis=1).drop_duplicates().sample(5)

Unnamed: 0,genres,year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
11314,Animation|Children|Comedy|Musical,1972,0,0,0,1,1,1,0,0,...,0,0,0,1,0,0,0,0,0,0
24208,Horror,1969,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1179,Adventure|Drama|War,1962,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1067,Crime|Mystery|Thriller,1992,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
4215,Action|Drama|Romance|War,2001,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0


In [5]:
Peliculas = Peliculas.drop('genres', 1)
Peliculas = Peliculas.drop('(no genres listed)', 1)
Peliculas = Peliculas.drop('IMAX', 1)
Peliculas.sample(5)

Unnamed: 0,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
14192,71184,Valentino: The Last Emperor,2008,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
19644,97186,Sinivalkoinen valhe,2012,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6345,6455,North to Alaska,1960,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
24084,114197,Noam Chomsky: Distorted Morality,2003,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
336,340,"War, The",1994,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


Tamaño del dataset:

In [6]:
Peliculas.shape

(27278, 21)

## Dataset de Calificaciones:

In [7]:
Calificaciones = pd.read_csv("C:/Users/Macarena/Desktop/Nueva carpeta/ratings.csv")
Calificaciones.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
14186988,97953,5956,3.5,1240520934
8813305,60919,457,4.0,1146322756
12598057,87039,920,1.5,1054230280
9521436,65872,109487,4.0,1415733708
18769594,129995,4084,3.0,990480599


Consideraremos solo aquellas películas con buena calificación ya que creemos que tomarse la molestia de hacer una mala calificacion puede interpretarse como que la película disgustó más aun que si no se la hubiese calificado con ningún valor.

In [8]:
#print('Mínimo rating: ',Calificaciones_min["rating"].min())
#print('Máximo rating: ',Calificaciones_min["rating"].max())

In [9]:
#Calificaciones_min = Calificaciones_min[Calificaciones_min["rating"] >= 4]
#Comentado por falta de recursos en mi computadora personal.

Tanto la variable de tiempo como el rating no son relevantes para el análisis de ahora en más.

In [10]:
Calificaciones = Calificaciones.drop('rating', 1)
Calificaciones = Calificaciones.drop('timestamp', 1)
Calificaciones.sample(5)

Unnamed: 0,userId,movieId
9172167,63381,185
8856220,61187,457
10071156,69664,1222
15221992,105216,16
13888065,95949,3812


Tamaño del dataset:

In [11]:
Calificaciones.shape

(20000263, 2)

In [12]:
Calificaciones['userId'].unique().shape

(138493,)

In [13]:
Calificaciones['movieId'].unique().shape

(26744,)

In [14]:
Calificaciones.groupby('movieId').size().sort_values(ascending=False)

movieId
296       67310
356       66172
318       63366
593       63299
480       59715
260       54502
110       53769
589       52244
2571      51334
527       50054
1         49695
457       49581
150       47777
780       47048
50        47006
1210      46839
592       46054
1196      45313
2858      44987
32        44980
590       44208
1198      43295
608       43272
47        43249
380       43159
588       41842
377       41562
1270      41426
858       41355
2959      40106
          ...  
110006        1
84691         1
84689         1
110032        1
110034        1
110042        1
110046        1
84680         1
110052        1
110056        1
110061        1
110063        1
110070        1
110086        1
110090        1
110097        1
84660         1
110114        1
84565         1
110134        1
84540         1
110140        1
84538         1
110163        1
110167        1
84534         1
84500         1
84442         1
84436         1
131262        1
Length: 26744, d

## Reglas de Asociación entre Películas (acotado):

In [15]:
Calificaciones_Peliculas = pd.merge(Calificaciones[['userId','movieId']], Peliculas[['movieId','title']] ,on='movieId', how= "inner")
Calificaciones_Peliculas.shape

(20000263, 3)

Calificaciones_Peliculas tiene la misma cantidad de filas que Calificaciones, es decir que todas las películas calificadas existen en Peliculas.

In [16]:
Calificaciones_Peliculas_Min = Calificaciones_Peliculas.sample(1000)

In [17]:
Calificaciones_Peliculas_Min = Calificaciones_Peliculas_Min.sort_values( by=['userId','movieId'], axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
Calificaciones_Peliculas_Min_Array = Calificaciones_Peliculas_Min.values[:,[0,2]]
print(Calificaciones_Peliculas_Min_Array)

[[102 "Amelie (Fabuleux destin d'Amélie Poulain, Le)"]
 [387 'Willard']
 [394 'Goldfinger']
 ...
 [138162 'Clockwatchers']
 [138225 'Rushmore']
 [138417 'Paper Chase, The']]


In [18]:
transactions=[]
for userId, title in groupby(Calificaciones_Peliculas_Min_Array, lambda x: x[0]):
    transactions.append([item[1] for item in title])
print(transactions)

[["Amelie (Fabuleux destin d'Amélie Poulain, Le)"], ['Willard'], ['Goldfinger'], ['High Noon'], ['Working Girl'], ["St. Elmo's Fire"], ['Spider-Man 2'], ['Pirates of the Caribbean: On Stranger Tides'], ['Exit to Eden'], ['Tomorrow Never Dies'], ['Force 10 from Navarone'], ['What a Girl Wants'], ['Sleepless in Seattle'], ['Winged Migration (Peuple migrateur, Le)'], ['Dogville'], ['Station Agent, The'], ["Something's Gotta Give"], ['Leaving Las Vegas'], ['Leaving Las Vegas'], ['Forrest Gump'], ["Boys Don't Cry"], ['Silence of the Lambs, The'], ['Quest, The'], ['Roxanne'], ["Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone)"], ['Mighty Morphin Power Rangers: The Movie'], ['Pay It Forward', 'Gangs of New York'], ['Judgment Night'], ['Silence of the Lambs, The'], ['Madagascar'], ['101 Dalmatians'], ['Exam'], ['Charade'], ['Hoop Dreams'], ['Star Wars: Episode IV - A New Hope'], ['Beverly Hills Cop II'], ['Seven Samurai (Shichinin no samurai)'], ['Road to

In [19]:
itemsets, rules = apriori(transactions, min_support=0.001,  min_confidence=0.001, max_length=2)

In [20]:
rules=sorted(rules, key=lambda rule: rule.confidence)
for rule in rules:
    print(rule)

{Galaxy Quest} -> {Steal This Film} (conf: 0.333, supp: 0.001, lift: 328.667, conv: 1.498)
{Under Siege 2: Dark Territory} -> {Bushwhacked} (conf: 0.500, supp: 0.001, lift: 246.500, conv: 1.996)
{Bushwhacked} -> {Under Siege 2: Dark Territory} (conf: 0.500, supp: 0.001, lift: 246.500, conv: 1.996)
{Butch Cassidy and the Sundance Kid} -> {Name of the Rose, The (Name der Rose, Der)} (conf: 0.500, supp: 0.001, lift: 493.000, conv: 1.998)
{Cold Mountain} -> {Tora! Tora! Tora!} (conf: 0.500, supp: 0.001, lift: 493.000, conv: 1.998)
{Conversation, The} -> {School of Rock} (conf: 0.500, supp: 0.001, lift: 493.000, conv: 1.998)
{Who Framed Roger Rabbit?} -> {Crank} (conf: 0.500, supp: 0.001, lift: 493.000, conv: 1.998)
{Terms of Endearment} -> {Like Water for Chocolate (Como agua para chocolate)} (conf: 0.500, supp: 0.001, lift: 493.000, conv: 1.998)
{Mask, The} -> {Total Recall} (conf: 0.500, supp: 0.001, lift: 493.000, conv: 1.998)
{Monsters, Inc.} -> {Priceless (Hors de prix)} (conf: 0.500,

In [21]:
np.shape(rules)

(28,)

## Ejemplos citados en Resumen:

Ejemplo 1

In [29]:
Calificaciones_Peliculas_Min[Calificaciones_Peliculas_Min['title'] == 'Matrix Reloaded, The']

Unnamed: 0,userId,movieId,title
9032551,45300,6365,"Matrix Reloaded, The"


In [30]:
Calificaciones_Peliculas_Min[Calificaciones_Peliculas_Min['userId'] == 45300]

Unnamed: 0,userId,movieId,title
7422341,45300,153,Batman Forever
9032551,45300,6365,"Matrix Reloaded, The"


Ejemplo 2

In [31]:
Calificaciones_Peliculas_Min[Calificaciones_Peliculas_Min['title'] == 'Galaxy Quest']

Unnamed: 0,userId,movieId,title
6889833,38655,3175,Galaxy Quest
6891564,55594,3175,Galaxy Quest
6895059,90652,3175,Galaxy Quest


In [32]:
Calificaciones_Peliculas_Min[Calificaciones_Peliculas_Min['userId'] == 38655]

Unnamed: 0,userId,movieId,title
6889833,38655,3175,Galaxy Quest
19964056,38655,74512,Steal This Film


In [33]:
Calificaciones_Peliculas_Min[Calificaciones_Peliculas_Min['userId'] == 55594]

Unnamed: 0,userId,movieId,title
6891564,55594,3175,Galaxy Quest


In [34]:
Calificaciones_Peliculas_Min[Calificaciones_Peliculas_Min['userId'] == 90652]

Unnamed: 0,userId,movieId,title
6895059,90652,3175,Galaxy Quest
