# Trabajo Práctico: reglas de asociación

### Integrantes del grupo: Gonzalez Nehuen, Arja Adel, Madoery Pablo

In [52]:
import numpy as np
from efficient_apriori import apriori
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [53]:
# Load the movie catalogue (movieId, title, genres) and preview its first rows.
movies = pd.read_csv(filepath_or_buffer="movies.csv")
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [54]:
# Load the user ratings (userId, movieId, rating, timestamp) and preview them.
df = pd.read_csv(filepath_or_buffer="ratings.csv")
df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [55]:
# Optional: uncomment to work with a reduced dataset (only users with ids 1-50)
#ids = list(range(1,51))
#df = df[df.userId.isin(ids)]
#df

In [56]:
# One transaction per user: the list of movieIds found in that user's ratings.
movies_per_user = df.groupby('userId')['movieId'].apply(list)
transactions = movies_per_user.to_numpy()

In [61]:
# Mine frequent itemsets and association rules with a minimum support of 0.2
# and a minimum confidence of 0.7.
itemsets, rules = apriori(
    transactions,
    min_support=0.2,
    min_confidence=0.7,
)

## Métricas de reglas de asociación:
### más soporte: la regla se encuentra en más transacciones
### más confianza: mayor probabilidad de que el consecuente aparezca en una transacción que contiene el antecedente
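### más lift: menor probabilidad de que la regla sea una casualidad (lift = 1 indica que antecedente y consecuente son independientes; lift > 1 indica una asociación positiva)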
### más convicción: mayor grado de implicación; vale 1 cuando antecedente y consecuente son independientes y crece hasta infinito (si la confianza es 1, la convicción es infinita)

In [62]:
# Rank the mined rules from highest to lowest lift and print every one of them.
rules = list(rules)
rules.sort(key=lambda candidate: candidate.lift, reverse=True)
for rule in rules:
    print(rule)

{7153} -> {4993} (conf: 0.890, supp: 0.203, lift: 3.284, conv: 6.656)
{4993} -> {7153} (conf: 0.749, supp: 0.203, lift: 3.284, conv: 3.073)
{5952} -> {4993} (conf: 0.887, supp: 0.217, lift: 3.270, conv: 6.437)
{4993} -> {5952} (conf: 0.802, supp: 0.217, lift: 3.270, conv: 3.805)
{260, 1198} -> {1196} (conf: 0.898, supp: 0.214, lift: 2.746, conv: 6.623)
{260, 2571} -> {1196} (conf: 0.886, supp: 0.211, lift: 2.707, conv: 5.880)
{595} -> {588} (conf: 0.817, supp: 0.207, lift: 2.705, conv: 3.818)
{153} -> {592} (conf: 0.883, supp: 0.210, lift: 2.655, conv: 5.707)
{260, 1210} -> {1196} (conf: 0.842, supp: 0.238, lift: 2.573, conv: 4.255)
{1196} -> {260, 1210} (conf: 0.729, supp: 0.238, lift: 2.573, conv: 2.643)
{260, 1196} -> {1210} (conf: 0.849, supp: 0.238, lift: 2.509, conv: 4.374)
{1210} -> {260, 1196} (conf: 0.705, supp: 0.238, lift: 2.509, conv: 2.438)
{457, 480} -> {377} (conf: 0.738, supp: 0.202, lift: 2.460, conv: 2.675)
{260, 1196} -> {1198} (conf: 0.761, supp: 0.214, lift: 2.435,

In [63]:
# Keep only the 1-to-1 rules (a single item on each side) and print them
# from highest to lowest lift.
# The selection is materialised as a list rather than a lazy filter() object,
# so filtered_rules remains usable after the loop instead of being left in the
# namespace as an exhausted iterator.
filtered_rules = [
    rule for rule in rules
    if len(rule.lhs) == 1 and len(rule.rhs) == 1
]
filtered_rules.sort(key=lambda rule: rule.lift, reverse=True)
for rule in filtered_rules:
    print(rule)

{7153} -> {4993} (conf: 0.890, supp: 0.203, lift: 3.284, conv: 6.656)
{4993} -> {7153} (conf: 0.749, supp: 0.203, lift: 3.284, conv: 3.073)
{5952} -> {4993} (conf: 0.887, supp: 0.217, lift: 3.270, conv: 6.437)
{4993} -> {5952} (conf: 0.802, supp: 0.217, lift: 3.270, conv: 3.805)
{595} -> {588} (conf: 0.817, supp: 0.207, lift: 2.705, conv: 3.818)
{153} -> {592} (conf: 0.883, supp: 0.210, lift: 2.655, conv: 5.707)
{364} -> {588} (conf: 0.734, supp: 0.207, lift: 2.430, conv: 2.627)
{1210} -> {1196} (conf: 0.765, supp: 0.259, lift: 2.338, conv: 2.864)
{1196} -> {1210} (conf: 0.791, supp: 0.259, lift: 2.338, conv: 3.164)
{648} -> {780} (conf: 0.778, supp: 0.209, lift: 2.290, conv: 2.972)
{1198} -> {1196} (conf: 0.748, supp: 0.234, lift: 2.287, conv: 2.674)
{1196} -> {1198} (conf: 0.715, supp: 0.234, lift: 2.287, conv: 2.412)
{380} -> {592} (conf: 0.738, supp: 0.230, lift: 2.220, conv: 2.550)
{1196} -> {260} (conf: 0.859, supp: 0.281, lift: 2.182, conv: 4.291)
{260} -> {1196} (conf: 0.714, s

In [64]:
# Keep only the 1-to-1 rules and print them, highest lift first, with every
# movieId decoded to its title.
filtered_rules = [
    rule for rule in rules
    if len(rule.lhs) == 1 and len(rule.rhs) == 1
]
filtered_rules.sort(key=lambda rule: rule.lift, reverse=True)

# Build the movieId -> title lookup once instead of scanning the whole movies
# table twice per rule. verify_integrity makes a duplicated movieId fail loudly
# rather than silently returning several titles.
title_by_id = movies.set_index("movieId", verify_integrity=True)["title"]

for rule in filtered_rules:
    # rule.lhs / rule.rhs hold exactly one id here (guaranteed by the filter
    # above). Unpack that id explicitly: comparing the movieId column against
    # the container itself only worked through length-1 broadcasting.
    (lhs_id,) = rule.lhs
    (rhs_id,) = rule.rhs
    m1 = title_by_id.at[lhs_id]
    m2 = title_by_id.at[rhs_id]
    print(m1, " -> ", m2)

Lord of the Rings: The Return of the King, The (2003)  ->  Lord of the Rings: The Fellowship of the Ring, The (2001)
Lord of the Rings: The Fellowship of the Ring, The (2001)  ->  Lord of the Rings: The Return of the King, The (2003)
Lord of the Rings: The Two Towers, The (2002)  ->  Lord of the Rings: The Fellowship of the Ring, The (2001)
Lord of the Rings: The Fellowship of the Ring, The (2001)  ->  Lord of the Rings: The Two Towers, The (2002)
Beauty and the Beast (1991)  ->  Aladdin (1992)
Batman Forever (1995)  ->  Batman (1989)
Lion King, The (1994)  ->  Aladdin (1992)
Star Wars: Episode VI - Return of the Jedi (1983)  ->  Star Wars: Episode V - The Empire Strikes Back (1980)
Star Wars: Episode V - The Empire Strikes Back (1980)  ->  Star Wars: Episode VI - Return of the Jedi (1983)
Mission: Impossible (1996)  ->  Independence Day (a.k.a. ID4) (1996)
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)  ->  Star Wars: Episode V - The Empire Strikes Back

In [77]:
# Keep the 1-to-1 rules with lift >= MIN_LIFT whose two titles have fewer than
# two words in common, and print them, highest lift first, decoded to titles.
MIN_LIFT = 1.7          # rules with a lower lift are discarded
MAX_SHARED_WORDS = 1    # title pairs sharing more distinct words are discarded

filtered_rules = [
    rule for rule in rules
    if len(rule.lhs) == 1 and len(rule.rhs) == 1 and rule.lift >= MIN_LIFT
]
filtered_rules.sort(key=lambda rule: rule.lift, reverse=True)

# Build the movieId -> title lookup once instead of scanning the whole movies
# table twice per rule. verify_integrity makes a duplicated movieId fail loudly
# rather than silently returning several titles.
title_by_id = movies.set_index("movieId", verify_integrity=True)["title"]

for rule in filtered_rules:
    # rule.lhs / rule.rhs hold exactly one id here (guaranteed by the filter
    # above). Unpack that id explicitly: comparing the movieId column against
    # the container itself only worked through length-1 broadcasting.
    (lhs_id,) = rule.lhs
    (rhs_id,) = rule.rhs
    m1 = title_by_id.at[lhs_id]
    m2 = title_by_id.at[rhs_id]

    # Count DISTINCT shared words. Counting every occurrence of a word in m1
    # let a word repeated inside one title be counted more than once, so the
    # same pair of movies could be kept in one direction and dropped in the other.
    # NOTE(review): tokens are whitespace-split and case-sensitive, and they
    # include the trailing "(year)" token, so two titles from the same year
    # already share one word -- confirm whether that is intended.
    shared_words = set(m1.split()) & set(m2.split())

    if len(shared_words) <= MAX_SHARED_WORDS:
        print(m1, " -> ", m2)

Beauty and the Beast (1991)  ->  Aladdin (1992)
Batman Forever (1995)  ->  Batman (1989)
Lion King, The (1994)  ->  Aladdin (1992)
Mission: Impossible (1996)  ->  Independence Day (a.k.a. ID4) (1996)
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)  ->  Star Wars: Episode V - The Empire Strikes Back (1980)
Star Wars: Episode V - The Empire Strikes Back (1980)  ->  Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
True Lies (1994)  ->  Batman (1989)
Ace Ventura: Pet Detective (1994)  ->  Batman (1989)
Back to the Future (1985)  ->  Star Wars: Episode V - The Empire Strikes Back (1980)
Men in Black (a.k.a. MIB) (1997)  ->  Matrix, The (1999)
Speed (1994)  ->  Fugitive, The (1993)
True Lies (1994)  ->  Fugitive, The (1993)
Fight Club (1999)  ->  Matrix, The (1999)
Dances with Wolves (1990)  ->  Apollo 13 (1995)
Saving Private Ryan (1998)  ->  Matrix, The (1999)
True Lies (1994)  ->  Apollo 13 (1995)
Speed (1994)  ->  Terminator 2:

## Conclusiones

### Hemos elegido como soporte mínimo un valor de 20% y como confianza mínima un valor de 70%.
### En los resultados, hemos observado que algunas recomendaciones contienen películas que forman parte de una misma saga. Debido a que estas recomendaciones no resultan interesantes, hemos decidido filtrarlas, descartando las reglas cuyos títulos tienen dos o más palabras en común.
### Además, nos hemos quedado solamente con las recomendaciones de cardinalidad 1-1 que superan un valor de lift de 1.7. Consideramos que este valor es capaz de remover las casualidades, es decir, aquellas películas que son vistas por la mayoría de las personas de cualquier forma.
### Finalmente, en caso de necesitar un mayor número de recomendaciones, es posible obtenerlas disminuyendo el valor mínimo de soporte requerido o la confianza mínima requerida.