# Base de dados MovieLens

In [1]:
import pandas as pd 
import numpy as np

In [2]:
filmes = []
for linha in open("data/ml-100k/u.item", encoding = "ISO-8859-1"):
    (id, titulo) = linha.split('|')[0:2]
    filmes.append({
        "id": id,
        "titulo": titulo
    })

df_filmes = pd.DataFrame(filmes)
print(df_filmes.shape)
df_filmes.head()

(1682, 2)


Unnamed: 0,id,titulo
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [3]:
base = []
for linha in open("data/ml-100k/u.data", encoding = "ISO-8859-1"):
    (usuario, id_filme, avaliacao, tempo) = linha.split("\t")
    base.append({
        "usuario": usuario,
        "id_filme": id_filme,
        "avaliacao": avaliacao
    })

df_aval = pd.DataFrame(base)
print(df_aval.shape)
df_aval.head()


(100000, 3)


Unnamed: 0,usuario,id_filme,avaliacao
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [32]:
df = df_filmes.merge(df_aval, left_on="id", right_on="id_filme", how="right").drop(["id", "id_filme"], axis=1)
df = df.astype({"avaliacao": "float"})

print(df.shape)
df.head()

(100000, 3)


Unnamed: 0,titulo,usuario,avaliacao
0,Kolya (1996),196,3.0
1,L.A. Confidential (1997),186,3.0
2,Heavyweights (1994),22,1.0
3,Legends of the Fall (1994),244,2.0
4,Jackie Brown (1997),166,1.0


In [33]:
df_base = df.drop_duplicates(subset=["usuario", "titulo"])

df_base = df_base.pivot(index="usuario", columns="titulo", values="avaliacao").reset_index()
print(df_base.shape)
df_base.head()

(943, 1665)


titulo,usuario,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
0,1,,,2.0,5.0,,,3.0,4.0,,...,,,,5.0,3.0,,,,4.0,
1,10,,,,5.0,,,,5.0,,...,,,,,,,,,,
2,100,,,,,,,,,,...,,,,,,,,,,
3,101,,,3.0,,,,,,,...,,,,,,,,,,
4,102,,,,,,,,,,...,,,,4.0,,,,,,


In [62]:
def distancia_euclidiana(x_i: np.array, y_i: np.array) -> float:
    return (sum((x_i - y_i)**2))**(0.5)

def similaridade(x_i: np.array, y_i: np.array) -> float:
    return 1/(1+ distancia_euclidiana(x_i, y_i))

def similaridade_df(item_1: str, item_2: str, coluna: str, base: pd.DataFrame) -> float:
    df = base.loc[base[coluna].isin([item_1, item_2])]
    df = df.dropna(axis=1)
    
    x_i = df.loc[df[coluna]==item_1].drop(coluna, axis=1).values[0]
    y_i = df.loc[df[coluna]==item_2].drop(coluna, axis=1).values[0]

    return similaridade(x_i, y_i)

def similaridade_por_item(item: str, coluna: str, base: pd.DataFrame, limite: float = 0.6) -> float:
    similaridades = {
            item_2: similaridade_df(item, item_2, coluna, base) 
            for item_2 in base[base[coluna]!=item][coluna].unique()
    }
    return {item: value for item, value in similaridades.items() if value >= limite}

In [63]:
similaridade_por_item("1", "usuario", df_base)

{'155': 1.0, '418': 1.0, '812': 1.0}

In [65]:
similaridade_por_item("101 Dalmatians (1996)", "titulo", df_base.set_index("usuario").T.reset_index(), limite=0.9)

{'8 Heads in a Duffel Bag (1997)': 1.0,
 'A Chef in Love (1996)': 1.0,
 'Aiqing wansui (1994)': 1.0,
 'All Over Me (1997)': 1.0,
 'All Things Fair (1996)': 1.0,
 'American Dream (1990)': 1.0,
 'Angel Baby (1995)': 1.0,
 'Angel on My Shoulder (1946)': 1.0,
 'Angus (1995)': 1.0,
 'Anna (1996)': 1.0,
 'Apostle, The (1997)': 1.0,
 'Awfully Big Adventure, An (1995)': 1.0,
 'Ayn Rand: A Sense of Life (1997)': 1.0,
 'B*A*P*S (1997)': 1.0,
 'B. Monkey (1998)': 1.0,
 'Bad Girls (1994)': 1.0,
 'Band Wagon, The (1953)': 1.0,
 'Baton Rouge (1988)': 1.0,
 'Beans of Egypt, Maine, The (1994)': 1.0,
 'Before the Rain (Pred dozhdot) (1994)': 1.0,
 'Best Men (1997)': 1.0,
 'Best of the Best 3: No Turning Back (1995)': 1.0,
 'Bewegte Mann, Der (1994)': 1.0,
 'Bhaji on the Beach (1993)': 1.0,
 'Big Bang Theory, The (1994)': 1.0,
 'Big One, The (1997)': 1.0,
 'Bitter Sugar (Azucar Amargo) (1996)': 1.0,
 "Blood For Dracula (Andy Warhol's Dracula) (1974)": 1.0,
 'Boxing Helena (1993)': 1.0,
 "Boy's Life 2 (1