In [6]:
import pandas as pd
import numpy as np

In [3]:
# Load only the first three tab-separated fields of every record.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv(
    'coding_files/u.data',
    delimiter='\t',
    names=r_cols,
    usecols=[0, 1, 2],
)
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,0,50,5
1,0,172,5
2,0,133,1
3,196,242,3
4,186,302,3


In [4]:
# Group the ratings by movie_id and compute, for each movie, the mean rating
# and the number of ratings. The aggregations are given by name so pandas uses
# its built-in grouped implementations instead of calling a Python function
# once per group; the resulting columns are still ('rating', 'mean') and
# ('rating', 'count').
movieProp = ratings.groupby('movie_id').agg({'rating': ['mean', 'count']})
movieProp.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,mean,count
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2
1,3.878319,452
2,3.206107,131
3,3.033333,90
4,3.550239,209
5,3.302326,86


In [11]:
# One-column frame holding the number of ratings per movie.
movieNumRatings = movieProp['rating'][['count']]
# Min-max scaling: (value - min) / (max - min), applied column-wise.
movieNormalizedNumRatings = (
    (movieNumRatings - movieNumRatings.min())
    / (movieNumRatings.max() - movieNumRatings.min())
)
movieNormalizedNumRatings.head()  # popularity from 0 to 1

Unnamed: 0_level_0,count
movie_id,Unnamed: 1_level_1
1,0.773585
2,0.222985
3,0.152659
4,0.356775
5,0.145798


In [16]:
# Maps movie ID -> (title, genre flag vector, normalized popularity, mean rating).
movieDict = {}

# Pull the two lookup columns out once so the loop below does not repeat
# DataFrame row lookups for every movie.
popularityByMovie = movieNormalizedNumRatings['count']
meanRatingByMovie = movieProp['rating']['mean']

# NOTE(review): the file is decoded with the platform's default encoding; if
# u.item is stored in a different encoding, pass `encoding=` explicitly — confirm.
with open('coding_files/u.item') as f:
    # Each line of the file describes one movie; its fields are separated by '|'.
    for line in f:
        if not line.strip():
            # A blank line has no movie ID and would make int('') raise.
            continue

        fields = line.rstrip('\n').split('|')
        movieID = int(fields[0])
        name = fields[1]
        # Fields from index 5 up to (at most) index 24 are the genre flags,
        # converted from text to integers. Slicing simply stops at the end of
        # the record when it has fewer than 25 fields.
        genres = np.array([int(flag) for flag in fields[5:25]])

        # A movie ID missing from the ratings tables raises KeyError here.
        movieDict[movieID] = (
            name,
            genres,
            popularityByMovie.loc[movieID],
            meanRatingByMovie.loc[movieID],
        )

In [22]:
movieDict[123]
# in the genre vector, 1 means true and 0 means false

('Frighteners, The (1996)',
 array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]),
 0.1955403087478559,
 3.234782608695652)

In [28]:
from scipy import spatial

def ComputeDistance(a, b):
    """Return the combined genre and popularity distance between two movies.

    Both arguments are indexable records in which position 1 holds the genre
    vector and position 2 holds the popularity score.

    The result is the cosine distance between the two genre vectors plus the
    absolute difference between the two popularity scores.
    """
    genre_gap = spatial.distance.cosine(a[1], b[1])
    popularity_gap = abs(a[2] - b[2])
    return genre_gap + popularity_gap

In [29]:
# Distance between movies 2 and 4: cosine distance of their genre vectors
# plus the absolute difference of their popularity scores.
ComputeDistance(movieDict[2], movieDict[4])

0.8004574042309892

In [24]:
# Show the two records compared above, one per line.
print(movieDict[2], movieDict[4], sep='\n')

('GoldenEye (1995)', array([0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]), 0.22298456260720412, 3.2061068702290076)
('Get Shorty (1995)', array([0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]), 0.3567753001715266, 3.550239234449761)


In [27]:
import operator

def getNeighbors(movieID, K):
    """Return the IDs of the K movies closest to `movieID`.

    The distance between the given movie and every other entry of
    `movieDict` is computed with `ComputeDistance`. The candidates are then
    sorted by ascending distance and the first K IDs are returned.

    Parameters
    ----------
    movieID : key of the reference movie in `movieDict`.
    K : int
        Number of neighbors requested.

    Returns
    -------
    list
        Movie IDs, nearest first. Contains fewer than K items when fewer than
        K other movies are available, and is empty when K is not positive.

    Raises
    ------
    KeyError
        If `movieID` is not a key of `movieDict`.
    """
    # The reference record does not change inside the loop, so look it up once.
    target = movieDict[movieID]

    # (movie ID, distance) pairs for every movie except the reference one.
    distances = [
        (movie, ComputeDistance(target, candidate))
        for movie, candidate in movieDict.items()
        if movie != movieID
    ]
    distances.sort(key=operator.itemgetter(1))  # nearest first

    # Slicing never indexes past the end of the list; clamping at 0 keeps a
    # negative K from being interpreted as "all but the last |K|".
    return [movie for movie, _ in distances[:max(K, 0)]]

In [30]:
K = 10
neighbors = getNeighbors(1, K)

# Print each neighbor's title and mean rating while accumulating the total.
avgRating = 0
for neighbor in neighbors:
    title, meanRating = movieDict[neighbor][0], movieDict[neighbor][3]
    avgRating += meanRating
    print(title + " " + str(meanRating))

# Average over the neighbors actually returned rather than over the requested
# K, so the result stays a true mean if fewer than K come back; with no
# neighbors at all there is nothing to average, hence NaN.
avgRating = avgRating / len(neighbors) if neighbors else float('nan')

Liar Liar (1997) 3.156701030927835
Aladdin (1992) 3.8127853881278537
Willy Wonka and the Chocolate Factory (1971) 3.6319018404907975
Monty Python and the Holy Grail (1974) 4.0664556962025316
Full Monty, The (1997) 3.926984126984127
George of the Jungle (1997) 2.685185185185185
Beavis and Butt-head Do America (1996) 2.7884615384615383
Birdcage, The (1996) 3.4436860068259385
Home Alone (1990) 3.0875912408759123
Aladdin and the King of Thieves (1996) 2.8461538461538463


In [31]:
# Show the average of the neighbors' mean ratings.
avgRating

3.3445905900235564