In [1]:
##################################
#
#  name : lixinsong
#  
#  date : 2018-06-30 
#
###################################

import math
import random
from collections import defaultdict
from operator import itemgetter

In [2]:
# 加载数据 

def load_movielens(path='./ml-100k'):
    """Load MovieLens ratings as a nested ``{user: {title: rating}}`` dict.

    Reads ``<path>/u.item`` (pipe-separated; only the first two fields,
    movie id and title, are used) and ``<path>/u.data`` (tab-separated
    ``user, movie id, rating, timestamp``). Both files are decoded as latin-1.

    Args:
        path: Directory containing ``u.item`` and ``u.data``.

    Returns:
        dict mapping each user id (str) to a dict of movie title -> rating
        (float). Ratings are keyed by title, so two movie ids that share a
        title end up in the same entry (the later line wins).

    Raises:
        OSError: if either file cannot be opened.
        KeyError: if ``u.data`` references a movie id missing from ``u.item``.
        ValueError: if a non-blank line does not have the expected fields.
    """
    # Movie id -> title lookup.
    titles = {}
    with open(path + '/u.item', encoding='latin-1') as item_file:
        for line in item_file:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            movie_id, title = line.split('|')[0:2]
            titles[movie_id] = title

    # User -> {title: rating}.
    prefs = {}
    with open(path + '/u.data', encoding='latin-1') as data_file:
        for line in data_file:
            if not line.strip():
                continue  # tolerate blank / trailing lines
            user, movie_id, rating, _timestamp = line.rstrip('\n').split('\t')
            prefs.setdefault(user, {})[titles[movie_id]] = float(rating)
    return prefs

In [3]:
# Load the ratings from the default dataset path and print the
# {title: rating} entries stored under user id '87' as a quick sanity check.
prefs = load_movielens()
print(prefs['87'])

{'Naked Gun 33 1/3: The Final Insult (1994)': 4.0, 'Con Air (1997)': 4.0, 'Sabrina (1995)': 4.0, 'Waterworld (1995)': 4.0, 'To Wong Foo, Thanks for Everything! Julie Newmar (1995)': 3.0, 'Clueless (1995)': 4.0, 'Jurassic Park (1993)': 5.0, 'Brady Bunch Movie, The (1995)': 2.0, 'Son in Law (1993)': 4.0, 'Indiana Jones and the Last Crusade (1989)': 5.0, 'Good, The Bad and The Ugly, The (1966)': 5.0, 'Dead Poets Society (1989)': 5.0, 'Dead Man Walking (1995)': 4.0, "Joe's Apartment (1996)": 2.0, 'GoldenEye (1995)': 4.0, 'M*A*S*H (1970)': 5.0, 'Something to Talk About (1995)': 2.0, 'Lightning Jack (1994)': 3.0, 'Big Green, The (1995)': 3.0, 'Cowboy Way, The (1994)': 3.0, "Ulee's Gold (1997)": 3.0, 'Addams Family Values (1993)': 2.0, '2001: A Space Odyssey (1968)': 5.0, 'Platoon (1986)': 3.0, 'Return of the Pink Panther, The (1974)': 4.0, 'Four Weddings and a Funeral (1994)': 5.0, 'Under Siege (1992)': 4.0, 'Ace Ventura: Pet Detective (1994)': 4.0, 'Die Hard: With a Vengeance (1995)': 4.0, 

In [4]:
# 划分数据集 

def split_data(data, M, k, seed):
    """Randomly partition (user, item) pairs into a train and a test list.

    Args:
        data: Iterable of 2-element (user, item) records.
        M: Upper bound of the random bucket number drawn for each record.
        k: Bucket number whose records go to the test list.
        seed: Seed for the ``random`` module, making the split repeatable.

    Returns:
        ``(train, test)``, two lists of ``[user, item]`` lists in input order.
    """
    random.seed(seed)
    train, test = [], []
    for user, item in data:
        # NOTE(review): random.randint includes both endpoints, so this draws
        # from M + 1 buckets (0..M), not M -- confirm that is intended.
        target = test if random.randint(0, M) == k else train
        target.append([user, item])
    return train, test

In [5]:
# 计算召回率
def recall(train, test, N, GetRecommendation=None):
    """Recall of top-N recommendations: hits / total number of test items.

    Args:
        train: dict keyed by user; only its keys are used here.
        test: dict mapping user -> container of held-out items. Users that
            are missing from ``test`` are treated as having no test items.
        N: Length of the recommendation list requested for each user.
        GetRecommendation: Callable ``(user, N) -> iterable of (item, score)``.

    Returns:
        float in [0, 1]; 0.0 when there are no test items at all.

    Raises:
        TypeError: if ``GetRecommendation`` is not supplied.
    """
    if GetRecommendation is None:
        raise TypeError('GetRecommendation must be a callable (user, N)')
    hit = 0
    total = 0
    for user in train.keys():
        held_out = test.get(user, ())
        for item, _score in GetRecommendation(user, N):
            if item in held_out:
                hit += 1
        total += len(held_out)
    # Guard against an empty test set instead of raising ZeroDivisionError.
    if total == 0:
        return 0.0
    return hit / total

In [6]:
# 计算准确率 
def precision(train, test, N, GetRecommendation=None):
    """Precision of top-N recommendations: hits / (N * number of train users).

    The denominator adds ``N`` per user regardless of how many items the
    recommender actually returned for that user.

    Args:
        train: dict keyed by user; only its keys are used here.
        test: dict mapping user -> container of held-out items. Users that
            are missing from ``test`` are treated as having no test items.
        N: Length of the recommendation list requested for each user.
        GetRecommendation: Callable ``(user, N) -> iterable of (item, score)``.

    Returns:
        float in [0, 1]; 0.0 when the denominator is zero (no users or N == 0).

    Raises:
        TypeError: if ``GetRecommendation`` is not supplied.
    """
    if GetRecommendation is None:
        raise TypeError('GetRecommendation must be a callable (user, N)')
    hit = 0
    total = 0
    for user in train.keys():
        held_out = test.get(user, ())
        for item, _score in GetRecommendation(user, N):
            if item in held_out:
                hit += 1
        total += N
    # Guard against an empty train set instead of raising ZeroDivisionError.
    if total == 0:
        return 0.0
    return hit / total

In [7]:
# 计算覆盖率 
def coverage(train, test, N, GetRecommendation=None):
    """Catalogue coverage: share of train items recommended to at least one user.

    Args:
        train: dict mapping user -> dict keyed by item.
        test: Unused; kept so all metric functions share one signature.
        N: Length of the recommendation list requested for each user.
        GetRecommendation: Callable ``(user, N) -> iterable of (item, score)``.

    Returns:
        ``len(recommended items) / len(items seen in train)`` as a float;
        0.0 when train contains no items.

    Raises:
        TypeError: if ``GetRecommendation`` is not supplied.
    """
    if GetRecommendation is None:
        raise TypeError('GetRecommendation must be a callable (user, N)')
    recommend_items = set()
    all_items = set()
    for user in train.keys():
        # Was `.kesy()`, which raised AttributeError on the first user.
        all_items.update(train[user].keys())
        for item, _score in GetRecommendation(user, N):
            recommend_items.add(item)
    # Guard against an empty catalogue instead of raising ZeroDivisionError.
    if not all_items:
        return 0.0
    return len(recommend_items) / len(all_items)

In [8]:
# 计算流行度 
def popularity(train, test, N, GetRecommendation=None):
    """Mean log-popularity of the recommended items.

    An item's popularity is the number of train users that have it; each
    recommended item contributes ``log(1 + popularity)``.

    Args:
        train: dict mapping user -> dict keyed by item.
        test: Unused; kept so all metric functions share one signature.
        N: Length of the recommendation list requested for each user.
        GetRecommendation: Callable ``(user, N) -> iterable of (item, score)``.

    Returns:
        Average of ``log(1 + popularity)`` over every recommended item, or
        0.0 when nothing was recommended.

    Raises:
        TypeError: if ``GetRecommendation`` is not supplied.
    """
    if GetRecommendation is None:
        raise TypeError('GetRecommendation must be a callable (user, N)')
    item_popularity = dict()
    for items in train.values():
        for item in items.keys():
            item_popularity[item] = item_popularity.get(item, 0) + 1
    ret = 0
    n = 0
    for user in train.keys():
        for item, _score in GetRecommendation(user, N):
            # Items never seen in train count as popularity 0 (log(1) == 0)
            # instead of raising KeyError.
            ret += math.log(1 + item_popularity.get(item, 0))
            n += 1
    # Guard against an empty recommendation set (ZeroDivisionError before).
    if n == 0:
        return 0.0
    return ret / n

In [9]:
# 生成字典 

def FindRelativeDict(train):
    """Build the movie -> users inverted index and per-movie counts.

    Args:
        train: dict mapping user -> iterable of movies (e.g. a dict keyed by
            movie).

    Returns:
        ``(movie2user, moviepopularity)`` where ``movie2user`` maps each movie
        to the set of users that have it, and ``moviepopularity`` maps each
        movie to the number of times it occurs across all users.
    """
    users_by_movie = {}
    occurrence_count = {}
    for user, movies in train.items():
        for movie in movies:
            users_by_movie.setdefault(movie, set()).add(user)
            # Popularity here is simply how many times the movie shows up
            # in the users' entries.
            occurrence_count[movie] = occurrence_count.get(movie, 0) + 1
    return users_by_movie, occurrence_count

In [10]:
# 计算用户相似性字典

def FindUserSimilarity(trainset,movie2user):
    """Compute pairwise user similarity from co-occurring movies.

    For every pair of distinct users that share at least one movie, the
    similarity is ``shared / sqrt(len(trainset[u]) * len(trainset[v]))``.

    Args:
        trainset: dict mapping user -> sized collection of that user's movies.
        movie2user: dict mapping movie -> collection of users that have it.

    Returns:
        dict mapping user -> ``defaultdict(int)`` of other user -> similarity.
        Every user listed in ``movie2user`` gets an entry, possibly empty.
    """
    similarity = {}
    # Pass 1: count the movies each ordered pair of users has in common.
    for audience in movie2user.values():
        for u in audience:
            row = similarity.setdefault(u, defaultdict(int))
            for v in audience:
                if u != v:
                    row[v] += 1

    # Pass 2: normalise the raw counts in place, logging progress now and then.
    processed = 0
    for u, row in similarity.items():
        for v in row:
            processed += 1
            if processed % 300000 == 0:
                print("calculating...." + str(processed))
            norm = math.sqrt(len(trainset[u]) * len(trainset[v]))
            row[v] = row[v] / norm
    return similarity

In [11]:
# 推荐用户

def recommend(usersim,trainset,user,K,N):
    """Recommend up to N unseen movies to ``user`` from the K nearest users.

    Each candidate movie is scored by summing the similarity of every one of
    the K most similar users that has the movie; movies the target user
    already has in ``trainset`` are excluded.

    Args:
        usersim: dict mapping user -> dict of other user -> similarity.
        trainset: dict mapping user -> container of that user's movies.
        user: The user to recommend for.
        K: Number of most similar users to consider.
        N: Maximum number of recommendations to return.

    Returns:
        List of ``(movie, score)`` tuples sorted by descending score, at most
        N long. Empty when the user has no similar users (for example an
        unknown user), where the original raised KeyError.
    """
    neighbours = usersim.get(user, {})
    currentmovies = trainset.get(user, {})
    nearest = sorted(neighbours.items(), key=itemgetter(1), reverse=True)[0:K]

    userrec = {}
    for similar_user, similarity_factor in nearest:
        for movie in trainset[similar_user]:
            if movie in currentmovies:
                continue
            userrec[movie] = userrec.get(movie, 0) + similarity_factor
    return sorted(userrec.items(), key=itemgetter(1), reverse=True)[0:N]

In [None]:
def main(M=8, k=1, seed=1, K=10, N=80):
    """Run the user-based CF pipeline once and print the four metrics.

    Loads the ratings, splits them into train and test, builds the user
    similarity table, then evaluates recall, precision, popularity and
    coverage of the top-N recommendations over the whole train set.

    Args:
        M: Upper bound of the random bucket number passed to ``split_data``.
        k: Bucket number that ``split_data`` sends to the test set.
        seed: Random seed for the split.
        K: Number of most similar users consulted per recommendation.
        N: Length of each recommendation list.
    """
    prefs = load_movielens()

    # split_data iterates (user, item) records, so flatten the nested dict.
    pairs = [(user, item) for user, items in prefs.items() for item in items]
    train_pairs, test_pairs = split_data(pairs, M, k, seed)

    # Rebuild the {user: {item: rating}} shape that the similarity and
    # metric functions index into.
    train = {}
    for user, item in train_pairs:
        train.setdefault(user, {})[item] = prefs[user][item]
    # Every train user gets a test entry because the metrics look up
    # test[user] for each user in train.
    test = {user: {} for user in train}
    for user, item in test_pairs:
        test.setdefault(user, {})[item] = prefs[user][item]

    movie2user, moviepopularity = FindRelativeDict(train)
    usersim = FindUserSimilarity(train, movie2user)

    def get_recommendation(user, top_n):
        # Adapt recommend() to the (user, N) callable the metrics expect.
        return recommend(usersim, train, user, K, top_n)

    # Distinct local names: assigning to `recall` etc. inside this function
    # would shadow the metric functions and raise UnboundLocalError.
    recall_value = recall(train, test, N, get_recommendation)
    precision_value = precision(train, test, N, get_recommendation)
    popularity_value = popularity(train, test, N, get_recommendation)
    coverage_value = coverage(train, test, N, get_recommendation)

    print('recall: ', recall_value, '\n')
    print('precision: ', precision_value, '\n')
    print('Popularity: ', popularity_value, '\n')
    print('coverage: ', coverage_value, '\n')