# 电影推荐系统

In [239]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter
import re
import math
import random
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## 数据概览

本推荐系统使用的是MovieLens 100K 数据集。

数据集分为6个文件：电影评分 u.data,电影类别 u.genre,数据集概要 u.info,电影信息 u.info,职业列表 u.occupation,用户信息 u.user。

### 电影评分
文件里包含：用户id，电影id，评分（1-5），时间戳。

数据中的格式：UserID MovieID Rating。

In [4]:
ratings = pd.read_csv("./ml-100k/u.data", 
                      header=None, 
                      sep='\s', 
                      names=['UserID', 'MovieID', 'Rating','TimeStamp'], 
                      engine='python')
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,TimeStamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### 电影类别
文件里包含：genre，genreID。

数据中的格式：genre | genreID。

In [5]:
genres = pd.read_csv("./ml-100k/u.genre", 
                      header=None, 
                      sep='|', 
                      names=['Genre', 'GenreID'], 
                      engine='python')
genres

Unnamed: 0,Genre,GenreID
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


### 数据集概要
文件包含：用户数量，电影数量，评分数量。

共有943个用户参加评分，电影数量1682部， 100000条评分。

In [6]:
info = pd.read_csv("./ml-100k/u.info", 
                      header=None, 
                      sep='\s', 
                      names=['Number', 'Info'], 
                      engine='python')
info

Unnamed: 0,Number,Info
0,943,users
1,1682,items
2,100000,ratings


### 电影信息
文件包含：电影ID，电影名称（含年份），电影放映日期，imdb评分链接， 电影类别

数据中的格式：MovieID | MovieTitle | ReleaseDate | ImdbUrl | unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy  | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western

PS. 需要整理成 mid mtitle date genre(list)的形式

In [7]:
movies_title= ['MovieID', 'MovieTitle', 'Date','Url', 'unknown', 
               'Action', 'Adventure', 'Animation', 'Children\'s\'',
               'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
               'Film-Noir', 'Horror', 'Musical', 'Mystery','Romance',
               'Sci-Fi', 'Thriller', 'War', 'Western']
lattr1 = [0, 1, 2, 4]
lattr2 = [i for i in range(5, 24)]

movies = pd.read_csv("./ml-100k/u.item", 
                      header=None, 
                      sep='|', 
                      usecols=lattr1+lattr2,
                      names=movies_title, 
                      engine='python')
movies.head()

Unnamed: 0,MovieID,MovieTitle,Date,Url,unknown,Action,Adventure,Animation,Children's',Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### 职业列表
文件包含：多种职业

PS. 整理成 Oid(0-20)的形式

In [8]:
occupation = pd.read_csv("./ml-100k/u.occupation", 
                      header=None, 
                      sep='\s', 
                      names=['Occupation'], 
                      engine='python')
occupation

Unnamed: 0,Occupation
0,administrator
1,artist
2,doctor
3,educator
4,engineer
5,entertainment
6,executive
7,healthcare
8,homemaker
9,lawyer


### 用户信息
文件包含：用户ID，用户年龄，用户性别，用户职业，用户邮编

数据中的格式：UserID | Age | Gender | Occupation | ZipCode

In [9]:
users = pd.read_csv("./ml-100k/u.user", 
                      header=None, 
                      sep='|', 
                      names=['UserID', 'Age', 'Gender', 'Occupation', 'ZipCode'], 
                      engine='python')
users.head()

Unnamed: 0,UserID,Age,Gender,Occupation,ZipCode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


## 数据处理

- UserID, MovieID不变
- Occupation处理成ID形式（0-20）
- Genre
- title忽略
- Age
- Gender转变为0和1
- zipcode

- rating是target

## 处理评分信息
4-5分为1， 0-3为0

In [10]:
rating_map = {5:1, 4:1, 3:0, 2:0, 1:0}
ratings['Rating'] = ratings['Rating'].map(rating_map)
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,TimeStamp
0,196,242,0,881250949
1,186,302,0,891717742
2,22,377,0,878887116
3,244,51,0,880606923
4,166,346,0,886397596


### 处理用户信息

In [11]:
# 将age值转化为连续的数字
age_map = {val:ii for ii,val in enumerate(set(users['Age']))}
users['Age'] = users['Age'].map(age_map)

# 将zipcode值转化为连续的数字
zip_map = {val:ii for ii,val in enumerate(set(users['ZipCode']))}
users['ZipCode'] = users['ZipCode'].map(zip_map)

# 将F转化为0，M转化为1
if users['Gender'].dtype != 'int64':
    gender_map = {'F': 0, 'M': 1}
    users['Gender'] = users['Gender'].map(gender_map)

# 将各个Occupation转化为对应的ID
if users['Occupation'].dtype != 'int64':
    occupation_map = {}
    for index, row in occupation.iterrows():
        occupation_map[row[0].lower()] = index
    users['Occupation'] = users['Occupation'].map(occupation_map)

users.head()

Unnamed: 0,UserID,Age,Gender,Occupation,ZipCode
0,1,14,1,19,761
1,2,43,0,13,16
2,3,13,1,20,349
3,4,14,1,19,241
4,5,23,0,13,395


### 处理电影信息

In [12]:
rating_map = {5: 1, 4: 1, 3: 0, 2: 0, 1: 0}
# movies['Rating'] = movies['Rating'].map(rating_map)
a = movies[(movies.MovieID == 3)]
b = movies[(movies.MovieID == 2)]
c = pd.concat([a, b])
c

Unnamed: 0,MovieID,MovieTitle,Date,Url,unknown,Action,Adventure,Animation,Children's',Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### 数据集处理代码
创建用户相似矩阵，筛选出相似度最高的用户，挑选其中评分最高的电影。

In [13]:
class DataProcess:
    def __init__(self, dataFile):
        self.genreFile = "./ml-100k/u.genre"
        self.itemFile = "./ml-100k/u.item"
        self.userFile = "./ml-100k/u.user"
        self.infoFile = "./ml-100k/u.info"
        self.OcptFile = "./ml-100k/u.occupation"
        self.dataFile = dataFile

    def getRating(self):
        ratings = pd.read_csv(
            self.dataFile,
            header=None,
            sep='\s',
            names=['UserID', 'MovieID', 'Rating', 'TimeStamp'],
            engine='python')
        ratings = ratings[['UserID', 'MovieID', 'Rating']]
        return ratings
    
    def getLrRating(self):
        ratings = pd.read_csv(
            self.dataFile,
            header=None,
            sep='\s',
            names=['UserID', 'MovieID', 'Rating', 'TimeStamp'],
            engine='python')
        ratings = ratings[['UserID', 'MovieID', 'Rating']]    
#         ratings['Rating'] = ratings['Rating'].map({5:1, 4:1, 3:0, 2:0, 1:0})
        return ratings

    def getGenre(self):
        genres = pd.read_csv(self.genreFile,
                             header=None,
                             sep='|',
                             names=['Genre', 'GenreID'],
                             engine='python')
        return genres

    def getInfo(self):
        info = pd.read_csv(self.infoFile,
                           header=None,
                           sep='\s',
                           names=['Number', 'Info'],
                           engine='python')
        return info

    def getMovies(self):
        movies_title = [
            'MovieID', 'MovieTitle', 'Date', 'Url', 'unknown', 'Action',
            'Adventure', 'Animation', 'Children\'s\'', 'Comedy', 'Crime',
            'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
            'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
            'Western'
        ]

        lattr1 = [0, 1, 2, 4]
        lattr2 = [i for i in range(5, 24)]
        movies = pd.read_csv(self.itemFile,
                             header=None,
                             sep='|',
                             usecols=lattr1 + lattr2,
                             names=movies_title,
                             engine='python')
        movies_title = [
            'MovieID', 'MovieTitle', 'unknown', 'Action',
            'Adventure', 'Animation', 'Children\'s\'', 'Comedy', 'Crime',
            'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
            'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
            'Western'
        ]
        movies = movies[movies_title]
        return movies
    
    def getMoviesInfo(self):
        movies = self.merge_rating_movies()
        movies = movies.drop(["Rating"], axis=1)
        movieDict = {}
        for index, row in movies.iterrows():
            if row[1] not in movieDict.keys():
                movieDict[row[1]] = [row.values]
            else:
                movieDict[row[1]].append(row.values)
        return movieDict

    def getUser(self):
        occupation = pd.read_csv(self.OcptFile,
                                 header=None,
                                 sep='\s',
                                 names=['Occupation'],
                                 engine='python')

        users = pd.read_csv(
            self.userFile,
            header=None,
            sep='|',
            names=['UserID', 'Age', 'Gender', 'Occupation', 'ZipCode'],
            engine='python')
        # 将age值转化为连续的数字
        age_map = {val: ii for ii, val in enumerate(set(users['Age']))}
        users['Age'] = users['Age'].map(age_map)

        # 将zipcode值转化为连续的数字
        zip_map = {val: ii for ii, val in enumerate(set(users['ZipCode']))}
        users['ZipCode'] = users['ZipCode'].map(zip_map)

        # 将F转化为0，M转化为1
        if users['Gender'].dtype != 'int64':
            gender_map = {'F': 0, 'M': 1}
            users['Gender'] = users['Gender'].map(gender_map)

        # 将各个Occupation转化为对应的ID
        if users['Occupation'].dtype != 'int64':
            occupation_map = {}
            for index, row in occupation.iterrows():
                occupation_map[row[0].lower()] = index
            users['Occupation'] = users['Occupation'].map(occupation_map)

        return users

    def merge_rating_movies(self):
        rating = self.getRating()
        movie = self.getMovies()
        movie = movie.drop('MovieTitle', 1)
        user = self.getUser()
        temp = pd.merge(rating, user, on='UserID')
        new = pd.merge(temp, movie, on='MovieID')
        return new
    
    def merge_lrRating_movies(self):
        rating = self.getLrRating()
        movie = self.getMovies()
        movie = movie.drop('MovieTitle', 1)
        user = self.getUser()
        temp = pd.merge(rating, user, on='UserID')
        new = pd.merge(temp, movie, on='MovieID')
        return new

## User-based CF
创建用户相似矩阵(未实现,目前用字典代替)，筛选出相似度最高的用户，挑选其中评分最高的电影。

In [298]:
class UserCF:
    def __init__(self, data, trainData, testData, logReg=None):
        if logReg:
            self.lr = logReg
        else:
            self.lr = None
        self.Data = data
        self.trainData = trainData
        self.testData = testData
        self.MovieDict = self.classifyMovie(data)
        self.trainMovieDict = self.classifyMovie(trainData)
        self.testMovieDict = self.classifyMovie(testData)
        self.movieInfo = self.Data.getMoviesInfo()
    
    def coldStart(self, lr):
        ratings = self.Data.getRating()
        movies = self.Data.getMovies().drop(['MovieTitle'], axis=1)
        users = self.Data.getUser()
        
        userNum = users['UserID'].values.argmax() + 1
        movieNum = movies['MovieID'].values.argmax() + 1
        
        # 创建一个字典存 每个user看过的电影， 格式 {userID： movieList[]}
        # 被评论超过平均值次数的movie列为热门电影
        uDict = {}
        popular = []
        result = []  # 所有评论过的movie
        
        uTemp = pd.merge(ratings, users, on='UserID')
        for uid in range(1, userNum+1):
            temp = uTemp[(uTemp.UserID == uid)]
            uDict[uid] = temp['MovieID'].values
            result += temp['MovieID'].values.tolist()
        
        counter = Counter(result)
        maxCommented = max(counter.values())
        minCommented = min(counter.values())
        avgCommented = (maxCommented + minCommented) / 2
        for mid in counter.keys():
            if counter[mid] >= avgCommented:
                popular.append(mid)
        
        # new Ratings needs to insert
        nrLs = [] # new rating list
        for pMid in popular:
            for uid in uDict.keys():
                if pMid not in uDict[uid]:
                    nrLs.append([uid, pMid, 0])
        nRatings = pd.DataFrame(nrLs, columns=['UserID', 'MovieID', 'Rating'])
        dataSet = pd.merge(nRatings, users, on='UserID')
        dataSet = pd.merge(dataSet, movies, on='MovieID')
        
        predX = dataSet[['UserID', 'MovieID', 'Age', 'Gender', 'Occupation', 'ZipCode']]
        predY = lr.predict(predX)
        
        nRatings = nRatings.drop(['Rating'], axis = 1)
        nRatings['Rating'] = predY
        return nRatings
    
    def classifyMovie(self, data):
        movieDict = {}
        df_rate = data.getRating()
        if self.lr:
            nRatings = self.coldStart(self.lr)
            df_rate = pd.concat([df_rate, nRatings])
        df_movie = data.getMovies()
        rating_movies = pd.merge(df_rate, df_movie, on='MovieID').sort_values('UserID')
       
        for index, row in rating_movies.iterrows():
            if not row["UserID"] in movieDict.keys():
                movieDict[row["UserID"]] = {row["MovieID"]: (row["Rating"], row["MovieTitle"])}
            else:
                movieDict[row["UserID"]][row["MovieID"]] = (row["Rating"], row["MovieTitle"])
        return movieDict
    
    def euclidean(self, user1, user2):
        # movieDict = self.classifyMovie()
        # pull out two users from movieDict
        user1_data = self.trainMovieDict[user1]
        user2_data = self.trainMovieDict[user2]
        distance = 0
        # cal euclidean distance
        for key in user1_data.keys():
            if key in user2_data.keys():
                # the smaller, the more simularity
                distance += pow(float(user1_data[key][0])-float(user2_data[key][0]),2)
        return 1 / (1 + math.sqrt(distance))
   
    # 这里应该用一张 维度是(userNum, userNum)的矩阵去记录每个用户的相似度(未完成)
    def topSim(self, userID):
        res = []
        for uid in self.MovieDict.keys():
            if not uid == userID:
                similarity = self.euclidean(userID,uid)
                res.append((uid,similarity))
        res.sort(key=lambda val:val[1])
        return res[0]
    
    # 预测可以根据年份去优先预测比较新的高分电影（并未实现）
    def predict(self, user, N, threshold):
        top_sim_user = self.topSim(user)[0]
        rec = []
        rec_list = []
        items = self.trainMovieDict[top_sim_user]
        for item in items.keys():
            if item not in self.trainMovieDict[user].keys():
                if items[item][0] >= threshold:
                    rec_list.append(item)
                    rec.append((item,items[item]))
        rec.sort(key=lambda val:val[1],reverse=True)
        return rec[:N], rec_list
    
    def lrPredict(self, lr, user, N, threshold):
        movies = self.Data.getMovies().drop(['MovieTitle'], axis=1)
        users = self.Data.getUser()
        top_sim_user = self.topSim(user)[0]
        items = items = self.trainMovieDict[top_sim_user]
        recMovies = []
        for item in items.keys():
            if item not in self.trainMovieDict[user].keys():
                if items[item][0] >= threshold:
                    recMovies.append([user, item])
        lrRatings = pd.DataFrame(recMovies, columns=['UserID', 'MovieID'])
        dataSet = pd.merge(lrRatings, users, on='UserID')
        dataSet = pd.merge(dataSet, movies, on='MovieID')
        predX = dataSet[['UserID', 'MovieID', 'Age', 'Gender', 'Occupation', 'ZipCode']]
        predY = lr.predict(predX)
        movieL = predX[['UserID', 'MovieID']].copy()
        movieL['Rating'] = predY
        movieL = movieL[(movieL.Rating >= threshold)]['MovieID'].values
        
        # 得到推荐的 movie list result
        # 开始推荐
        
        result = []
        rec_list = []
        for movie in movieL:
            movie_title = self.trainMovieDict[top_sim_user][movie][1]
            result.append((movie, movie_title))
            rec_list.append(movie)
        return random.sample(result, 10), rec_list
    
    # 评估正确率 precision = R(u) 和 T(u) 重合个数 / R(U)
    # R(u): 在训练集上对用户u推荐N个物品, T(u): 用户u在测试集上评价过的物品集合
    # N是推荐电影数量, N = R(U)
    # 需要分测试集和训练集去计算, 因为推荐系统不会推荐用户评过分的电影
    def evaluation(self, N):
        count = 0
        total = 0
        rAll = 0
        trainMovieDict = self.trainMovieDict
        testMovieDict = self.testMovieDict
        for uid in trainMovieDict.keys():
            if uid not in testMovieDict.keys():
                continue
            t = 0
            pred, _ = self.predict(uid, N, 4)
            for info in pred:
                if info[0] in testMovieDict[uid].keys():
                    t += 1
            p = t / N
            total += p
            count += 1
            rAll += len(testMovieDict[uid])
        return total / count, t / rAll

    def lrEvaluation(self, lr, N):
        count = 0
        total = 0
        rAll = 0
        trainMovieDict = self.trainMovieDict
        testMovieDict = self.testMovieDict
        for uid in trainMovieDict.keys():
            if uid not in testMovieDict.keys():
                continue
            t = 0
            pred = self.lrPredict(lr, uid, N)
            for info in pred:
                if info[0] in testMovieDict[uid].keys():
                    t += 1
            p = t / N
            total += p
            count += 1
            rAll += len(testMovieDict[uid])
        return total / count, t / rAll

In [299]:
data = DataProcess('./ml-100k/u.data')
trainData = DataProcess('./ml-100k/u1.base')
testData = DataProcess('./ml-100k/u1.test')

## 机器学习 拆分数据

### Logistic Regression
用逻辑回归预测missing rating value 冷启动

In [300]:
trainLr = trainData.merge_lrRating_movies()
trainX = trainLr[['UserID', 'MovieID', 'Age', 'Gender', 'Occupation', 'ZipCode']]
trainY = trainLr['Rating']

testLr = testData.merge_lrRating_movies()
testX = testLr[['UserID', 'MovieID', 'Age', 'Gender', 'Occupation', 'ZipCode']]
testY = testLr['Rating']

lr = LogisticRegression()
lr.fit(trainX, trainY.ravel())
predY = lr.predict(testX)
# acc_log = round(lr.score(trainX, trainY) * 100, 2)
mse = mean_squared_error(testY, predY)
mse



1.4359

In [289]:
# userCF_lr = UserCF(data, trainData, testData, lr)
userCF = UserCF(data, trainData, testData)

In [290]:
# recommend without prediction
userCF.evaluation(10)

(0.16710239651416123, 0.0)

In [291]:
# recommend with predicition
userCF.lrEvaluation(lr, 10)

(0.008932461873638347, 0.0)

In [292]:
# userCF start with lr
userCF_lr = UserCF(data, trainData, testData, lr)

In [293]:
# recommend without prediction
userCF_lr.evaluation(10)

(0.07847295864262993, 0.0)

In [294]:
# recommend with predicition
userCF_lr.lrEvaluation(lr, 10)

(0.006786850477200417, 0.0)

In [302]:
userCF = UserCF(data, trainData, testData)
userCF_lr = UserCF(data, trainData, testData, lr)

result, rec_list = userCF.predict(500,10,4)
result_lr, rec_list_lr = userCF_lr.lrPredict(lr,500,10,4)

[218, 949, 317, 403, 185, 29, 392, 842, 397, 655, 418, 728, 1218, 38, 736, 571, 66, 67, 433, 356, 357, 85, 427, 302, 480, 378, 429, 939, 1063, 420, 624, 419, 22, 219, 64, 794, 376, 710, 732, 658, 132, 193, 447, 347, 47, 954, 404, 12, 4, 452, 195, 659, 792, 1044, 73, 388, 187, 173, 1053, 651, 5, 23, 445, 586, 197, 288, 79, 191, 1041, 99]
[191, 38, 452, 23, 378, 480, 187, 85, 420, 732, 67, 218, 47, 347, 317, 357, 73, 193, 404, 66, 624, 659, 197, 219, 586, 12, 447, 427, 571, 64, 356, 185, 658, 388, 418, 433, 376, 429, 403, 132, 655, 397, 651, 99, 445, 736, 5, 419, 792, 728, 392, 710, 29, 4, 794]


In [304]:
train_set_lr = userCF_lr.trainMovieDict
test_set_lr = userCF_lr.testMovieDict
train_set = userCF.trainMovieDict
test_set = userCF.testMovieDict

In [None]:
# recommend_list 必须要在recommend函数里面另外返回一个like rec_list=[3,44,324,623,....]
def Recall(train, test, N, K, recommend_list):
    hit = 0
    all = 0
    for user in train.keys():
        if user not in test.keys():
            continue
        tu = test[user]
        for item in recommend_list:
            if item in tu:
                hit += 1
        all += len(tu)
    return hit/(all*1.0)

def Precision(train, test, N, K, recommend_list):
    hit = 0
    all = 0
    for user in train.keys():
        if user not in test.keys():
            continue
        tu = test[user]
#         recommend_list = get_recommendation(N, user, K, W, train)
        for item in recommend_list:
            if item in tu:
                hit += 1
            all += N
    return hit/(all*1.0)


def All_item(train):
    all_items = set()
    for user in train.keys():
        for item in train[user]:
            all_items.add(item)
    return all_items
 
def Coverage(train, test, N, K, recommend_list,all_items):
    recommend_items = set()
    for user in train.keys():
#         recommend_list = get_recommendation(N, user, K, W, train)
        for item in recommend_list:
            recommend_items.add(item)
    return len(recommend_items)/(len(all_items)*1.0)

def Item_popularity(train):
    item_popularity = {}
    for user, items in train.items():
        for item in items:
            if item not in item_popularity.keys():
                item_popularity[item] = 0
            item_popularity[item] += 1
    return item_popularity
 
def Popularity(train, test, N, K, recommend_list, item_popularity):  
    ret = 0
    n = 0
    for user in train.keys():
#         recommend_list = get_recommendation(N, user, K, W, train)
        for item in recommend_list:
            ret += np.log(1+item_popularity[item])
            n += 1
    return ret/(n*1.0)

recall_lr = Recall(train_set_lr, test_set_lr, 10, 10, rec_list_lr)
precision_lr=Precision(train_set_lr, test_set_lr, 10, 10, rec_list_lr)
cover_lr = Coverage(train_set_lr, test_set_lr, 10, 10, rec_list_lr,All_item(train_set_lr))
popular_lr = Popularity(train_set_lr, test_set_lr, 10, 10, rec_list_lr, Item_popularity(train_set_lr))
recall = Recall(train_set, test_set, 10, 10, rec_list)
precision=Precision(train_set, test_set, 10, 10, rec_list)
cover = Coverage(train_set, test_set, 10, 10, rec_list,All_item(train_set))
popular = Popularity(train_set, test_set, 10, 10, rec_list, Item_popularity(train_set))

print('lr result: Recall:{}; Precision:{}; Coverage:{}; Popularity:{}'.format(recall_lr,precision_lr,cover_lr,popular_lr))
print('No result: recall:{}; precision:{}; Coverage:{}; Popularity:{}'.format(recall,precision,cover,popular))

### KNN

In [53]:
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(trainX, trainY.ravel())
predY = knn.predict(testX)
mse = mean_squared_error(testY, predY)
mse

0.3915

### random forest

In [55]:
rf = RandomForestClassifier(n_estimators=10)
rf.fit(trainX, trainY.ravel())
predY = rf.predict(testX)
mse = mean_squared_error(testY, predY)
mse

0.3575