# 电影推荐系统

In [229]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import defaultdict
from collections import Counter
import re
import math
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## 数据概览

本推荐系统使用的是MovieLens 100K 数据集。

数据集分为6个文件：电影评分 u.data,电影类别 u.genre,数据集概要 u.info,电影信息 u.info,职业列表 u.occupation,用户信息 u.user。

### 电影评分
文件里包含：用户id，电影id，评分（1-5），时间戳。

数据中的格式：UserID MovieID Rating。

In [230]:
ratings = pd.read_csv("./ml-100k/u.data", 
                      header=None, 
                      sep='\s', 
                      names=['UserID', 'MovieID', 'Rating','TimeStamp'], 
                      engine='python')
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,TimeStamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### 电影类别
文件里包含：genre，genreID。

数据中的格式：genre | genreID。

In [231]:
genres = pd.read_csv("./ml-100k/u.genre", 
                      header=None, 
                      sep='|', 
                      names=['Genre', 'GenreID'], 
                      engine='python')
genres

Unnamed: 0,Genre,GenreID
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


### 数据集概要
文件包含：用户数量，电影数量，评分数量。

共有943个用户参加评分，电影数量1682部， 100000条评分。

In [232]:
info = pd.read_csv("./ml-100k/u.info", 
                      header=None, 
                      sep='\s', 
                      names=['Number', 'Info'], 
                      engine='python')
info

Unnamed: 0,Number,Info
0,943,users
1,1682,items
2,100000,ratings


### 电影信息
文件包含：电影ID，电影名称（含年份），电影放映日期，imdb评分链接， 电影类别

数据中的格式：MovieID | MovieTitle | ReleaseDate | ImdbUrl | unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy  | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western

PS. 需要整理成 mid mtitle date genre(list)的形式

In [233]:
movies_title= ['MovieID', 'MovieTitle', 'Date','Url', 'unknown', 
               'Action', 'Adventure', 'Animation', 'Children\'s\'',
               'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
               'Film-Noir', 'Horror', 'Musical', 'Mystery','Romance',
               'Sci-Fi', 'Thriller', 'War', 'Western']
lattr1 = [0, 1, 2, 4]
lattr2 = [i for i in range(5, 24)]

movies = pd.read_csv("./ml-100k/u.item", 
                      header=None, 
                      sep='|', 
                      usecols=lattr1+lattr2,
                      names=movies_title, 
                      engine='python')
movies.head()

Unnamed: 0,MovieID,MovieTitle,Date,Url,unknown,Action,Adventure,Animation,Children's',Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### 职业列表
文件包含：多种职业

PS. 整理成 Oid(0-20)的形式

In [234]:
occupation = pd.read_csv("./ml-100k/u.occupation", 
                      header=None, 
                      sep='\s', 
                      names=['Occupation'], 
                      engine='python')
occupation

Unnamed: 0,Occupation
0,administrator
1,artist
2,doctor
3,educator
4,engineer
5,entertainment
6,executive
7,healthcare
8,homemaker
9,lawyer


### 用户信息
文件包含：用户ID，用户年龄，用户性别，用户职业，用户邮编

数据中的格式：UserID | Age | Gender | Occupation | ZipCode

In [235]:
users = pd.read_csv("./ml-100k/u.user", 
                      header=None, 
                      sep='|', 
                      names=['UserID', 'Age', 'Gender', 'Occupation', 'ZipCode'], 
                      engine='python')
users.head()

Unnamed: 0,UserID,Age,Gender,Occupation,ZipCode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


## 数据处理

- UserID, MovieID不变
- Occupation处理成ID形式（0-20）
- Genre
- title忽略
- Age
- Gender转变为0和1
- zipcode

- rating是target

## 处理评分信息
4-5分为1， 0-3为0

In [236]:
rating_map = {5:1, 4:1, 3:0, 2:0, 1:0}
ratings['Rating'] = ratings['Rating'].map(rating_map)
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,TimeStamp
0,196,242,0,881250949
1,186,302,0,891717742
2,22,377,0,878887116
3,244,51,0,880606923
4,166,346,0,886397596


### 处理用户信息

In [237]:
# 将age值转化为连续的数字
age_map = {val:ii for ii,val in enumerate(set(users['Age']))}
users['Age'] = users['Age'].map(age_map)

# 将zipcode值转化为连续的数字
zip_map = {val:ii for ii,val in enumerate(set(users['ZipCode']))}
users['ZipCode'] = users['ZipCode'].map(zip_map)

# 将F转化为0，M转化为1
if users['Gender'].dtype != 'int64':
    gender_map = {'F': 0, 'M': 1}
    users['Gender'] = users['Gender'].map(gender_map)

# 将各个Occupation转化为对应的ID
if users['Occupation'].dtype != 'int64':
    occupation_map = {}
    for index, row in occupation.iterrows():
        occupation_map[row[0].lower()] = index
    users['Occupation'] = users['Occupation'].map(occupation_map)

users.head()

Unnamed: 0,UserID,Age,Gender,Occupation,ZipCode
0,1,14,1,19,544
1,2,43,0,13,537
2,3,13,1,20,387
3,4,14,1,19,764
4,5,23,0,13,46


### 处理电影信息

In [238]:
rating_map = {5: 1, 4: 1, 3: 0, 2: 0, 1: 0}
# movies['Rating'] = movies['Rating'].map(rating_map)
movies.head()

Unnamed: 0,MovieID,MovieTitle,Date,Url,unknown,Action,Adventure,Animation,Children's',Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### 数据集处理代码
创建用户相似矩阵，筛选出相似度最高的用户，挑选其中评分最高的电影。

In [248]:
class DataProcess:
    def __init__(self, dataFile):
        self.genreFile = "./ml-100k/u.genre"
        self.itemFile = "./ml-100k/u.item"
        self.userFile = "./ml-100k/u.user"
        self.infoFile = "./ml-100k/u.info"
        self.OcptFile = "./ml-100k/u.occupation"
        self.dataFile = dataFile

    def getRating(self):
        ratings = pd.read_csv(
            self.dataFile,
            header=None,
            sep='\s',
            names=['UserID', 'MovieID', 'Rating', 'TimeStamp'],
            engine='python')
        ratings = ratings[['UserID', 'MovieID', 'Rating']]
        return ratings
    
    def getLrRating(self):
        ratings = pd.read_csv(
            self.dataFile,
            header=None,
            sep='\s',
            names=['UserID', 'MovieID', 'Rating', 'TimeStamp'],
            engine='python')
        ratings = ratings[['UserID', 'MovieID', 'Rating']]    
        ratings['Rating'] = ratings['Rating'].map({5:1, 4:1, 3:0, 2:0, 1:0})
        return ratings

    def getGenre(self):
        genres = pd.read_csv(self.genreFile,
                             header=None,
                             sep='|',
                             names=['Genre', 'GenreID'],
                             engine='python')
        return genres

    def getInfo(self):
        info = pd.read_csv(self.infoFile,
                           header=None,
                           sep='\s',
                           names=['Number', 'Info'],
                           engine='python')
        return info

    def getMovies(self):
        movies_title = [
            'MovieID', 'MovieTitle', 'Date', 'Url', 'unknown', 'Action',
            'Adventure', 'Animation', 'Children\'s\'', 'Comedy', 'Crime',
            'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
            'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
            'Western'
        ]

        lattr1 = [0, 1, 2, 4]
        lattr2 = [i for i in range(5, 24)]
        movies = pd.read_csv(self.itemFile,
                             header=None,
                             sep='|',
                             usecols=lattr1 + lattr2,
                             names=movies_title,
                             engine='python')
        movies_title = [
            'MovieID', 'MovieTitle', 'unknown', 'Action',
            'Adventure', 'Animation', 'Children\'s\'', 'Comedy', 'Crime',
            'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
            'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War',
            'Western'
        ]
        movies = movies[movies_title]
        return movies
    
    def getMoviesInfo(self):
        movies = self.merge_rating_movies()
        movies = movies.drop(["Rating"], axis=1)
        movieDict = {}
        for index, row in movies.iterrows():
            if row[1] not in movieDict.keys():
                movieDict[row[1]] = [row.values]
            else:
                movieDict[row[1]].append(row.values)
        return movieDict

    def getUser(self):
        occupation = pd.read_csv(self.OcptFile,
                                 header=None,
                                 sep='\s',
                                 names=['Occupation'],
                                 engine='python')

        users = pd.read_csv(
            self.userFile,
            header=None,
            sep='|',
            names=['UserID', 'Age', 'Gender', 'Occupation', 'ZipCode'],
            engine='python')
        # 将age值转化为连续的数字
        age_map = {val: ii for ii, val in enumerate(set(users['Age']))}
        users['Age'] = users['Age'].map(age_map)

        # 将zipcode值转化为连续的数字
        zip_map = {val: ii for ii, val in enumerate(set(users['ZipCode']))}
        users['ZipCode'] = users['ZipCode'].map(zip_map)

        # 将F转化为0，M转化为1
        if users['Gender'].dtype != 'int64':
            gender_map = {'F': 0, 'M': 1}
            users['Gender'] = users['Gender'].map(gender_map)

        # 将各个Occupation转化为对应的ID
        if users['Occupation'].dtype != 'int64':
            occupation_map = {}
            for index, row in occupation.iterrows():
                occupation_map[row[0].lower()] = index
            users['Occupation'] = users['Occupation'].map(occupation_map)

        return users

    def merge_rating_movies(self):
        rating = self.getRating()
        movie = self.getMovies()
        movie = movie.drop('MovieTitle', 1)
        user = self.getUser()
        temp = pd.merge(rating, user, on='UserID')
        new = pd.merge(temp, movie, on='MovieID')
        return new
    
    def merge_lrRating_movies(self):
        rating = self.getLrRating()
        movie = self.getMovies()
        movie = movie.drop('MovieTitle', 1)
        user = self.getUser()
        temp = pd.merge(rating, user, on='UserID')
        new = pd.merge(temp, movie, on='MovieID')
        return new

## User-based CF
创建用户相似矩阵(未实现,目前用字典代替)，筛选出相似度最高的用户，挑选其中评分最高的电影。

In [338]:
class UserCF:
    def __init__(self, data, trainData, testData):
        self.Data = data
        self.trainData = trainData
        self.testData = testData
        self.MovieDict = self.classifyMovie(data)
        self.trainMovieDict = self.classifyMovie(trainData)
        self.testMovieDict = self.classifyMovie(testData)
        self.movieInfo = self.Data.getMoviesInfo()
    
    def classifyMovie(self, data):
        movieDict = {}
        df_rate = data.getRating()
        df_movie = data.getMovies()
        rating_movies = pd.merge(df_rate, df_movie, on='MovieID').sort_values('UserID')
       
        for index, row in rating_movies.iterrows():
            if not row["UserID"] in movieDict.keys():
                movieDict[row["UserID"]] = {row["MovieID"]: (row["Rating"], row["MovieTitle"])}
            else:
                movieDict[row["UserID"]][row["MovieID"]] = (row["Rating"], row["MovieTitle"])
        return movieDict
    
    def euclidean(self, user1, user2):
        # movieDict = self.classifyMovie()
        # pull out two users from movieDict
        user1_data = self.trainMovieDict[user1]
        user2_data = self.trainMovieDict[user2]
        distance = 0
        # cal euclidean distance
        for key in user1_data.keys():
            if key in user2_data.keys():
                # the smaller, the more simularity
                distance += pow(float(user1_data[key][0])-float(user2_data[key][0]),2)
        return 1 / (1 + math.sqrt(distance))
   
    # 这里应该用一张 维度是(userNum, userNum)的矩阵去记录每个用户的相似度(未完成)
    def topSim(self, userID):
        res = []
        for uid in self.MovieDict.keys():
            if not uid == userID:
                similarity = self.euclidean(userID,uid)
                res.append((uid,similarity))
        res.sort(key=lambda val:val[1])
        return res[:10]
    
    # 预测可以根据年份去优先预测比较新的高分电影（并未实现）
    def predict(self, user, N):
        top_sim_user = self.topSim(user)[0][0]
        items = self.trainMovieDict[top_sim_user]
        rec = []
        for item in items.keys():
            if item not in self.trainMovieDict[user].keys():
                if items[item][0] >= 4:
                    rec.append((item,items[item]))
        rec.sort(key=lambda val:val[1],reverse=True)
#         result = rec[:N]
        result = rec
        #for i in range(len(result)):
        #    print("Top",i+1," MovieTitle: ",result[i][1][1]," Rating: ",result[i][1][0])
        return result
    
    def lrPredict(self, lr, user, N):
        movieInfo = self.movieInfo
        top_sim_user = self.topSim(user)[0][0]
        items = self.trainMovieDict[top_sim_user]
        rec = []
        for movie in items.keys():
            if movie not in self.trainMovieDict[user].keys():
                pred = lr.predict(np.array(movieInfo[movie]))
                # 算出共同评价过的电影在所有被点评的次数中好评的百分比
                # 即 1 的个数 / pred的长度
                # 若果比例超过 3 / 5 = 0.6, 就推荐
                total = len(pred)
                count = Counter(pred)[1]
                if count / total >= 0.4 and count / total != 1.0:
                    temp = (count/total, items[movie][1])
                    rec.append((movie ,temp))
                
        rec.sort(key=lambda val:val[1],reverse=True)
        result = rec        
        return result
    
    # 评估正确率 precision = R(u) 和 T(u) 重合个数 / R(U)
    # R(u): 在训练集上对用户u推荐N个物品, T(u): 用户u在测试集上评价过的物品集合
    # N是推荐电影数量, N = R(U)
    # 需要分测试集和训练集去计算, 因为推荐系统不会推荐用户评过分的电影
    def evaluation(self, N):
        count = 0
        total = 0
        trainMovieDict = self.trainMovieDict
        testMovieDict = self.testMovieDict
        for uid in trainMovieDict.keys():
            if uid not in testMovieDict.keys():
                continue
            t = 0
            pred = self.predict(uid, N)
            for info in pred:
                if info[0] in testMovieDict[uid].keys():
                    t += 1
            p = t / N
            total += p
            count += 1
        return total / count

    def lrEvaluation(self, lr, N):
        count = 0
        total = 0
        trainMovieDict = self.trainMovieDict
        testMovieDict = self.testMovieDict
        for uid in trainMovieDict.keys():
            if uid not in testMovieDict.keys():
                continue
            t = 0
            pred = self.lrPredict(lr, uid, N)
            for info in pred:
                if info[0] in testMovieDict[uid].keys():
                    t += 1
            p = t / N
            total += p
            count += 1
        return total / count

In [339]:
data = DataProcess('./ml-100k/u.data')
trainData = DataProcess('./ml-100k/u1.base')
testData = DataProcess('./ml-100k/u1.test')
userCF = UserCF(data, trainData, testData)

## 机器学习 拆分数据

In [340]:
# trainDataLr = trainData.merge_rating_movies()
# trainX = trainDataLr.drop(['Rating'], axis=1).values
# trainY = trainDataLr[['Rating']].values

# testDataLr = testData.merge_rating_movies()
# testX = testDataLr.drop(['Rating'], axis=1).values
# testY = testDataLr[['Rating']].values

trainDataLr = trainData.merge_lrRating_movies()
trainX = trainDataLr.drop(['Rating'], axis=1).values
trainY = trainDataLr[['Rating']].values

testDataLr = testData.merge_lrRating_movies()
testX = testDataLr.drop(['Rating'], axis=1).values
testY = testDataLr[['Rating']].values

### Logistic Regression
用逻辑回归预测missing rating value

In [341]:
lr = LogisticRegression()
lr.fit(trainX, trainY.ravel())
predY = lr.predict(testX)
# acc_log = round(lr.score(trainX, trainY) * 100, 2)
mse = mean_squared_error(testY, predY)
mse



0.3897

In [342]:
userCF.predict(1, 10)

[(208, (5, 'Young Frankenstein (1974)')),
 (132, (5, 'Wizard of Oz, The (1939)')),
 (12, (5, 'Usual Suspects, The (1995)')),
 (427, (5, 'To Kill a Mockingbird (1962)')),
 (23, (5, 'Taxi Driver (1976)')),
 (655, (5, 'Stand by Me (1986)')),
 (143, (5, 'Sound of Music, The (1965)')),
 (64, (5, 'Shawshank Redemption, The (1994)')),
 (736, (5, 'Shadowlands (1993)')),
 (288, (5, 'Scream (1996)')),
 (174, (5, 'Raiders of the Lost Ark (1981)')),
 (842, (5, 'Pollyanna (1960)')),
 (735, (5, 'Philadelphia (1993)')),
 (357, (5, "One Flew Over the Cuckoo's Nest (1975)")),
 (1053, (5, 'Now and Then (1995)')),
 (219, (5, 'Nightmare on Elm Street, A (1984)')),
 (939, (5, 'Murder in the First (1995)')),
 (73, (5, 'Maverick (1994)')),
 (392, (5, 'Man Without a Face, The (1993)')),
 (1063, (5, 'Little Princess, A (1995)')),
 (452, (5, 'Jaws 2 (1978)')),
 (794, (5, 'It Could Happen to You (1994)')),
 (559, (5, 'Interview with the Vampire (1994)')),
 (949, (5, 'How to Make an American Quilt (1995)')),
 (37

In [343]:
userCF.lrPredict(lr, 1, 10)

[(655, (0.9955947136563876, 'Stand by Me (1986)')),
 (233, (0.9919354838709677, 'Under Siege (1992)')),
 (219, (0.990990990990991, 'Nightmare on Elm Street, A (1984)')),
 (226, (0.9879518072289156, 'Die Hard 2 (1990)')),
 (949, (0.971830985915493, 'How to Make an American Quilt (1995)')),
 (416, (0.96875, 'Old Yeller (1957)')),
 (417, (0.958904109589041, 'Parent Trap, The (1961)')),
 (855, (0.9545454545454546, 'Diva (1981)')),
 (558, (0.9285714285714286, 'Heavenly Creatures (1994)')),
 (661, (0.9204545454545454, 'High Noon (1952)')),
 (955, (0.9183673469387755, 'Before Sunrise (1995)')),
 (621,
  (0.9090909090909091, 'Davy Crockett, King of the Wild Frontier (1955)')),
 (625, (0.9024390243902439, 'Sword in the Stone, The (1963)')),
 (561, (0.8983050847457628, "Mary Shelley's Frankenstein (1994)")),
 (559, (0.8978102189781022, 'Interview with the Vampire (1994)')),
 (646, (0.8947368421052632, 'Once Upon a Time in the West (1969)')),
 (778, (0.8947368421052632, 'Don Juan DeMarco (1995)')

In [344]:
userCF.evaluation(10)

0.8235294117647053

In [345]:
userCF.lrEvaluation(lr, 10)

0.1810457516339868

### 创建一个新的df存取所有的电影评分

In [None]:
# for index, row in n.iterrows():
#     uid = int(row[0])
#     mid = int(row[1])
#     if mid in rDict[uid].keys():
#         row[2] = rDict[uid][mid][0]
# n.iloc[:, 2] = list(y)
# for index, row in n.iterrows():
#     print(row[0], row[1], row[2])

### KNN

In [53]:
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(trainX, trainY.ravel())
predY = knn.predict(testX)
mse = mean_squared_error(testY, predY)
mse

0.3915

### random forest

In [55]:
rf = RandomForestClassifier(n_estimators=10)
rf.fit(trainX, trainY.ravel())
predY = rf.predict(testX)
mse = mean_squared_error(testY, predY)
mse

0.3575