# 电影推荐系统

In [347]:
import pandas as pd
from collections import defaultdict
import re

## 数据概览

本推荐系统使用的是MovieLens 100K 数据集。

数据集分为6个文件：电影评分 u.data,电影类别 u.genre,数据集概要 u.info,电影信息 u.info,职业列表 u.occupation,用户信息 u.user。

### 电影评分
文件里包含：用户id，电影id，评分（1-5），时间戳。

数据中的格式：UserID MovieID Rating。

In [348]:
ratings = pd.read_csv("./ml-100k/u.data", 
                      header=None, 
                      sep='\s', 
                      names=['UserID', 'MovieID', 'Rating','TimeStamp'], 
                      engine='python')
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,TimeStamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


### 电影类别
文件里包含：genre，genreID。

数据中的格式：genre | genreID。

In [349]:
genres = pd.read_csv("./ml-100k/u.genre", 
                      header=None, 
                      sep='|', 
                      names=['Genre', 'GenreID'], 
                      engine='python')
genres

Unnamed: 0,Genre,GenreID
0,unknown,0
1,Action,1
2,Adventure,2
3,Animation,3
4,Children's,4
5,Comedy,5
6,Crime,6
7,Documentary,7
8,Drama,8
9,Fantasy,9


### 数据集概要
文件包含：用户数量，电影数量，评分数量。

共有943个用户参加评分，电影数量1682部， 100000条评分。

In [350]:
info = pd.read_csv("./ml-100k/u.info", 
                      header=None, 
                      sep='\s', 
                      names=['Number', 'Info'], 
                      engine='python')
info

Unnamed: 0,Number,Info
0,943,users
1,1682,items
2,100000,ratings


### 电影信息
文件包含：电影ID，电影名称（含年份），电影放映日期，imdb评分链接， 电影类别

数据中的格式：MovieID | MovieTitle | ReleaseDate | ImdbUrl | unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy  | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western

PS. 需要整理成 mid mtitle date genre(list)的形式

In [351]:
movies_title= ['MovieID', 'MovieTitle', 'Date','Url', 'unknown', 
               'Action', 'Adventure', 'Animation', 'Children\'s\'',
               'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
               'Film-Noir', 'Horror', 'Musical', 'Mystery','Romance',
               'Sci-Fi', 'Thriller', 'War', 'Western']
lattr1 = [0, 1, 2, 4]
lattr2 = [i for i in range(5, 24)]

movies = pd.read_csv("./ml-100k/u.item", 
                      header=None, 
                      sep='|', 
                      usecols=lattr1+lattr2,
                      names=movies_title, 
                      engine='python')
movies.head()

Unnamed: 0,MovieID,MovieTitle,Date,Url,unknown,Action,Adventure,Animation,Children's',Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


### 职业列表
文件包含：多种职业

PS. 整理成 Oid(0-20)的形式

In [352]:
occupation = pd.read_csv("./ml-100k/u.occupation", 
                      header=None, 
                      sep='\s', 
                      names=['Occupation'], 
                      engine='python')
occupation

Unnamed: 0,Occupation
0,administrator
1,artist
2,doctor
3,educator
4,engineer
5,entertainment
6,executive
7,healthcare
8,homemaker
9,lawyer


### 用户信息
文件包含：用户ID，用户年龄，用户性别，用户职业，用户邮编

数据中的格式：UserID | Age | Gender | Occupation | ZipCode

In [353]:
users = pd.read_csv("./ml-100k/u.user", 
                      header=None, 
                      sep='|', 
                      names=['UserID', 'Age', 'Gender', 'Occupation', 'ZipCode'], 
                      engine='python')
users.head()

Unnamed: 0,UserID,Age,Gender,Occupation,ZipCode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


## 数据处理

- UserID, MovieID不变
- Occupation处理成ID形式（0-20）
- Genre
- title忽略
- Age
- Gender转变为0和1
- zipcode

- rating是target

### 处理用户信息

In [354]:
# 将age值转化为连续的数字
age_map = {val:ii for ii,val in enumerate(set(users['Age']))}
users['Age'] = users['Age'].map(age_map)

# 将zipcode值转化为连续的数字
zip_map = {val:ii for ii,val in enumerate(set(users['ZipCode']))}
users['ZipCode'] = users['ZipCode'].map(zip_map)

# 将F转化为0，M转化为1
if users['Gender'].dtype != 'int64':
    gender_map = {'F': 0, 'M': 1}
    users['Gender'] = users['Gender'].map(gender_map)

# 将各个Occupation转化为对应的ID
if users['Occupation'].dtype != 'int64':
    occupation_map = {}
    for index, row in occupation.iterrows():
        occupation_map[row[0].lower()] = index
    users['Occupation'] = users['Occupation'].map(occupation_map)

users.head()

Unnamed: 0,UserID,Age,Gender,Occupation,ZipCode
0,1,14,1,19,289
1,2,43,0,13,68
2,3,13,1,20,377
3,4,14,1,19,408
4,5,23,0,13,394


### 处理电影信息

In [357]:
movies.head()

Unnamed: 0,MovieID,MovieTitle,Date,Url,unknown,Action,Adventure,Animation,Children's',Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## User-based CF
创建用户相似矩阵，筛选出相似度最高的用户，挑选其中评分最高的电影。

In [None]:
class UserCF:
    def __init__(self):
        self.genreFile = "./ml-100k/u.genre"
        self.itemFile = "./ml-100k/u.item"
        self.userFile = "./ml-100k/u.user"