In [22]:
import re
import pandas as pd
from sklearn.utils import shuffle

### 开始处理数据，从 user data 开始

In [2]:
# Column names to assign; the file is read with header=None, so none come from the data.
users_title = ['UserID', 'Gender', 'Age', 'JobID', 'ZipCode']
# Fields are separated by the two-character token '::', parsed with the python engine.
users = pd.read_csv(
    'data/movielens-1m/ml-1m/users.dat',
    sep='::',
    names=users_title,
    header=None,
    encoding='ISO-8859-1',
    engine='python',
)

In [3]:
# Display the users frame as loaded from users.dat.
users

Unnamed: 0,UserID,Gender,Age,JobID,ZipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [4]:
# Keep only the columns whose name matches one of these alternatives.
user_column_regex = 'UserID|Gender|Age|JobID|ZipCode'
users = users.filter(regex=user_column_regex)

In [5]:
# Display the users frame after the column filter.
users

Unnamed: 0,UserID,Gender,Age,JobID,ZipCode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


#### process the gender and age of user

In [6]:
# Integer codes for the two gender letters; any other value maps to NaN.
gender_map = dict(F=0, M=1)
gender_codes = users['Gender'].map(gender_map)
users['Gender'] = gender_codes

In [7]:
# Display the users frame after Gender has been replaced by its integer code.
users

Unnamed: 0,UserID,Gender,Age,JobID,ZipCode
0,1,0,1,10,48067
1,2,1,56,16,70072
2,3,1,25,15,55117
3,4,1,45,7,02460
4,5,1,25,20,55455
...,...,...,...,...,...
6035,6036,0,25,15,32603
6036,6037,0,45,1,76006
6037,6038,0,56,1,14706
6038,6039,0,45,0,01060


In [8]:
# Give every distinct Age value a consecutive integer index.
# NOTE(review): indices follow set-iteration order, not the numeric order of the
# ages (the displayed mapping has 18 -> 4 and 25 -> 6), so the code is not ordinal.
distinct_ages = set(users['Age'])
age_map = dict(zip(distinct_ages, range(len(distinct_ages))))
age_map

{1: 0, 35: 1, 45: 2, 50: 3, 18: 4, 56: 5, 25: 6}

In [9]:
# Replace each raw Age value with its index from age_map.
age_codes = users['Age'].map(age_map)
users['Age'] = age_codes

### 开始处理 movie data

In [10]:
movies_title = ['MovieID', 'Title', 'Genres']
# movies.dat is read with header=None, '::' as the field separator and the
# column names from movies_title; the python engine does the parsing.
movies = pd.read_csv(
    'data/movielens-1m/ml-1m/movies.dat',
    sep='::',
    names=movies_title,
    header=None,
    encoding='ISO-8859-1',
    engine='python',
)
movies

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


Split the `Title` feature into the movie title and its release year.

In [11]:
# Group 1: everything before the final parenthesised digit run.
# Group 2: the digits inside those parentheses.
_title_year_regex = r'^(.*)\((\d+)\)$'
pattern = re.compile(_title_year_regex)

In [12]:
# Match every distinct raw title exactly once. Each value is a match object, or
# None when the title does not end in "(<digits>)".
title_matches = {raw: pattern.match(raw) for raw in movies['Title'].unique()}

# Report the offending titles instead of failing with an AttributeError on None.
# This also triggers when the cell is re-run after 'Title' was already rewritten.
unparsed = [raw for raw, found in title_matches.items() if found is None]
if unparsed:
    raise ValueError(
        "{} title(s) do not end with a '(year)' suffix, e.g. {!r}".format(
            len(unparsed), unparsed[:5]
        )
    )

# group(1) still carries the whitespace that separated the name from "(year)",
# so trailing whitespace is trimmed from the title.
title_map = {raw: found.group(1).rstrip() for raw, found in title_matches.items()}
# The year stays the matched digit string.
year_map = {raw: found.group(2) for raw, found in title_matches.items()}

# 'Year' must be derived before 'Title' is overwritten, because both maps are
# keyed by the raw title.
movies['Year'] = movies['Title'].map(year_map)
movies['Title'] = movies['Title'].map(title_map)
movies

Unnamed: 0,MovieID,Title,Genres,Year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama,1995
4,5,Father of the Bride Part II,Comedy,1995
...,...,...,...,...
3878,3948,Meet the Parents,Comedy,2000
3879,3949,Requiem for a Dream,Drama,2000
3880,3950,Tigerland,Drama,2000
3881,3951,Two Family House,Drama,2000


### 开始处理 rating data

In [13]:
# Column names to assign; the file is read with header=None.
ratings_title = ['UserID', 'MovieID', 'ratings', 'timestamps']
# Fields are separated by the two-character token '::', parsed with the python engine.
ratings = pd.read_csv(
    'data/movielens-1m/ml-1m/ratings.dat',
    sep='::',
    names=ratings_title,
    header=None,
    encoding='ISO-8859-1',
    engine='python',
)
ratings

Unnamed: 0,UserID,MovieID,ratings,timestamps
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [14]:
# Keep only columns whose name matches one of these alternatives; 'timestamps'
# matches none of them and is dropped.
rating_column_regex = 'UserID|MovieID|ratings'
ratings = ratings.filter(regex=rating_column_regex)

ratings of 4 and 5 are viewed as positive samples `[label:1]`

ratings of 1 and 2 are viewed as negative samples `[label:0]`

samples with a rating of 3 are given the placeholder label 2 and are discarded before the data is saved

In [15]:
# Rating -> label: 1 and 2 become 0, 4 and 5 become 1, and 3 gets the
# placeholder label 2. Ratings missing from the map become NaN.
label_map = {
    **dict.fromkeys((1, 2), 0),
    3: 2,
    **dict.fromkeys((4, 5), 1),
}
rating_labels = ratings['ratings'].map(label_map)
ratings['label'] = rating_labels
ratings

Unnamed: 0,UserID,MovieID,ratings,label
0,1,1193,5,1
1,1,661,3,2
2,1,914,3,2
3,1,3408,4,1
4,1,2355,5,1
...,...,...,...,...
1000204,6040,1091,1,0
1000205,6040,1094,5,1
1000206,6040,562,5,1
1000207,6040,1096,4,1


#### merge users, movies and ratings

In [17]:
# Attach user attributes, then movie attributes, to every rating.
# Join keys and join type are spelled out so the result no longer depends on
# whichever column names the frames happen to share. validate='many_to_one'
# makes pandas raise if a UserID / MovieID is duplicated on the right-hand side,
# which would otherwise silently multiply rating rows.
data = (
    ratings
    .merge(users, how='inner', on='UserID', validate='many_to_one')
    .merge(movies, how='inner', on='MovieID', validate='many_to_one')
)
data

Unnamed: 0,UserID,MovieID,ratings,label,Gender,Age,JobID,ZipCode,Title,Genres,Year
0,1,1193,5,1,0,0,10,48067,One Flew Over the Cuckoo's Nest,Drama,1975
1,2,1193,5,1,1,5,16,70072,One Flew Over the Cuckoo's Nest,Drama,1975
2,12,1193,4,1,1,6,12,32793,One Flew Over the Cuckoo's Nest,Drama,1975
3,15,1193,4,1,1,6,7,22903,One Flew Over the Cuckoo's Nest,Drama,1975
4,17,1193,5,1,1,3,1,95350,One Flew Over the Cuckoo's Nest,Drama,1975
...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,1,1,4,17,47901,Modulations,Documentary,1998
1000205,5675,2703,3,2,1,1,14,30030,Broken Vessels,Drama,1998
1000206,5780,2845,1,0,1,4,17,92886,White Boys,Drama,1999
1000207,5851,3607,5,1,0,4,20,55410,One Little Indian,Comedy|Drama|Western,1973


Move the `label` field to the first column.

In [18]:
# 'label' first, followed by every other column in its current order.
new_order = ['label', *(name for name in data.columns if name != 'label')]
new_order

['label',
 'UserID',
 'MovieID',
 'ratings',
 'Gender',
 'Age',
 'JobID',
 'ZipCode',
 'Title',
 'Genres',
 'Year']

In [19]:
# Rearrange the columns into the order computed in new_order.
data = data.reindex(new_order, axis='columns')

shuffle samples

In [23]:
# Fixed seed so the shuffled row order is identical on every run; the train/test
# files written at the end of the notebook are positional slices of this order.
data = shuffle(data, random_state=42)

### 保存最终的数据

In [24]:
# Keep rows labelled 0 or 1; rows carrying the placeholder label 2 are dropped.
has_binary_label = data['label'] < 2
data_new = data[has_binary_label]

In [25]:
# Non-null value count per column before the label filter.
data.count()

label      1000209
UserID     1000209
MovieID    1000209
ratings    1000209
Gender     1000209
Age        1000209
JobID      1000209
ZipCode    1000209
Title      1000209
Genres     1000209
Year       1000209
dtype: int64

In [26]:
# Non-null value count per column after rows with label 2 were removed.
data_new.count()

label      739012
UserID     739012
MovieID    739012
ratings    739012
Gender     739012
Age        739012
JobID      739012
ZipCode    739012
Title      739012
Genres     739012
Year       739012
dtype: int64

In [29]:
# The first 90% of the rows go to the training file and the rest to the test file.
# Integer arithmetic keeps the boundary exact; for 739012 rows it gives 665110,
# the value that used to be hard-coded here.
train_rows = len(data_new) * 9 // 10

splits = {
    r'data/movielens-1m/movies_train_data.csv': data_new.iloc[:train_rows],
    r'data/movielens-1m/movies_test_data.csv': data_new.iloc[train_rows:],
}

# mode='w' overwrites any existing file. With mode='a', re-running this cell
# appended a second copy of the data, including a header row mid-file.
for csv_path, frame in splits.items():
    frame.to_csv(
        csv_path,
        index=False,
        sep='\t',
        mode='w',
        header=True
    )