In [1]:
import numpy as np
import pandas as pd
import re
import gc
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Column layouts: the test file has the same fields as train minus the 'rating' target.
cols_test = ['userId', 'movie_id', 'timestamp']
cols = cols_test + ['rating']

# Read ids and timestamps as plain strings (no numeric parsing) in both files.
str_converters = {name: str for name in cols_test}

# Column names are supplied via `names=`, so pandas treats the first line of each file as data.
train = pd.read_csv(
    '../input/train.csv',
    delimiter=',',
    names=cols,
    converters=str_converters,
)
test = pd.read_csv(
    '../input/test.csv',
    delimiter=',',
    names=cols_test,
    converters=str_converters,
)

In [3]:
# Preview the first rows of the training frame to check that the columns parsed as expected.
train.head()

Unnamed: 0,userId,movie_id,timestamp,rating
0,1,1,874965758,5
1,1,2,876893171,3
2,1,3,878542960,4
3,1,4,876893119,3
4,1,5,889751712,3


In [4]:
# Preview the first rows of the test frame (same columns as train, without 'rating').
test.head()

Unnamed: 0,userId,movie_id,timestamp
0,1,7,875071561
1,1,9,878543541
2,1,13,875071805
3,1,15,875071608
4,1,19,875071515


In [5]:
# Load the side information about users and items.
userInfo = pd.read_csv('../input/userInfo.csv',
                       delimiter=',',
                       converters={'userId': str, 'useGender': str},
                       # Zip codes are sliced with `.str[...]` later on, so force them to
                       # text: numeric parsing would drop leading zeros and make the
                       # `.str` accessor fail. `dtype` (not a converter) keeps missing
                       # values as NaN.
                       dtype={'useZipcode': str})
# The first CSV column of itemInfo is used as the row index; movie_id is kept as a
# string so it joins against the string movie_id of the rating frames.
itemInfo = pd.read_csv('../input/itemInfo.csv',
                       delimiter=',',
                       index_col=0,
                       converters={'movie_id': str})

In [6]:
# Preview the first rows of the user side-information table.
userInfo.head()

Unnamed: 0,userId,useAge,useGender,useOccupation,useZipcode
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [7]:
# Preview the first rows of the movie side-information table.
itemInfo.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknowngenres,Action,Adventure,Animation,Childrens,...,Fantasy,Film_Noir,Horror,Musical,Mystery,Romance,Sci_Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [8]:
# Stack train on top of test (fresh 0..n-1 index), then left-join the user and
# movie attributes so every rating row is kept even without a match.
data = (
    pd.concat([train, test], axis=0, ignore_index=True)
    .merge(userInfo, on='userId', how='left')
    .merge(itemInfo, on='movie_id', how='left')
)

# The source frames are no longer needed; drop the names and trigger a collection.
del train, test, userInfo, itemInfo
gc.collect()

20

In [None]:
# Encode gender as a binary flag (M -> 0, F -> 1); any other value becomes NaN.
# NOTE: not idempotent -- re-running this cell on the already-encoded column
# would map the 0/1 values to NaN.
gender_dict = {'M': 0, 'F': 1}
data['useGender'] = data['useGender'].map(gender_dict)

# Zip code: leading 1, 2 and 3 characters as coarse-to-fine prefix features.
for n_chars in (1, 2, 3):
    data['useZipcode_{}'.format(n_chars)] = data['useZipcode'].str[:n_chars]

# Movie year and title length.
# The year is the four characters before the final character of the title,
# e.g. 'Toy Story (1995)' -> '1995'.
data['movie_year'] = data['movie_title'].str[-5:-1]
# Per-row title length minus 6 characters for the '(YYYY)' suffix. The previous
# code used len() on the whole Series, which assigned the same row-count-based
# constant to every row.
data['movie_name_len'] = data['movie_title'].str.len() - 6

# Release date components. Dates look like '01-Jan-1995' (day-month-year);
# missing dates are replaced by '0-0-0' so every part is still defined.
data['release_date'] = data['release_date'].fillna('0-0-0')
release_parts = data['release_date'].str.split('-')
data['release_d'] = release_parts.str[0]   # day (first field)
data['release_m'] = release_parts.str[1]   # month abbreviation (middle field)
data['release_y'] = release_parts.str[-1]  # year (last field)

# Rating timestamp: leading 2..8 characters of the timestamp string as
# progressively finer time-bucket features.
for n_chars in range(2, 9):
    data['timestamp_{}'.format(n_chars)] = data['timestamp'].str[:n_chars]