In [1]:
# Notebook setup: (re)load the autoreload extension, set it to mode 2,
# and render matplotlib figures inline in the notebook.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.learner import *
from fastai.structured import *
from fastai.column_data import *

# pandas display config: allow up to 1000 rows and 1000 columns to be shown
# so that wide frames are not truncated in cell output.
for display_opt in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_opt, 1000)

In [3]:
# Directory (relative to the working directory) that holds the dataset's CSV
# files read below and where the intermediate feather file is written.
PATH = 'data/ml-latest-small'

In [4]:
# Load the ratings table (one row per user/movie rating) and preview it.
ratings_df = pd.read_csv(f'{PATH}/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# Load the movie metadata table (movieId, title, pipe-separated genres) and preview it.
movies_df = pd.read_csv(f'{PATH}/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Attach each movie's title and genres to its ratings. The left join keeps
# every rating row even when its movieId has no match in movies_df.
# The raw title is renamed to `title_yr` because it still embeds the release year.
combined_df = (
    ratings_df
    .merge(movies_df, how='left', on='movieId')
    .rename(columns={'title': 'title_yr'})
)
combined_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title_yr,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,1,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical
2,1,1061,3.0,1260759182,Sleepers (1996),Thriller
3,1,1129,2.0,1260759185,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller
4,1,1172,4.0,1260759205,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama


In [7]:
# `timestamp` is interpreted as seconds (unit='s') and converted to a
# datetime column named `reviewDate`.
combined_df['reviewDate'] = pd.to_datetime(combined_df['timestamp'], unit='s')
combined_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title_yr,genres,reviewDate
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama,2009-12-14 02:52:24
1,1,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical,2009-12-14 02:52:59
2,1,1061,3.0,1260759182,Sleepers (1996),Thriller,2009-12-14 02:53:02
3,1,1129,2.0,1260759185,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,2009-12-14 02:53:05
4,1,1172,4.0,1260759205,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,2009-12-14 02:53:25


In [8]:
# Split "Title (YYYY)" into separate `title` and `year` columns.
# - The pattern is a raw string: `\(` and `\d` are regex escapes, not Python
#   string escapes (a non-raw literal triggers an invalid-escape warning).
# - The greedy `.*` backtracks to the LAST "(<digits>", so parenthesised
#   alternate titles such as "Cinema Paradiso (Nuovo cinema Paradiso) (1989)"
#   stay inside `title`.
# - Rows whose title contains no "(<digits>" get NaN in both columns.
movie_yr_df = combined_df.title_yr.str.extract(r'^(?P<title>.*)\((?P<year>\d+)', expand=False)
# The `title` capture includes the space that precedes "(", so trim it.
movie_yr_df['title'] = movie_yr_df['title'].str.strip()
movie_yr_df.head()

Unnamed: 0,title,year
0,Dangerous Minds,1995
1,Dumbo,1941
2,Sleepers,1996
3,Escape from New York,1981
4,Cinema Paradiso (Nuovo cinema Paradiso),1989


In [9]:
# Add the parsed `title` / `year` columns next to the existing ones.
# Both frames share the same row index, so an index-aligned join lines them up.
combined_df = combined_df.join(movie_yr_df)
combined_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title_yr,genres,reviewDate,title,year
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama,2009-12-14 02:52:24,Dangerous Minds,1995
1,1,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical,2009-12-14 02:52:59,Dumbo,1941
2,1,1061,3.0,1260759182,Sleepers (1996),Thriller,2009-12-14 02:53:02,Sleepers,1996
3,1,1129,2.0,1260759185,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,2009-12-14 02:53:05,Escape from New York,1981
4,1,1172,4.0,1260759205,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,2009-12-14 02:53:25,Cinema Paradiso (Nuovo cinema Paradiso),1989


In [10]:
# Every row's genre string is pipe-separated. Join all rows into one long
# pipe-separated string, split it back into individual genre tokens, then
# reduce to the sorted set of distinct genre names.
all_genres_joined = combined_df.genres.str.cat(sep='|')
genres = all_genres_joined.split('|')
u_genres = np.unique(genres)

display(u_genres)

array(['(no genres listed)', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary',
       'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
       'Thriller', 'War', 'Western'],
      dtype='<U18')

In [11]:
# One-hot encode genres: add one 0/1 integer column per genre name.
# - The "(no genres listed)" placeholder is excluded by name instead of by
#   position, so the result does not depend on how the names happen to sort.
# - regex=False: genre names are literal substrings, not regex patterns.
for g in u_genres:
    if g == '(no genres listed)':
        continue
    combined_df[g] = combined_df.genres.str.contains(g, regex=False).astype(int)

combined_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title_yr,genres,reviewDate,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama,2009-12-14 02:52:24,Dangerous Minds,1995,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical,2009-12-14 02:52:59,Dumbo,1941,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
2,1,1061,3.0,1260759182,Sleepers (1996),Thriller,2009-12-14 02:53:02,Sleepers,1996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1,1129,2.0,1260759185,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,2009-12-14 02:53:05,Escape from New York,1981,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
4,1,1172,4.0,1260759205,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,2009-12-14 02:53:25,Cinema Paradiso (Nuovo cinema Paradiso),1989,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [12]:
# These raw columns have been replaced by derived ones
# (reviewDate, title/year, per-genre indicator columns), so remove them.
combined_df.drop(['timestamp', 'title_yr', 'genres'], axis=1, inplace=True)

In [13]:
# Preview the frame after the raw timestamp/title_yr/genres columns were dropped.
combined_df.head()

Unnamed: 0,userId,movieId,rating,reviewDate,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,31,2.5,2009-12-14 02:52:24,Dangerous Minds,1995,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,1029,3.0,2009-12-14 02:52:59,Dumbo,1941,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0
2,1,1061,3.0,2009-12-14 02:53:02,Sleepers,1996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,1,1129,2.0,2009-12-14 02:53:05,Escape from New York,1981,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
4,1,1172,4.0,2009-12-14 02:53:25,Cinema Paradiso (Nuovo cinema Paradiso),1989,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [14]:
# Per-column summary (counts, uniques, missing values, inferred type), transposed
# so each column of combined_df is one row; used here to spot missing values.
display(DataFrameSummary(combined_df).summary().T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,counts,uniques,missing,missing_perc,types
userId,100004.0,347.011,195.164,1.0,182.0,367.0,520.0,671.0,100004,671,0,0%,numeric
movieId,100004.0,12548.7,26369.2,1.0,1028.0,2406.5,5418.0,163949.0,100004,9066,0,0%,numeric
rating,100004.0,3.54361,1.05806,0.5,3.0,4.0,4.0,5.0,100004,10,0,0%,numeric
reviewDate,,,,,,,,,100004,78141,0,0%,date
title,,,,,,,,,100001,8833,3,0.00%,categorical
year,,,,,,,,,100001,103,3,0.00%,categorical
Action,100004.0,0.270549,0.444246,0.0,0.0,0.0,1.0,1.0,100004,2,0,0%,bool
Adventure,100004.0,0.220161,0.414357,0.0,0.0,0.0,0.0,1.0,100004,2,0,0%,bool
Animation,100004.0,0.0616975,0.240607,0.0,0.0,0.0,0.0,1.0,100004,2,0,0%,bool
Children,100004.0,0.0867965,0.281538,0.0,0.0,0.0,0.0,1.0,100004,2,0,0%,bool


In [15]:
# Remove the rows for which no release year was extracted from the title.
# To inspect the affected rows first: combined_df.loc[combined_df.title.isnull()]
combined_df.dropna(subset=['year'], inplace=True)

In [16]:
# Re-run the per-column summary to confirm that `title` and `year` no longer
# report missing values after the dropna above.
display(DataFrameSummary(combined_df).summary().T)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,counts,uniques,missing,missing_perc,types
userId,100001.0,347.009,195.162,1.0,182.0,367.0,520.0,671.0,100001,671,0,0%,numeric
movieId,100001.0,12544.5,26358.4,1.0,1028.0,2406.0,5418.0,163949.0,100001,9063,0,0%,numeric
rating,100001.0,3.5436,1.05806,0.5,3.0,4.0,4.0,5.0,100001,10,0,0%,numeric
reviewDate,,,,,,,,,100001,78138,0,0%,date
title,,,,,,,,,100001,8833,0,0%,categorical
year,,,,,,,,,100001,103,0,0%,categorical
Action,100001.0,0.270557,0.44425,0.0,0.0,0.0,1.0,1.0,100001,2,0,0%,bool
Adventure,100001.0,0.220168,0.414362,0.0,0.0,0.0,0.0,1.0,100001,2,0,0%,bool
Animation,100001.0,0.0616994,0.24061,0.0,0.0,0.0,0.0,1.0,100001,2,0,0%,bool
Children,100001.0,0.0867991,0.281542,0.0,0.0,0.0,0.0,1.0,100001,2,0,0%,bool


## Additional feature engineering

In [17]:
# Expand `reviewDate` into date-part feature columns (reviewYear, reviewMonth,
# reviewWeek, ... reviewElapsed) directly on combined_df; drop=False keeps the
# original `reviewDate` column alongside the new ones.
add_datepart(combined_df, 'reviewDate', drop=False)
combined_df.head()

Unnamed: 0,userId,movieId,rating,reviewDate,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,reviewYear,reviewMonth,reviewWeek,reviewDay,reviewDayofweek,reviewDayofyear,reviewIs_month_end,reviewIs_month_start,reviewIs_quarter_end,reviewIs_quarter_start,reviewIs_year_end,reviewIs_year_start,reviewElapsed
0,1,31,2.5,2009-12-14 02:52:24,Dangerous Minds,1995,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,1260759144
1,1,1029,3.0,2009-12-14 02:52:59,Dumbo,1941,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,1260759179
2,1,1061,3.0,2009-12-14 02:53:02,Sleepers,1996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,1260759182
3,1,1129,2.0,2009-12-14 02:53:05,Escape from New York,1981,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,1260759185
4,1,1172,4.0,2009-12-14 02:53:25,Cinema Paradiso (Nuovo cinema Paradiso),1989,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,1260759205


In [18]:
# Constant placeholder column; it is the only entry of `cont_vars` further down.
# NOTE(review): presumably needed because the columnar model data expects at
# least one continuous field — confirm against the forum thread below.
# see http://forums.fast.ai/t/understanding-columnarmodeldata-from-data-frame-from-rossman/8140/4?u=wgpubs
combined_df['dummy_cont'] = 0

In [19]:
# Persist the engineered frame to feather. Rows were dropped earlier, so the row
# index has gaps; reset_index() produces a default index for the written frame and
# keeps the old labels in an `index` column (the reload cell drops that column).
# The reset is NOT done in place, so re-running this cell leaves combined_df
# untouched instead of stacking up extra index columns.
combined_df.reset_index().to_feather(f'{PATH}/combined_df')

In [20]:
# Reload the engineered frame from disk and discard the `index` column that
# was written along with it.
combined_df = pd.read_feather(f'{PATH}/combined_df').drop('index', axis=1)
combined_df.head()

Unnamed: 0,userId,movieId,rating,reviewDate,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,reviewYear,reviewMonth,reviewWeek,reviewDay,reviewDayofweek,reviewDayofyear,reviewIs_month_end,reviewIs_month_start,reviewIs_quarter_end,reviewIs_quarter_start,reviewIs_year_end,reviewIs_year_start,reviewElapsed,dummy_cont
0,1,31,2.5,2009-12-14 02:52:24,Dangerous Minds,1995,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,1260759144,0
1,1,1029,3.0,2009-12-14 02:52:59,Dumbo,1941,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,1260759179,0
2,1,1061,3.0,2009-12-14 02:53:02,Sleepers,1996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,1260759182,0
3,1,1129,2.0,2009-12-14 02:53:05,Escape from New York,1981,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,1260759185,0
4,1,1172,4.0,2009-12-14 02:53:25,Cinema Paradiso (Nuovo cinema Paradiso),1989,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,1260759205,0


In [21]:
# Every column is treated as categorical except the target (`rating`), the raw
# datetime, its elapsed-seconds encoding, and the placeholder continuous column.
non_categorical = {'rating', 'reviewDate', 'reviewElapsed', 'dummy_cont'}
cat_vars = [col for col in combined_df.columns if col not in non_categorical]
cont_vars = ['dummy_cont']

In [22]:
# Display the categorical and continuous column lists for a visual check.
cat_vars, cont_vars

(['userId',
  'movieId',
  'title',
  'year',
  'Action',
  'Adventure',
  'Animation',
  'Children',
  'Comedy',
  'Crime',
  'Documentary',
  'Drama',
  'Fantasy',
  'Film-Noir',
  'Horror',
  'IMAX',
  'Musical',
  'Mystery',
  'Romance',
  'Sci-Fi',
  'Thriller',
  'War',
  'Western',
  'reviewYear',
  'reviewMonth',
  'reviewWeek',
  'reviewDay',
  'reviewDayofweek',
  'reviewDayofyear',
  'reviewIs_month_end',
  'reviewIs_month_start',
  'reviewIs_quarter_end',
  'reviewIs_quarter_start',
  'reviewIs_year_end',
  'reviewIs_year_start'],
 ['dummy_cont'])

In [23]:
# Cast features to the dtypes used downstream: categorical columns become
# ordered pandas categoricals, continuous columns become float32.
for v in cat_vars:
    combined_df[v] = combined_df[v].astype('category').cat.as_ordered()
for v in cont_vars:
    combined_df[v] = combined_df[v].astype('float32')

dep_var = 'rating'

# Select the model columns and index the result by review date.
# set_index() is chained (it returns a new frame) rather than applied with
# inplace=True to a column selection of combined_df, which can raise
# SettingWithCopyWarning.
train_raw_df = combined_df[cat_vars + cont_vars + ['reviewDate', dep_var]].set_index('reviewDate')
train_raw_df.head()

Unnamed: 0_level_0,userId,movieId,title,year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,reviewYear,reviewMonth,reviewWeek,reviewDay,reviewDayofweek,reviewDayofyear,reviewIs_month_end,reviewIs_month_start,reviewIs_quarter_end,reviewIs_quarter_start,reviewIs_year_end,reviewIs_year_start,dummy_cont,rating
reviewDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1
2009-12-14 02:52:24,1,31,Dangerous Minds,1995,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,0.0,2.5
2009-12-14 02:52:59,1,1029,Dumbo,1941,0,0,1,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,0.0,3.0
2009-12-14 02:53:02,1,1061,Sleepers,1996,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,0.0,3.0
2009-12-14 02:53:05,1,1129,Escape from New York,1981,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,0.0,2.0
2009-12-14 02:53:25,1,1172,Cinema Paradiso (Nuovo cinema Paradiso),1989,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2009,12,51,14,0,348,False,False,False,False,False,False,0.0,4.0


In [24]:
# Split the frame into model inputs (train_df) and the target values (y);
# `nas` reports the columns for which missing-value handling was applied.
# do_scale=False: no scaling is requested, so no scaling mapper is returned
# (the only continuous column, `dummy_cont`, is a constant placeholder).
# The target is passed via `dep_var` so it stays in sync with the column
# selection that built train_raw_df.
train_df, y, nas = proc_df(train_raw_df, dep_var, do_scale=False)

n = len(train_df)
n, len(y), nas

(100001, 100001, {})

In [25]:
# Row indices (out of the n training rows) to hold out for validation.
# NOTE(review): presumably a random sample with the helper's default
# fraction/seed — confirm against get_cv_idxs.
val_idxs = get_cv_idxs(n)

In [26]:
# Output range handed to the learner as `y_range`.
# The ratings in this dataset run from 0.5 to 5.0 (see the summary table earlier).
y_range = [0.0, 5.0]

In [27]:
# Build the model-data object from the processed frame: val_idxs selects the
# validation rows, cat_flds names the categorical columns, bs is the batch size.
md = ColumnarModelData.from_data_frame(PATH, val_idxs, train_df, y, cat_flds=cat_vars, bs=128)

In [62]:
# (column name, size) for each categorical feature, where size is the number
# of distinct categories plus one extra slot.
cat_szs = []
for col in cat_vars:
    n_levels = len(train_raw_df[col].cat.categories)
    cat_szs.append((col, n_levels + 1))
cat_szs

[('userId', 672),
 ('movieId', 9064),
 ('title', 8834),
 ('year', 104),
 ('Action', 3),
 ('Adventure', 3),
 ('Animation', 3),
 ('Children', 3),
 ('Comedy', 3),
 ('Crime', 3),
 ('Documentary', 3),
 ('Drama', 3),
 ('Fantasy', 3),
 ('Film-Noir', 3),
 ('Horror', 3),
 ('IMAX', 3),
 ('Musical', 3),
 ('Mystery', 3),
 ('Romance', 3),
 ('Sci-Fi', 3),
 ('Thriller', 3),
 ('War', 3),
 ('Western', 3),
 ('reviewYear', 23),
 ('reviewMonth', 13),
 ('reviewWeek', 54),
 ('reviewDay', 32),
 ('reviewDayofweek', 8),
 ('reviewDayofyear', 367),
 ('reviewIs_month_end', 3),
 ('reviewIs_month_start', 3),
 ('reviewIs_quarter_end', 3),
 ('reviewIs_quarter_start', 3),
 ('reviewIs_year_end', 3),
 ('reviewIs_year_start', 3)]

In [65]:
# (cardinality, embedding width) per categorical feature: the width is half the
# cardinality, rounded up, and capped at 50.
cardinalities = [card for _name, card in cat_szs]
emb_szs = [(card, min(50, (card + 1) // 2)) for card in cardinalities]
emb_szs

[(672, 50),
 (9064, 50),
 (8834, 50),
 (104, 50),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (23, 12),
 (13, 7),
 (54, 27),
 (32, 16),
 (8, 4),
 (367, 50),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2),
 (3, 2)]

In [91]:
# Create the learner from the model data.
# The second argument is the number of non-categorical input columns
# (all columns of train_df minus the categorical ones).
# NOTE(review): the remaining positional arguments are presumably embedding
# dropout (0.04), output size (1), hidden layer sizes ([1000, 500]) and
# per-layer dropout ([0.001, 0.01]) — confirm against get_learner's signature.
learner = md.get_learner(emb_szs, len(train_df.columns)-len(cat_vars), 
                  0.04, 1, [1000, 500], [0.001, 0.01], y_range=y_range)

In [92]:
# learner.lr_find()

In [93]:
# learner.sched.plot()

In [94]:
# Learning rate used by the fit calls below. It is set by hand: the lr_find /
# sched.plot cells above are commented out.
lr = 1e-3

In [95]:
# First training run: 3 cycles, each 1 epoch long (cycle_len=1), with weight
# decay 0.025 and the weight-decay scheduler enabled.
learner.fit(lrs=lr, n_cycle=3, wds=0.025, use_wd_sched=True, cycle_len=1)

[ 0.       0.77082  0.78661]                                 
[ 1.       0.74146  0.76414]                                 
[ 2.       0.68949  0.76075]                                 



In [96]:
# Second training run on the same learner: a single cycle spanning 3 epochs
# (cycle_len=3), same learning rate and weight-decay settings as the first run.
learner.fit(lrs=lr, n_cycle=1, wds=0.025, use_wd_sched=True, cycle_len=3)

[ 0.       0.69963  0.77206]                                 
[ 1.       0.6333   0.75179]                                 
[ 2.       0.57193  0.77346]                                 



In [97]:
# Get the model's predictions together with the matching target values.
# NOTE(review): presumably computed on the validation set — confirm.
preds, actuals = learner.predict_with_targs()

In [100]:
# Spot-check the first 10 predictions (rounded to one decimal) against the
# actual ratings.
for row in range(10):
    rounded_pred = np.round(preds[row], 1)
    print(rounded_pred, actuals[row])

[ 3.1] [ 3.5]
[ 3.7] [ 4.]
[ 3.3] [ 3.]
[ 4.] [ 1.]
[ 3.4] [ 4.]
[ 3.6] [ 4.]
[ 3.5] [ 3.]
[ 3.7] [ 3.]
[ 4.2] [ 4.]
[ 3.8] [ 4.]


In [46]:
# NOTE: a stray bare reference to the name `tmp` was removed from this cell.
# `tmp` is never assigned anywhere in this notebook (it is left over from a
# deleted cell, and this cell's execution count is out of order), so evaluating
# it raises NameError on Restart Kernel -> Run All.

<zip at 0x7f13c0597908>