### References
* https://arxiv.org/abs/1606.07792
* https://grouplens.org/datasets/movielens/1m/
* https://keras.io/examples/structured_data/wide_deep_cross_networks/
* https://github.com/floraxhuang/Movie-Recommendation-System/blob/master/Deep%20and%20Wide%20Model.ipynb

In [8]:
import numpy as np
import pandas as pd
from collections import defaultdict

### [Optional] Download the MovieLens 1M dataset

In [None]:
import os
import wget
import zipfile

# Local target directory, archive path and remote location of MovieLens 1M.
dirPath = '../dataset'
zipFilePath = os.path.join(dirPath, 'ml-1m.zip')
remoteRrl = 'https://files.grouplens.org/datasets/movielens/ml-1m.zip'

# exist_ok makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs(dirPath, exist_ok=True)

# download (skipped when the archive is already on disk, so re-running the
# cell does not fetch the file again)
if not os.path.exists(zipFilePath):
    wget.download(remoteRrl, zipFilePath)

# unzip files
with zipfile.ZipFile(zipFilePath, 'r') as zipRef:
    zipRef.extractall(dirPath)

## Load dataset

In [2]:
# The MovieLens 1M files are '::'-separated and have no header row. A
# multi-character separator is only supported by the python parser engine.
# The encoding is pinned so decoding does not depend on the platform default:
# movies.dat is understood to contain ISO-8859-1 (latin-1) titles, which the
# UTF-8 default rejects with UnicodeDecodeError. latin-1 is ASCII-compatible,
# so plain-ASCII content is read exactly as before.
_dat_read_opts = dict(sep='::', engine='python', header=None, encoding='latin-1')

df_ratings = pd.read_csv(
    '../dataset/ml-1m/ratings.dat',
    names=['UserID', 'MovieID', 'Rating', 'Timestamp'],
    **_dat_read_opts,
)
df_movies = pd.read_csv(
    '../dataset/ml-1m/movies.dat',
    names=['MovieID', 'Title', 'Genres'],
    **_dat_read_opts,
)
df_users = pd.read_csv(
    '../dataset/ml-1m/users.dat',
    names=['UserID', 'Gender', 'Age', 'Occupation', 'ZipCode'],
    **_dat_read_opts,
)

# Number of distinct users / movies that actually appear in the ratings table.
numOfUsers = df_ratings.UserID.nunique()
numOfItems = df_ratings.MovieID.nunique()

In [3]:
# preprocessing
# Keep only the first listed genre of each movie (genres are '|'-separated).
df_movies['Genres'] = df_movies['Genres'].str.split('|').str[0]
# The release year is the parenthesised 4-digit suffix of the title,
# e.g. "Toy Story (1995)" -> 1995.
df_movies['MovieYear'] = (
    df_movies['Title'].str.extract(r'\((\d{4})\)\s*$', expand=False).astype(int)
)

# The Age column holds group codes equal to the lower bound of each group
# (1, 18, 25, 35, 45, 50, 56 per the dataset README). Bins are therefore
# left-closed with an open-ended last bin so every code gets its own label;
# with right-closed bins, codes 1 and 18 both fell into (0, 18] and were merged.
age_bins = [0, 18, 25, 35, 45, 50, 56]
age_labels = list(range(len(age_bins)))
df_users['Age'] = pd.cut(
    df_users['Age'], age_bins + [np.inf], labels=age_labels, right=False
)
# Assign the result back instead of calling replace(inplace=True) on a column
# selection, which does not update the frame under pandas copy-on-write.
df_users['Gender'] = df_users['Gender'].map({'F': 0, 'M': 1})

# Attach movie and user attributes to every rating, then drop the columns
# that are not used as features.
df_total = pd.merge(df_ratings, df_movies, how='left', on="MovieID")
df_total = pd.merge(df_total, df_users, how='left', on="UserID")
df_total.drop(columns=['Title', 'ZipCode', 'Timestamp'], inplace=True)

In [9]:
# Integer-encode every object-dtype column of df_total and remember, per
# column, the mapping from integer code back to the original label.
idx2name = defaultdict(dict)
object_columns = df_total.select_dtypes(include=['object']).columns
for column in object_columns:
    as_category = df_total[column].astype('category')
    idx2name[column] = dict(enumerate(as_category.cat.categories))
    # convert categories to numbers
    df_total[column] = as_category.cat.codes

In [10]:
# Preview the first rows of df_total.
df_total.head()

Unnamed: 0,UserID,MovieID,Rating,Genres,MovieYear,Gender,Age,Occupation
0,1,1193,5,7,1975,0,0,10
1,1,661,3,2,1996,0,0,10
2,1,914,3,11,1964,0,0,10
3,1,3408,4,7,2000,0,0,10
4,1,2355,5,2,1998,0,0,10


In [None]:
# Feature groups used to build the model inputs.
# Columns fed to the wide part.
wide_cols = [
    'UserID',
    'MovieID',
    'MovieYear',
    'Gender',
    'Age',
    'Occupation',
]
# Columns that get an embedding, mapped to an integer setting per column
# (presumably the embedding size -- confirm against the model code).
embeddings_cols = dict(UserID=100, MovieID=100, Genres=4)
# Columns passed to the deep part as plain numeric values.
continuous_cols = [
    'MovieYear',
    'Gender',
    'Age',
    'Occupation',
]
# Deep part input: embedding columns first, then the continuous ones.
deep_cols = [*embeddings_cols, *continuous_cols]
# No crossed-feature columns are configured (crosses_cols is left undefined).
target = 'Rating'

In [None]:
# Label vector: the Rating column of df_total copied into a NumPy array.
Y = np.array(df_total.loc[:, 'Rating'])

In [None]:
# Union of the wide and deep feature columns, de-duplicated in first-seen
# order. dict.fromkeys is used instead of set() because the iteration order
# of a set of strings changes between interpreter runs, which made the
# column order of df_tmp non-reproducible.
all_cols = list(dict.fromkeys(wide_cols + deep_cols))
# Select first, then copy, so only the needed columns are duplicated.
df_tmp = df_total[all_cols].copy()

In [None]:
# Show the integer-code -> genre-name mapping recorded while encoding the
# 'Genres' column (the bare name `idx2name_generes` is never defined).
idx2name['Genres']