# Learning the Embedding with the use of Neural Networks
We'll use Neural Collaborative Filtering (https://arxiv.org/pdf/1708.05031.pdf) to create a NN that learns the embedding of the data.

## Necessary imports

In [1]:
import pandas as pd
import re
import numpy as np
from datetime import datetime
## for plotting
import matplotlib.pyplot as plt
import seaborn as sns
## for machine learning
from sklearn import preprocessing, metrics

## Importing the data

In [2]:
# Load the movie catalogue from the Excel export.
movies = pd.read_excel(io="data/movie_e.xlsx")

In [3]:
# Show the freshly loaded movies frame as the cell output.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund fÃ¼r's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [4]:
# Load the user ratings from CSV.
user_rating = pd.read_csv(filepath_or_buffer="data/rating.csv")

In [5]:
# Show the freshly loaded ratings frame as the cell output.
user_rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


#### Prepare movies data:

In [6]:
# Build the per-movie feature columns: entry_id, movie_name, date and old.

# Drop movies without genre information. The explicit .copy() makes the result
# an independent frame, so the column assignments below do not raise
# SettingWithCopyWarning.
movies = movies[movies["genres"].notna()].copy()

# Dense 0-based id for every remaining movie.
movies["entry_id"] = range(len(movies))

# Matches a parenthesised four-digit release year such as "(1995)".
# Requiring exactly four digits keeps numeric title fragments like "(500)"
# in the name and never matches an empty "()".
regexp = re.compile(r"\((\d{4})\)")

# Title with the release year removed.
movies["movie_name"] = (
    movies["title"].str.replace(regexp.pattern, "", regex=True).str.strip()
)

# Release year = last "(YYYY)" token in the title; NaN when the title has none.
release_year = movies["title"].str.findall(regexp.pattern).str[-1]

# 9999 is the sentinel for "year unknown"; it is >= 2000, so such movies
# end up with old == 0 below.
movies["date"] = pd.to_numeric(release_year).fillna(9999)

# Flag movies released before 2000.
movies["old"] = (movies["date"] < 2000).astype("int64")

In [7]:
# Show the movies frame with the derived entry_id, movie_name, date and old columns.
movies

Unnamed: 0,movieId,title,genres,entry_id,movie_name,date,old
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0,Toy Story,1995.0,1
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,Jumanji,1995.0,1
2,3,Grumpier Old Men (1995),Comedy|Romance,2,Grumpier Old Men,1995.0,1
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,3,Waiting to Exhale,1995.0,1
4,5,Father of the Bride Part II (1995),Comedy,4,Father of the Bride Part II,1995.0,1
...,...,...,...,...,...,...,...
27273,131254,Kein Bund fÃ¼r's Leben (2007),Comedy,27273,Kein Bund fÃ¼r's Leben,2007.0,0
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,27274,"Feuer, Eis & Dosenbier",2002.0,0
27275,131258,The Pirates (2014),Adventure,27275,The Pirates,2014.0,0
27276,131260,Rentun Ruusu (2001),(no genres listed),27276,Rentun Ruusu,2001.0,0


#### Prepare user-rating data:

In [8]:
# Derive the `user` index by shifting userId down by one (vectorised; no
# per-row Python call).
user_rating["user"] = user_rating["userId"] - 1

# FOR CONTEXT_BASED:
# user_rating["timestamp"] = user_rating["timestamp"].apply(lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S"))
# user_rating["daytime"] = user_rating["timestamp"].apply(lambda x: 1 if 6<int(x.strftime("%H"))<20 else 0)
# user_rating["weekend"] = user_rating["timestamp"].apply(lambda x: 1 if x.weekday() in [5,6] else 0)

# Attach each movie's entry_id to its ratings. The join key is spelled out so
# the merge cannot silently pick up other shared column names. With a left
# join, ratings whose movieId is missing from `movies` keep a NaN entry_id.
user_rating = user_rating.merge(
    movies[["movieId", "entry_id"]], on="movieId", how="left"
)

# The rating is the prediction target, named `y` from here on.
user_rating = user_rating.rename(columns={"rating": "y"})

In [None]:
# Keep only the columns used from here on.
# NOTE: this cell rebinds `movies` with entry_id moved into the index, so
# running it a second time raises KeyError (entry_id is no longer a column).
movie_feature_cols = ["movie_name", "old", "genres"]
movies = movies.set_index("entry_id")[movie_feature_cols]

rating_cols = ["user", "entry_id", "timestamp", "y"]
user_rating = user_rating.loc[:, rating_cols]