In [1]:
import numpy as np
import pandas as pd
import keras
import datetime as dt
import time

Using TensorFlow backend.


# First, we get sessions info

In [2]:
# Load the raw ratings table from disk and preview its first rows.
ratings_path = './data/ratings.csv'
ratings_df = pd.read_csv(ratings_path)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [3]:
def _to_local_epoch(date_str, fmt="%d/%m/%Y"):
    """Convert a day/month/year date string into epoch seconds.

    The date is parsed at midnight and converted with ``time.mktime``,
    which interprets the value in the machine's *local* time zone.
    NOTE(review): the split boundaries therefore shift with the time zone of
    the machine running the notebook; the dataset timestamps are presumably
    UTC epoch seconds -- confirm and consider a UTC conversion.
    """
    return time.mktime(dt.datetime.strptime(date_str, fmt).timetuple())


def _sessions_in_window(ratings, start_ts, end_ts, min_items=6, max_items=100):
    """Return the (userId, movieId, timestamp) rows that fall inside a window.

    Parameters
    ----------
    ratings : DataFrame with at least 'userId', 'rating' and 'timestamp' columns.
    start_ts, end_ts : inclusive window bounds, in epoch seconds.
    min_items, max_items : inclusive bounds on the number of in-window rows a
        user must have to be kept (defaults keep users with 6..100 rows,
        i.e. 5 < rated_movies < 101).

    The 'rating' column is dropped; the per-user count is computed *after*
    the date filter, so it reflects activity inside the window only.
    """
    interactions = ratings.drop(['rating'], axis=1)
    in_window = interactions.loc[
        (interactions['timestamp'] >= start_ts) & (interactions['timestamp'] <= end_ts)
    ]
    return in_window.groupby("userId").filter(
        lambda user_rows: min_items <= len(user_rows) <= max_items
    )


# All dates below are written as day/month/year. Each end bound is midnight at
# the *start* of the end date, so that calendar day is effectively excluded.

# ALL TRAIN SET
# Ratings from 9 January 1995 up to 1 March 2013
all_train_start = "09/01/1995"
all_train_end = "01/03/2013"
all_train_start_ts = _to_local_epoch(all_train_start)
all_train_end_ts = _to_local_epoch(all_train_end)
all_train_data = _sessions_in_window(ratings_df, all_train_start_ts, all_train_end_ts)

# RECENT TRAIN SET
# Ratings from 1 January 2008 up to 1 March 2013
train_start = "01/01/2008"
train_end = "01/03/2013"
train_start_ts = _to_local_epoch(train_start)
train_end_ts = _to_local_epoch(train_end)
train_data = _sessions_in_window(ratings_df, train_start_ts, train_end_ts)

# DEV SET
# Ratings from 1 April 2013 up to 1 April 2014
# NOTE(review): ratings made during March 2013 fall in neither the train nor
# the dev window -- confirm this gap is intended.
dev_start = "01/04/2013"
dev_end = "01/04/2014"
dev_start_ts = _to_local_epoch(dev_start)
dev_end_ts = _to_local_epoch(dev_end)
dev_data = _sessions_in_window(ratings_df, dev_start_ts, dev_end_ts)

# TEST SET
# Ratings from 2 April 2014 up to 1 April 2015
# NOTE(review): ratings made during 1 April 2014 (after midnight) fall in
# neither the dev nor the test window -- confirm this gap is intended.
test_start = "02/04/2014"
test_end = "01/04/2015"
test_start_ts = _to_local_epoch(test_start)
test_end_ts = _to_local_epoch(test_end)
test_data = _sessions_in_window(ratings_df, test_start_ts, test_end_ts)

In [5]:
# For each split in turn (all-train, recent-train, dev, test) print its
# (rows, columns) shape followed by its number of distinct users.
for split_frame in (all_train_data, train_data, dev_data, test_data):
    print(split_frame.shape)
    print(len(split_frame['userId'].unique()))

(3749420, 3)
80466
(884615, 3)
19850
(225691, 3)
5747
(215606, 3)
5270


In [6]:
# Give every split the same session-style column names, then preview the
# recent-train split.
session_columns = ['SessionId', 'ItemId', 'Time']
for split_frame in (all_train_data, train_data, dev_data, test_data):
    split_frame.columns = list(session_columns)
train_data.head()

Unnamed: 0,SessionId,ItemId,Time
1940,18,186,1267347706
1954,18,858,1236356241
1955,18,912,1283426281
1960,18,1221,1236356224
1961,18,1230,1236293194


In [7]:
# Write each split to its own tab-separated file, without the index column.
for split_frame, out_path in (
    (all_train_data, './data/all_train.csv'),
    (train_data, './data/train.csv'),
    (dev_data, './data/dev.csv'),
    (test_data, './data/test.csv'),
):
    split_frame.to_csv(out_path, sep='\t', index=False)

# Now, we extract semantic info

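## We don't care about each user's individual perception of a film. We aggregate all the tag and genre information associated with each film. Text is tokenized, lowercased and filtered (stopwords and non-dictionary words are removed).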

### The model receives each film's text vector and encodes it using GloVe. We then use this encoding as an input feature when training the whole model. The idea is to capture how the community feels about each film, and to use that information to make better recommendations.

### Also, during the course of a "session", in this case, it might be that particular films make a big impact on the next one to watch, or maybe down the line. An attention mechanism might be able to detect such patterns and then act on them.

In [12]:
# Read the tags table and keep only the movie id and the tag text.
tags_df = pd.read_csv('./data/tags.csv').loc[:, ['movieId', 'tag']]
tags_df.head()

Unnamed: 0,movieId,tag
0,4141,Mark Waters
1,208,dark hero
2,353,dark hero
3,521,noir thriller
4,592,dark hero


In [13]:
# Read the movies table and keep only the movie id and its genre string.
genres_df = pd.read_csv('./data/movies.csv').loc[:, ['movieId', 'genres']]
genres_df.head()

Unnamed: 0,movieId,genres
0,1,Adventure|Animation|Children|Comedy|Fantasy
1,2,Adventure|Children|Fantasy
2,3,Comedy|Romance
3,4,Comedy|Drama|Romance
4,5,Comedy


In [117]:
# From tags take movieId, tag, then join all.

# From genres, take movieId, genres (each one, 
# separate by whitespace), then join all

# Then merge both tables
import re
from collections import defaultdict
import nltk
from nltk.corpus import stopwords

# Fetch the two NLTK corpora used for filtering tags (a no-op when they are
# already installed), then load each one into a set for fast membership tests.
for corpus_name in ('stopwords', 'words'):
    nltk.download(corpus_name)

stops = set(stopwords.words("english"))
engwords = set(nltk.corpus.words.words())



[nltk_data] Downloading package stopwords to
[nltk_data]     /home/pcerdam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/pcerdam/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [128]:
# Map each movieId to its genres as a single space-separated string,
# e.g. "Adventure|Children|Fantasy" -> "Adventure Children Fantasy".
genres_dict = defaultdict(str)

# One pass over the two columns instead of re-filtering the whole frame once
# per movie id; reading the columns by name also avoids positional indexing
# into a label-indexed row Series.
for movie_id, raw_genres in zip(genres_df['movieId'].tolist(), genres_df['genres'].tolist()):
    genre_text = " ".join(str(raw_genres).split("|"))
    if movie_id in genres_dict:
        # A movie id that appears on several rows keeps its genre lists
        # separated by a space rather than fused into one token.
        genres_dict[movie_id] += " " + genre_text
    else:
        genres_dict[movie_id] = genre_text

In [134]:
# Merge genre info with all tags into text_dict: movieId -> set of lowercase words.
text_dict = defaultdict(set)

# Group the raw tag strings by movie in a single pass, rather than filtering
# the whole tags frame once per movie id.
tags_by_movie = defaultdict(list)
for movie_id, raw_tag in zip(tags_df['movieId'].tolist(), tags_df['tag'].tolist()):
    tags_by_movie[movie_id].append(raw_tag)

# Walk the movies in genres_df order so entries are created in that order.
for movie_id in genres_df['movieId'].unique().tolist():
    if movie_id not in tags_by_movie:
        # Movies without any tag row get no entry at all.
        continue

    vocabulary = text_dict[movie_id]
    for raw_tag in tags_by_movie[movie_id]:
        if pd.isna(raw_tag):
            # A missing tag must not be tokenised as the literal word "nan".
            continue
        # Split on runs of non-word characters; filter(None, ...) drops the
        # empty strings produced at the edges of the split.
        for token in filter(None, re.split(r"\W+", str(raw_tag))):
            word = token.lower()
            # Keep dictionary words longer than two characters that are not stopwords.
            if len(token) > 2 and word not in stops and word in engwords:
                vocabulary.add(word)

    # Add the movie's genres (lowercased) alongside its tag words.
    vocabulary.update(genre.lower() for genre in genres_dict[movie_id].split(" "))

# Describe the dataset: average number of tags, etc. IMPORTANT.

## Some examples:

In [137]:
# Example: the word set collected for movieId 329 (Star Trek, Generations).
# text_dict is a defaultdict, so looking up an id that has no entry would
# insert an empty set for it.
text_dict[329]

{'adventure',
 'based',
 'beloved',
 'brent',
 'character',
 'drama',
 'enterprise',
 'far',
 'franchise',
 'future',
 'gene',
 'movie',
 'odd',
 'pointlessly',
 'ruined',
 'sci-fi',
 'seen',
 'sequel',
 'show',
 'space',
 'star',
 'time',
 'travel',
 'trek',
 'video'}

In [138]:
# Example: the word set collected for movieId 592 (Batman).
# text_dict is a defaultdict, so looking up an id that has no entry would
# insert an empty set for it.
text_dict[592]

{'action',
 'alter',
 'atmospheric',
 'based',
 'batman',
 'best',
 'bizarre',
 'bleak',
 'blow',
 'book',
 'bookie',
 'burton',
 'city',
 'classic',
 'comic',
 'creepy',
 'crime',
 'critic',
 'dark',
 'design',
 'dimensional',
 'double',
 'dreamlike',
 'ego',
 'ending',
 'eric',
 'film',
 'franchise',
 'future',
 'get',
 'good',
 'goodness',
 'great',
 'hero',
 'honest',
 'jack',
 'joker',
 'kim',
 'lavish',
 'life',
 'moody',
 'movie',
 'noir',
 'nostalgia',
 'paced',
 'realistic',
 'revenge',
 'romance',
 'see',
 'seen',
 'series',
 'serious',
 'slow',
 'spherical',
 'star',
 'super',
 'superhero',
 'tense',
 'thriller',
 'top',
 'vigilante',
 'villain',
 'want',
 'way'}

In [140]:
# Save data
import pickle

# Serialise the per-movie word sets to disk as a binary pickle.
with open("./data/tags.pickle", 'wb') as pickle_out:
    pickle.dump(text_dict, pickle_out)