In [19]:
import pandas as pd
import numpy as np
import requests
import zipfile
from tqdm import tqdm
import math
import os
import tensorflow as tf
import tensorflow.keras as keras
import unittest

In [2]:
def genre_to_int_list(genre_string):
    """
    Convert the list of genre names to a list of integer codes
    Args:
        genre_string: a string of genres names.
    """
    GENRES = ('Action', 'Adventure', 'Animation', 'Children', 'Comedy',
              'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
              'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller',
              'War', 'Western')
    GENRES_LC = tuple((x.lower() for x in GENRES))
    # convert to lower case
    genre_string_lc = genre_string.lower()
    genre_list = []
    for idx in range(len(GENRES_LC)):
        if GENRES_LC[idx] in genre_string_lc:
            genre_list.append(1)
        else:
            genre_list.append(0)
    return genre_list

In [3]:
def create_csv_data(source_dir='ml-1m', target_dir='movielens'):
    """
    convert original data to csv file and movie to the 'movielens' folder
    args:
        source: source_dir .dat directory
        target: target_dir .csv directory
    """
    if source_dir == 'ml-100k':
        convert_100k(source_dir, target_dir)
    if source_dir == 'ml-1m':
        convert_1m(source_dir, target_dir)

In [4]:
def convert_1m(source_dir, target_dir):
    # set input and output directories
    source_data = source_dir + '/ratings.dat'
    source_movie = source_dir + '/movies.dat'
    target_data = target_dir + '/ml-1m.ratings'
    target_movie = target_dir + '/ml-1m.movies'

    # from source data to target data
    col_names = ['uid', 'mid', 'rating', 'timestamp']
    data = pd.read_csv(source_data, sep='::', names=col_names)
    data.to_csv(target_data, index=False)

    # from source movies to target movies
    col_names = ['mid', 'title', 'genre']
    movies = pd.read_csv(source_movie, sep='::', names=col_names)

In [6]:
source_dir='ml-1m'
target_dir='movielens'
# set input and output directories
source_data = source_dir + '/ratings.dat'
source_movie = source_dir + '/movies.dat'
target_data = target_dir + '/ml-1m.ratings'
target_movie = target_dir + '/ml-1m.movies'

In [15]:
# from source data to target data
col_names = ['uid', 'mid', 'rating', 'timestamp']
data = pd.read_csv(source_data, sep='::', names=col_names)
data.to_csv(target_data, index=False)

# from source movies to target movies
col_names = ['mid', 'title', 'genre']
movies = pd.read_csv(source_movie, sep='::', names=col_names)

  This is separate from the ipykernel package so we can avoid doing imports until
  


In [16]:
# Processing genre column
new_df = movies['genre'].apply(genre_to_int_list)
new_df

0       [0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
2       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
3       [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
5       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
6       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
7       [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
9       [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
10      [0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...
11      [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
12      [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
13      [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
14      [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
15      [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
16      [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...
17      [0, 0,

In [24]:
a = new_df.apply(lambda x:pd.Series(x))

In [28]:
a.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,mid
0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,2
2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,3
3,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,4
4,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5


In [27]:
a['mid'] = a.index + 1

In [36]:
col_names = ['Action', 'Adventure',
                     'Animation', 'Children', 'Comedy', 'Crime',
                     'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
                     'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                     'Thriller', 'War', 'Western', 'mid']
a.columns = col_names
a = a[['mid', 'Action', 'Adventure',
                     'Animation', 'Children', 'Comedy', 'Crime',
                     'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
                     'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
                     'Thriller', 'War', 'Western']]

In [37]:
a.head()

Unnamed: 0,mid,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [2]:
user_input = tf.random_uniform(shape=(100,1), maxval=1000, dtype=tf.int32)
item_input = tf.random_uniform(shape=(100,1), maxval=1000, dtype=tf.int32)

In [22]:
user_input.shape == item_input.shape

True