In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import tensorflow as tf

  from ._conv import register_converters as _register_converters


## 1. Loading the data

In [22]:
users_title = ['userID', 'gender', 'age', 'occupationID', 'zip-code']
users_old = pd.read_csv('./ml-1m/users.dat', sep='::', header=None, names=users_title, engine = 'python')
users_old.head()

Unnamed: 0,userID,gender,age,occupationID,zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [23]:
movies_title = ['movieID', 'title', 'genres']
movies_old = pd.read_csv('./ml-1m/movies.dat', sep='::', header=None, names=movies_title, engine = 'python')
movies_old.head()

Unnamed: 0,movieID,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [24]:
ratings_title = ['userID','movieID', 'rating', 'timestamps']
ratings_old = pd.read_csv('./ml-1m/ratings.dat', sep='::', header=None, names=ratings_title, engine = 'python')
ratings_old.head()

Unnamed: 0,userID,movieID,rating,timestamps
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## 2. Dara Preprocessing:

In [25]:
users = pd.read_csv('./ml-1m/users.dat', sep='::', header=None, names=users_title, engine = 'python')

# Mapping gender to number[0, 1] 
users['gender'] = users['gender'].astype('category')
users['gender'] = users['gender'].cat.codes

# Mapping age range from age gourp to 0 - 7
age_mapping = {1: 0, 18: 1, 25: 2, 35: 3, 45: 4, 50: 5, 56: 6}
users['age'] = users['age'].map(age_mapping)

# Dropping the zip-code columm
# users.drop('zip-code')
users.drop('zip-code', axis=1, inplace=True)

print("User table after preprocessing")
users.head()

User table after preprocessing


Unnamed: 0,userID,gender,age,occupationID
0,1,0,0,10
1,2,1,6,16
2,3,1,2,15
3,4,1,4,7
4,5,1,2,20


In [27]:
movies = pd.read_csv('./ml-1m/movies.dat', sep='::', header=None, names=movies_title, engine = 'python')

# Mapping the genres to the fixed length (len = 18) padded list
genres_types = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime", 
                "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", 
                "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]

def genresToList(s):
    result = [genres_types.index(g) + 1 for g in s.split("|")]
    result = result + [0 for i in range(18 - len(result))]
    return result;
    
movies['genres'] = movies['genres'].apply(genresToList)

# Remove year from title and mapping the title to the fixed length (len = 18) padded list
def removeYear(s):
    return s[:-7]
movies['title'] = movies['title'].apply(removeYear)

title_words = set()
for title in movies["title"]:
    for word in title.split(" "):
        title_words.add(word)
title_words = list(title_words)
        
def titleToList(s):
    result = [title_words.index(g) + 1 for g in s.split(" ")]
    result = result + [0 for i in range(18 - len(result))]
    return result;

movies['title'] = movies['title'].apply(titleToList)

print("Movie table after preprocessing")
movies.head()

Unnamed: 0,movieID,title,genres
0,1,"[3927, 2292, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[3, 4, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,2,"[1190, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[2, 4, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,3,"[2792, 1629, 3788, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
3,4,"[961, 1569, 2139, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[5, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,"[1181, 4132, 3793, 2860, 2890, 1774, 0, 0, 0, ...","[5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [31]:
ratings = pd.read_csv('./ml-1m/ratings.dat', sep='::', header=None, names=ratings_title, engine = 'python')
ratings.drop('timestamps', axis=1, inplace=True)

print("Rating table after preprocessing")
ratings.head()

Rating table after preprocessing


Unnamed: 0,userID,movieID,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [41]:
data = pd.merge(pd.merge(ratings, users), movies)

X_df, y_df = data.drop('rating', axis=1), data['rating']
    
X = X_df.values
y = y_df.values

