In [1]:
import os
import re
import pandas as pd
import numpy as np
from random import shuffle

In [2]:
# --- Input: directory holding the raw CSV files and the file names read from it ---
DATA_DIR = 'ml-20m'
INPUT_MOVIES = 'movies.csv'
INPUT_LINKS = 'links.csv'
INPUT_RATINGS = 'ratings.csv'

# --- Output: files produced by this notebook ---
OUTPUT_MOVIES = 'movie_data.csv'
OUTPUT_DATA = 'model_data.npy'

In [3]:
# Load the movie catalogue, keeping only the id and title columns.
movies = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_DIR, INPUT_MOVIES),
    usecols=['movieId', 'title'],
    sep=',',
)

In [4]:
# Preview the first five catalogue rows.
movies.head(n=5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [5]:
# Load the movieId -> IMDb id table. The converter turns each raw imdbId
# field into an IMDb-style identifier by prefixing it with 'tt'.
links = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_DIR, INPUT_LINKS),
    usecols=['movieId', 'imdbId'],
    converters={'imdbId': lambda raw_id: 'tt' + str(raw_id)},
    sep=',',
)

In [6]:
# Preview the first five link rows (imdbId now carries the 'tt' prefix).
links.head(n=5)

Unnamed: 0,movieId,imdbId
0,1,tt0114709
1,2,tt0113497
2,3,tt0113228
3,4,tt0114885
4,5,tt0113041


In [7]:
# Load the (user, movie) pairs; the other columns of the ratings file
# are not read.
ratings = pd.read_csv(
    filepath_or_buffer=os.path.join(DATA_DIR, INPUT_RATINGS),
    usecols=['userId', 'movieId'],
    sep=',',
)

In [8]:
# Preview the first five (userId, movieId) pairs.
ratings.head(n=5)

Unnamed: 0,userId,movieId
0,1,2
1,1,29
2,1,32
3,1,47
4,1,50


In [9]:
# Collapse to one row per user: the 'movieId' column becomes the list of
# all movie ids that user has in the ratings file.
ratings = (
    ratings
    .groupby('userId')['movieId']
    .apply(list)
    .reset_index()
)

In [10]:
# Preview the per-user movie lists.
ratings.head(n=5)

Unnamed: 0,userId,movieId
0,1,"[2, 29, 32, 47, 50, 112, 151, 223, 253, 260, 2..."
1,2,"[3, 62, 70, 110, 242, 260, 266, 469, 480, 541,..."
2,3,"[1, 24, 32, 50, 160, 173, 175, 196, 223, 260, ..."
3,4,"[6, 10, 19, 32, 165, 329, 350, 356, 367, 368, ..."
4,5,"[2, 11, 17, 60, 62, 104, 110, 140, 141, 150, 2..."


In [11]:
# Keep a random sample of MOVIES_PER_USER movies for every user.
#
# Changes from the previous loop:
#   * the sampling is driven by a seeded generator, so Restart & Run All
#     reproduces the same model_data.npy instead of a new one each run;
#   * the column is rebuilt with fresh lists rather than shuffling and
#     truncating the existing lists in place through chained indexing;
#   * the sample size is a named constant instead of a bare 4.
MOVIES_PER_USER = 4
SAMPLE_SEED = 42
_sample_rng = np.random.default_rng(SAMPLE_SEED)


def _sample_movies(movie_ids):
    """Return up to MOVIES_PER_USER ids from `movie_ids` in random order.

    A list shorter than MOVIES_PER_USER is returned fully (shuffled), which
    matches the old shuffle-then-truncate behaviour.
    """
    # permutation() shuffles a copy, so the caller's list is left untouched;
    # tolist() converts back to plain Python ints.
    return _sample_rng.permutation(movie_ids)[:MOVIES_PER_USER].tolist()


ratings['movieId'] = ratings['movieId'].map(_sample_movies)

In [12]:
# Preview the sampled per-user movie lists.
ratings.head(n=5)

Unnamed: 0,userId,movieId
0,1,"[1291, 4011, 223, 1321]"
1,2,"[1259, 2951, 1973, 1971]"
2,3,"[329, 1810, 2808, 1831]"
3,4,"[10, 596, 377, 520]"
4,5,"[588, 62, 1079, 589]"


In [13]:
# Series holding one list of sampled movie ids per user.
movie_list = ratings.loc[:, 'movieId']

In [14]:
# Preview the first five per-user lists.
movie_list.head(n=5)

0     [1291, 4011, 223, 1321]
1    [1259, 2951, 1973, 1971]
2     [329, 1810, 2808, 1831]
3         [10, 596, 377, 520]
4        [588, 62, 1079, 589]
Name: movieId, dtype: object

In [15]:
# Original movie ids, in catalogue (file) order.
movieIds = movies.loc[:, 'movieId']

In [16]:
# Map each original movie id to a dense 1-based id that follows catalogue
# order. The upper bound is derived from the data itself: the previous
# hard-coded range(1, 27279) made zip() silently drop movies whenever the
# catalogue held a different number of rows.
new_movieId = {
    original_id: dense_id
    for dense_id, original_id in enumerate(movieIds, start=1)
}

In [17]:
# Translate every sampled movie id to its dense id and stack the per-user
# rows into one array. A movie id missing from new_movieId raises KeyError.
data_finished = np.array([
    np.array(list(map(new_movieId.__getitem__, sampled_ids)))
    for sampled_ids in movie_list
])

In [18]:
# Preview the first five remapped rows.
data_finished[:5]

array([[1264, 3918,  221, 1293],
       [1232, 2866, 1890, 1888],
       [ 326, 1734, 2723, 1751],
       [  10,  591,  374,  517],
       [ 583,   62, 1058,  584]])

In [19]:
# Persist the remapped id matrix in NumPy's .npy format.
np.save(file=OUTPUT_DATA, arr=data_finished)

In [20]:
# Re-check the catalogue before joining it with the links table.
movies.head(n=5)

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [21]:
# Re-check the links table before joining it with the catalogue.
links.head(n=5)

Unnamed: 0,movieId,imdbId
0,1,tt0114709
1,2,tt0113497
2,3,tt0113228
3,4,tt0114885
4,5,tt0113041


In [22]:
# Attach the IMDb id to every catalogue row.
#
# how='left' keeps every movie, in catalogue order, even if it has no row in
# the links table (imdbId is then NaN). The default inner join would drop
# such movies, and the positional ids assigned to merged_movies later would
# no longer line up with the catalogue order.
# validate='one_to_one' raises MergeError if movieId is duplicated on either
# side, instead of silently multiplying rows.
merged_movies = movies.merge(
    links,
    on='movieId',
    how='left',
    validate='one_to_one',
)

In [23]:
# Preview the joined catalogue (movieId, title, imdbId).
merged_movies.head(n=5)

Unnamed: 0,movieId,title,imdbId
0,1,Toy Story (1995),tt0114709
1,2,Jumanji (1995),tt0113497
2,3,Grumpier Old Men (1995),tt0113228
3,4,Waiting to Exhale (1995),tt0114885
4,5,Father of the Bride Part II (1995),tt0113041


In [24]:
# Remove the original movieId column from the joined catalogue.
merged_movies = merged_movies.drop(columns='movieId')

In [25]:
# Preview the catalogue without the original ids.
merged_movies.head(n=5)

Unnamed: 0,title,imdbId
0,Toy Story (1995),tt0114709
1,Jumanji (1995),tt0113497
2,Grumpier Old Men (1995),tt0113228
3,Waiting to Exhale (1995),tt0114885
4,Father of the Bride Part II (1995),tt0113041


In [26]:
# Add a dense 1-based movieId as the first column, numbered by row position.
# The upper bound comes from the frame itself: the previous hard-coded
# range(1, 27279) raised a length-mismatch error for any other row count.
merged_movies.insert(0, 'movieId', range(1, len(merged_movies) + 1))

In [27]:
# Check the last five rows to confirm the final dense id.
merged_movies.tail(n=5)

Unnamed: 0,movieId,title,imdbId
27273,27274,Kein Bund für's Leben (2007),tt0466713
27274,27275,"Feuer, Eis & Dosenbier (2002)",tt0277703
27275,27276,The Pirates (2014),tt3485166
27276,27277,Rentun Ruusu (2001),tt0249110
27277,27278,Innocence (2014),tt1724965


In [28]:
# Patterns for a release-year suffix such as " (1995)".
# Raw strings are used so that \s, \( and \d reach the regex engine as
# written; in a plain string literal they are invalid escape sequences and
# trigger a DeprecationWarning/SyntaxWarning. The compiled patterns are
# unchanged.
re1 = re.compile(r'\s\(\d\d\d\d\)')      # whitespace + "(YYYY)"
re2 = re.compile(r'\s\(\d\d\d\d\)\s')    # same, followed by one whitespace char

In [29]:
def _strip_trailing_year(title):
    """Return `title` without a trailing release-year suffix.

    Two endings are recognised: " (YYYY)" (the last 7 characters, checked
    with re1) and " (YYYY) " with one extra trailing whitespace character
    (the last 8 characters, checked with re2). Any other title is returned
    unchanged.
    """
    if re1.match(title[-7:]) is not None:
        return title[:-7]
    if re2.match(title[-8:]) is not None:
        return title[:-8]
    return title


# Titles in catalogue order with the year suffix removed. This is the same
# result as the former nested conditional expression, written with guard
# clauses and `is not None` instead of `!= None`.
title_no_year = [_strip_trailing_year(title) for title in merged_movies['title']]

In [30]:
# Swap the original title column for the year-less titles, placing the new
# column at position 1, then preview the result.
merged_movies = merged_movies.drop(columns=['title'])
merged_movies.insert(loc=1, column='title', value=title_no_year)
merged_movies.head(n=5)

Unnamed: 0,movieId,title,imdbId
0,1,Toy Story,tt0114709
1,2,Jumanji,tt0113497
2,3,Grumpier Old Men,tt0113228
3,4,Waiting to Exhale,tt0114885
4,5,Father of the Bride Part II,tt0113041


In [31]:
# Write the final catalogue to CSV without the DataFrame index column.
merged_movies.to_csv(path_or_buf=OUTPUT_MOVIES, index=False)