In [1]:
import pandas as pd
from src.constant import *

# Data Source

`movielens/1m-ratings`: This dataset contains 1,000,209 anonymous ratings of approximately 3,900 movies made by 6,040 MovieLens users who joined MovieLens. It is the largest dataset that contains demographic data.
Each user has rated at least 20 movies. Ratings are in whole-star increments. In demographic data, age values are divided into ranges and the lowest age value for each range is used in the data instead of the actual values.

`movielens/1m-movies`: This dataset contains data of approximately 3,900 movies rated in the 1m dataset.

# Load data
Download the MovieLens 1M dataset from [groupLens](https://grouplens.org/datasets/movielens/1m/)

In [3]:
movie_df = pd.read_csv("data/raw/ml-1m/movies.dat", delimiter='::', header=None, engine='python', encoding='latin-1')
movie_df.columns = MOVIE_COLUMN_NAMES
print(movie_df.shape)
movie_df.head()

(3883, 3)


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## Rating
UserID::MovieID::Rating::Timestamp

- UserIDs range between 1 and 6040 
- MovieIDs range between 1 and 3952
- Ratings are made on a 5-star scale (whole-star ratings only)
- Timestamp is represented in seconds since the epoch as returned by time(2)
- Each user has at least 20 ratings

In [4]:
rating_df = pd.read_csv("data/raw/ml-1m/ratings.dat", delimiter='::', header=None, engine='python', encoding='latin-1')
rating_df.columns = RATING_COLUMN_NAMES
print(rating_df.shape)
rating_df.head()

(1000209, 4)


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## User
All demographic information is provided voluntarily by the users and is
not checked for accuracy.  Only users who have provided some demographic
information are included in this data set.
- Gender is denoted by a "M" for male and "F" for female
- Age is chosen from the following ranges:
	*  1:  "Under 18"
	* 18:  "18-24"
	* 25:  "25-34"
	* 35:  "35-44"
	* 45:  "45-49"
	* 50:  "50-55"
	* 56:  "56+"

In [6]:
user_df = pd.read_csv("data/raw/ml-1m/users.dat", delimiter='::', header=None, engine='python', encoding='latin-1')
user_df.columns = USER_COLUMN_NAMES
# map occupation to occupation name
user_df[USER_COLUMN_NAMES[3]] = user_df[USER_COLUMN_NAMES[3]].map(OCCUPATION_MAP)
print(user_df.shape)
user_df.head()

(6040, 5)


Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,K-12 student,48067
1,2,M,56,self-employed,70072
2,3,M,25,scientist,55117
3,4,M,45,executive/managerial,2460
4,5,M,25,writer,55455


In [9]:
# join three tables
rating_df = rating_df.merge(user_df, on=USER_COLUMN_NAMES[0], how='left')
full_df = rating_df.merge(movie_df, on=MOVIE_COLUMN_NAMES[0], how='left')
print(full_df.shape)
full_df.head()

(1000209, 10)


Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip_code,title,genres
0,1,1193,5,978300760,F,1,K-12 student,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,K-12 student,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,K-12 student,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,K-12 student,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,K-12 student,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy


In [10]:
full_df[full_df[USER_COLUMN_NAMES[0]] == 1]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip_code,title,genres
0,1,1193,5,978300760,F,1,K-12 student,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,1,661,3,978302109,F,1,K-12 student,48067,James and the Giant Peach (1996),Animation|Children's|Musical
2,1,914,3,978301968,F,1,K-12 student,48067,My Fair Lady (1964),Musical|Romance
3,1,3408,4,978300275,F,1,K-12 student,48067,Erin Brockovich (2000),Drama
4,1,2355,5,978824291,F,1,K-12 student,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy
5,1,1197,3,978302268,F,1,K-12 student,48067,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
6,1,1287,5,978302039,F,1,K-12 student,48067,Ben-Hur (1959),Action|Adventure|Drama
7,1,2804,5,978300719,F,1,K-12 student,48067,"Christmas Story, A (1983)",Comedy|Drama
8,1,594,4,978302268,F,1,K-12 student,48067,Snow White and the Seven Dwarfs (1937),Animation|Children's|Musical
9,1,919,4,978301368,F,1,K-12 student,48067,"Wizard of Oz, The (1939)",Adventure|Children's|Drama|Musical


In [8]:
rating_df[rating_df[RATING_COLUMN_NAMES[0]] == 1]

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368
