In [6]:
import os
from datetime import datetime
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
pd.options.display.max_columns = 70
sns.set_style("whitegrid")
% matplotlib inline

In [7]:
% cat ./ml-100k/README

SUMMARY & USAGE LICENSE

MovieLens data sets were collected by the GroupLens Research Project
at the University of Minnesota.
 
This data set consists of:
	* 100,000 ratings (1-5) from 943 users on 1682 movies. 
	* Each user has rated at least 20 movies. 
        * Simple demographic info for the users (age, gender, occupation, zip)

The data was collected through the MovieLens web site
(movielens.umn.edu) during the seven-month period from September 19th, 
1997 through April 22nd, 1998. This data has been cleaned up - users
who had less than 20 ratings or did not have complete demographic
information were removed from this data set. Detailed descriptions of
the data file can be found at the end of this file.

Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set.  The data set may be used for any resear

In [9]:
# Load the ratings file: tab-separated values, read without a header row.
u_data_df = pd.read_csv(
    "ml-100k/u.data",
    sep="\t",
    header=None,
)
# Replace the positional labels with descriptive names; pandas raises if
# the number of names differs from the number of columns read.
u_data_df.columns = ["user_id", "item_id", "rating", "timestamp"]
# Preview only the first row.
u_data_df.head(n=1)

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949


In [20]:
# Load the dataset summary. Splitting on tabs leaves each line of the file
# in a single column labelled 0 (see the rendered output).
u_info_df = pd.read_csv(
    "ml-100k/u.info",
    sep="\t",
    header=None,
)
u_info_df

Unnamed: 0,0
0,943 users
1,1682 items
2,100000 ratings


In [36]:
# Column names for u.item: five descriptive fields followed by the 19 genre
# indicator flags, listed in the same order as ml-100k/u.genre.
update_columns_set = [
    "movie_id","movie_title","release_date","video_release_date",
    "IMDb_URL","unknown","Action","Adventure","Animation",
    "Children's","Comedy","Crime","Documentary","Drama","Fantasy",
    "Film-Noir","Horror","Musical","Mystery","Romance","Sci-Fi",
    "Thriller","War","Western"
]
# u.item is pipe-delimited, is read without a header row, and is decoded as
# ISO-8859-1.
u_item_df = pd.read_csv("ml-100k/u.item", sep="|", header=None, encoding="ISO-8859-1")
# Assigning the names doubles as a sanity check: pandas raises ValueError
# when the list length does not match the number of columns read.
u_item_df.columns = update_columns_set
# Show a bounded preview instead of dumping the whole frame into the output.
u_item_df.head(10)

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,01-Jan-1995,,http://us.imdb.com/Title?Yao+a+yao+yao+dao+wai...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
6,7,Twelve Monkeys (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Twelve%20Monk...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
7,8,Babe (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Babe%20(1995),0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0
8,9,Dead Man Walking (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Dead%20Man%20...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
9,10,Richard III (1995),22-Jan-1996,,http://us.imdb.com/M/title-exact?Richard%20III...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0


In [41]:
# Keep only the rows whose video_release_date is populated (not NaN).
u_item_df.dropna(subset=["video_release_date"])

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western


In [43]:
% cat ml-100k/u.genre

unknown|0
Action|1
Adventure|2
Animation|3
Children's|4
Comedy|5
Crime|6
Documentary|7
Drama|8
Fantasy|9
Film-Noir|10
Horror|11
Musical|12
Mystery|13
Romance|14
Sci-Fi|15
Thriller|16
War|17
Western|18



In [11]:
# Interpret the first row's `timestamp` as seconds since the Unix epoch.
# NOTE: datetime.fromtimestamp() converts to the machine's local time zone,
# so the displayed value depends on where the notebook is executed.
first_rating_epoch = u_data_df.loc[0, "timestamp"]
datetime.fromtimestamp(first_rating_epoch)

datetime.datetime(1997, 12, 5, 0, 55, 49)

In [47]:
# Descriptive names for the five pipe-delimited fields of u.user.
update_columns_set = [
    "user_id",
    "age",
    "gender",
    "occupation",
    "zip_code",
]
# Read the user table without a header row, then label its columns; the
# assignment raises if the file does not contain exactly five columns.
u_user_df = pd.read_csv("ml-100k/u.user", sep="|", header=None)
u_user_df.columns = update_columns_set
# Preview only the first row.
u_user_df.head(n=1)

Unnamed: 0,user_id,age,gender,occupation,zip_code
0,1,24,M,technician,85711


In [50]:
# Load u1.base (presumably the training part of the first predefined split;
# confirm against the README). Tab-separated, read without a header row.
u1_base_df = pd.read_csv(
    "ml-100k/u1.base",
    sep="\t",
    header=None,
)
# Same four-column layout as u.data.
u1_base_df.columns = ["user_id", "item_id", "rating", "timestamp"]
u1_base_df.head(n=1)

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,5,874965758


In [51]:
# Load u1.test (presumably the held-out part of the first predefined split;
# confirm against the README). Tab-separated, read without a header row.
u1_test_df = pd.read_csv(
    "ml-100k/u1.test",
    sep="\t",
    header=None,
)
# Same four-column layout as u.data.
u1_test_df.columns = ["user_id", "item_id", "rating", "timestamp"]
u1_test_df.head(n=1)

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,6,5,887431973
