# MovieLens 1M DataSet

* GroupLens Research (http://www.grouplens.org.node/73) provides number of collections of movie ratings data


In [6]:
import pandas as pd

mnames = ['movie_id','title','genres']
movies = pd.read_table('movies.dat',sep='::', header=None, names=mnames)

rnames = ['user_id','movie_id','rating','timestamp']
ratings = pd.read_table('ratings.dat', sep='::', header=None, names=rnames)

unames = ['user_id','gender','age','occupation','zip']
users = pd.read_table('users.dat', sep='::', header=None, names=unames)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.
  import sys
  import sys
  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


In [4]:
ratings[:5]

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [5]:
movies[:5]

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
users[:5]

Unnamed: 0,user_id,gender,age,occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


* **merge function** so that it will be easy to apply operations on single table instead on 3 tables


In [11]:
data = pd.merge(pd.merge(ratings,users),movies)
data[:5]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [10]:
data.loc[0]

user_id                                            1
movie_id                                        1193
rating                                             5
timestamp                                  978300760
gender                                             F
age                                                1
occupation                                        10
zip                                            48067
title         One Flew Over the Cuckoo's Nest (1975)
genres                                         Drama
Name: 0, dtype: object

* aggregating the one table value with other table attributies is straightforward.
* To get mean movie ratings for each film grouped by gender, using  **pivot_table method**:

In [22]:
mean_ratings=data.pivot_table('rating',index='title',columns='gender',aggfunc='mean')
mean_ratings[:5]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024


* filter down the movies that received at least 250 ratings

In [29]:
ratings_by_title = data.groupby('title').size()
active_titles = ratings_by_title.index[ratings_by_title >= 250]
active_titles

Index([''burbs, The (1989)', '...And Justice for All (1979)',
       '10 Things I Hate About You (1999)', '101 Dalmatians (1961)',
       '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20 Dates (1998)', '20,000 Leagues Under the Sea (1954)',
       ...
       'Yellow Submarine (1968)', 'Yojimbo (1961)', 'You've Got Mail (1998)',
       'Young Frankenstein (1974)', 'Young Guns (1988)',
       'Young Guns II (1990)', 'Young Sherlock Holmes (1985)',
       'Your Friends and Neighbors (1998)', 'Zero Effect (1998)',
       'eXistenZ (1999)'],
      dtype='object', name='title', length=2019)

In [31]:
mean_ratings=mean_ratings.loc[active_titles]
mean_ratings[:5]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),,
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.5
101 Dalmatians (1996),3.24,2.911215


In [33]:
top_male_ratings = mean_ratings.sort_values(by='M', ascending=False)
top_male_ratings[:5]

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Godfather, The (1972)",4.3147,4.583333
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),4.481132,4.576628
"Shawshank Redemption, The (1994)",4.539075,4.560625
Raiders of the Lost Ark (1981),4.332168,4.520597
"Usual Suspects, The (1995)",4.513317,4.518248


# Measuring Rating Disagreement

* Find the movies that are most divisive between male and female viwers


In [56]:
mean_ratings['diff'] = mean_ratings['M']-mean_ratings['F']
sorted_by_diff = mean_ratings.sort_values(by='diff')
sorted_by_diff[:5]

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dirty Dancing (1987),3.790378,2.959596,-0.830782
Jumpin' Jack Flash (1986),3.254717,2.578358,-0.676359
Grease (1978),3.975265,3.367041,-0.608224
Little Women (1994),3.870588,3.321739,-0.548849
Steel Magnolias (1989),3.901734,3.365957,-0.535777


* Disagreement can be measured by the variance or standard variance of the ratings:

In [59]:
#std of rating grouped by title
rating_std_by_title = data.groupby('title')['rating'].std()

#Filter down to active_titles
rating_std_by_title = rating_std_by_title.loc[active_titles]

rating_std_by_title.sort_values(ascending=False)[:10]

title
Plan 9 from Outer Space (1958)                    1.455998
Beloved (1998)                                    1.372813
Godzilla 2000 (Gojira ni-sen mireniamu) (1999)    1.364700
Texas Chainsaw Massacre, The (1974)               1.332448
Dumb & Dumber (1994)                              1.321333
Crash (1996)                                      1.319636
Blair Witch Project, The (1999)                   1.316368
Natural Born Killers (1994)                       1.307198
Down to You (2000)                                1.305310
Cemetery Man (Dellamorte Dellamore) (1994)        1.300647
Name: rating, dtype: float64

In [1]:
b=[1,2,3]
b?