In [1]:
import pandas as pd

users_columns = ['user_id', 'gender', 'age', 'Occupation', 'zip']
df_users = pd.read_table('users.dat', sep='::', header=None, names=users_columns, engine='python')
df_users.head()

Unnamed: 0,user_id,gender,age,Occupation,zip
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [2]:
ratings_columns = ['user_id', 'movie_id', 'rating', 'timestamp']
df_ratings = pd.read_table('ratings.dat', sep='::', header=None, names=ratings_columns, engine='python')

movies_columns = ['movie_id', 'title', 'genres']
df_movies = pd.read_table('movies.dat', sep='::', header=None, names=movies_columns, engine='python')

In [3]:
df_merged = pd.merge(pd.merge(df_ratings, df_users), df_movies)
df_merged.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,Occupation,zip,title,genres
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama


In [4]:
df_users.age.min()

1

In [5]:
df_users[df_users.age == 1].user_id.count()

222

In [6]:
df_users[df_users.age == 1].user_id.count() / df_users.user_id.count()

0.036754966887417216

In [7]:
df_users.age.unique()

array([ 1, 56, 25, 45, 50, 35, 18])

In [8]:
df_mean_ratings = df_merged.pivot_table('rating', index='title', columns='gender', aggfunc='mean')
df_mean_ratings.head(10)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"$1,000,000 Duck (1971)",3.375,2.761905
'Night Mother (1986),3.388889,3.352941
'Til There Was You (1997),2.675676,2.733333
"'burbs, The (1989)",2.793478,2.962085
...And Justice for All (1979),3.828571,3.689024
1-900 (1994),2.0,3.0
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.5
101 Dalmatians (1996),3.24,2.911215
12 Angry Men (1957),4.184397,4.328421


In [9]:
ratings_by_title = df_merged.groupby('title').size()
ratings_by_title.head()

title
$1,000,000 Duck (1971)            37
'Night Mother (1986)              70
'Til There Was You (1997)         52
'burbs, The (1989)               303
...And Justice for All (1979)    199
dtype: int64

In [10]:
# it's the same as:
df_merged.groupby('title').user_id.count().head()

title
$1,000,000 Duck (1971)            37
'Night Mother (1986)              70
'Til There Was You (1997)         52
'burbs, The (1989)               303
...And Justice for All (1979)    199
Name: user_id, dtype: int64

In [11]:
active_titles = ratings_by_title.index[ratings_by_title >= 200]
active_titles

Index([''burbs, The (1989)', '10 Things I Hate About You (1999)',
       '101 Dalmatians (1961)', '101 Dalmatians (1996)', '12 Angry Men (1957)',
       '13th Warrior, The (1999)', '2 Days in the Valley (1996)',
       '20,000 Leagues Under the Sea (1954)', '2001: A Space Odyssey (1968)',
       '2010 (1984)',
       ...
       'Year of Living Dangerously (1982)', 'Yellow Submarine (1968)',
       'Yojimbo (1961)', 'You've Got Mail (1998)', 'Young Frankenstein (1974)',
       'Young Guns (1988)', 'Young Guns II (1990)',
       'Young Sherlock Holmes (1985)', 'Zero Effect (1998)',
       'eXistenZ (1999)'],
      dtype='object', name='title', length=1426)

In [12]:
df_mean_ratings = df_mean_ratings.loc[active_titles]
df_mean_ratings.head(10)

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",2.793478,2.962085
10 Things I Hate About You (1999),3.646552,3.311966
101 Dalmatians (1961),3.791444,3.5
101 Dalmatians (1996),3.24,2.911215
12 Angry Men (1957),4.184397,4.328421
"13th Warrior, The (1999)",3.112,3.168
2 Days in the Valley (1996),3.488889,3.244813
"20,000 Leagues Under the Sea (1954)",3.670103,3.709205
2001: A Space Odyssey (1968),3.825581,4.129738
2010 (1984),3.446809,3.413712


In [13]:
top_female_ratings = df_mean_ratings.sort_values(by='F', ascending=False)
top_female_ratings.head()

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Close Shave, A (1995)",4.644444,4.473795
"Wrong Trousers, The (1993)",4.588235,4.478261
"General, The (1927)",4.575758,4.32948
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),4.57265,4.464589
Wallace & Gromit: The Best of Aardman Animation (1996),4.563107,4.385075


In [14]:
top_male_ratings = df_mean_ratings.sort_values(by='M', ascending=False)
top_male_ratings.head()

gender,F,M
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"Godfather, The (1972)",4.3147,4.583333
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),4.481132,4.576628
"Shawshank Redemption, The (1994)",4.539075,4.560625
Raiders of the Lost Ark (1981),4.332168,4.520597
"Usual Suspects, The (1995)",4.513317,4.518248


#### Rating disagreement between F and M

In [15]:
df_mean_ratings['diff'] = df_mean_ratings['M'] - df_mean_ratings['F']
df_mean_ratings.head()

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"'burbs, The (1989)",2.793478,2.962085,0.168607
10 Things I Hate About You (1999),3.646552,3.311966,-0.334586
101 Dalmatians (1961),3.791444,3.5,-0.291444
101 Dalmatians (1996),3.24,2.911215,-0.328785
12 Angry Men (1957),4.184397,4.328421,0.144024


In [16]:
sorted_by_diff = df_mean_ratings.sort_values(by='diff')
sorted_by_diff.head()

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dirty Dancing (1987),3.790378,2.959596,-0.830782
"To Wong Foo, Thanks for Everything! Julie Newmar (1995)",3.486842,2.795276,-0.691567
Jumpin' Jack Flash (1986),3.254717,2.578358,-0.676359
Grease (1978),3.975265,3.367041,-0.608224
"Relic, The (1997)",3.309524,2.723077,-0.586447


In [17]:
sorted_by_diff[::-1].head()

gender,F,M,diff
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
"Good, The Bad and The Ugly, The (1966)",3.494949,4.2213,0.726351
"Kentucky Fried Movie, The (1977)",2.878788,3.555147,0.676359
Up in Smoke (1978),2.944444,3.585227,0.640783
Dumb & Dumber (1994),2.697987,3.336595,0.638608
"Longest Day, The (1962)",3.411765,4.031447,0.619682


In [20]:
# Standard deviation of ratings grouped by title
rating_std_by_title = df_merged.groupby('title')['rating'].std()
rating_std_by_title = rating_std_by_title.loc[active_titles]
rating_std_by_title.sort_values(ascending=False).head(10)

title
Plan 9 from Outer Space (1958)         1.455998
Texas Chainsaw Massacre, The (1974)    1.332448
Dumb & Dumber (1994)                   1.321333
Blair Witch Project, The (1999)        1.316368
Natural Born Killers (1994)            1.307198
Idle Hands (1999)                      1.298439
Transformers: The Movie, The (1986)    1.292917
Very Bad Things (1998)                 1.280074
Tank Girl (1995)                       1.277695
Hellraiser: Bloodline (1996)           1.271939
Name: rating, dtype: float64