### MovieLens Data Set

In [37]:
import pandas as pd

Go to the link below and download the `ml-latest-small.zip` dataset from the **Recommended for Education and Development** section. Read the `movies`, `ratings`, and `tags` datasets into different dataframes. For each one, print out how many total records there are. Add a few cells if necessary and show the head of each dataframe to get a feel for what the data looks like.

Data: https://grouplens.org/datasets/movielens/

In [38]:
# Folder holding the unzipped "ml-latest-small" download, relative to this notebook.
dir_path = "../../data/ml-latest-small"

# Read each CSV into its own DataFrame.
movies = pd.read_csv(f"{dir_path}/movies.csv")
ratings = pd.read_csv(f"{dir_path}/ratings.csv")
tags = pd.read_csv(f"{dir_path}/tags.csv")

# Report the total number of records per dataset, one dataset per line.
print(
    f"movies: {len(movies)}",
    f"ratings: {len(ratings)}",
    f"tags: {len(tags)}",
    sep="\n",
)

movies: 9742
ratings: 100836
tags: 3683


In [39]:
# Preview the first five rows of the movies table.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [40]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [41]:
# Preview the first five rows of the tags table.
tags.head(n=5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [42]:
# Which user has submitted the most ratings?

# Count the rating rows per user (non-null movieId entries), then list the five
# most active users, busiest first.
(
    ratings
    .groupby("userId")[["movieId"]]
    .count()
    .rename(columns={"movieId": "n_movies"})
    .sort_values(by="n_movies", ascending=False)
    .head(n=5)
)

Unnamed: 0_level_0,n_movies
userId,Unnamed: 1_level_1
414,2698
599,2478
474,2108
448,1864
274,1346


In [91]:
# What 10 tags are the most frequent?

# Count the rows per tag and keep the ten largest counts.
# NOTE: "n_movies" is the number of tag rows (non-null movieId entries) per tag; it is
# not a distinct-movie count if the same movie carries the same tag more than once.
(
    tags
    .groupby("tag")[["movieId"]]
    .count()
    .rename(columns={"movieId": "n_movies"})
    .sort_values(by="n_movies", ascending=False)
    .head(n=10)
)

Unnamed: 0_level_0,n_movies
tag,Unnamed: 1_level_1
In Netflix queue,131
atmospheric,36
thought-provoking,24
superhero,24
Disney,23
surreal,23
funny,23
religion,22
dark comedy,21
sci-fi,21


In [44]:
# How many different rating levels are there? What is the average movie rating?

# describe() reports count/mean/std/quantiles but not the number of distinct values,
# so the rating levels are counted explicitly with nunique().
print(f"rating levels: {ratings['rating'].nunique()}")

# Mean over all individual ratings (the same figure describe() shows in the "rating" column).
print(f"average rating: {ratings['rating'].mean():.6f}")

# Full summary statistics for reference; only the "rating" column is meaningful here,
# since userId, movieId and timestamp are identifiers / epoch values.
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [45]:
# Which 3 movies have the most 5.0 ratings in the entire set? Show only the title and the
# number of 5.0 ratings in the output.

# Number of 5.0 ratings per movie, indexed by movieId.
top_ratings = (
    ratings
    .loc[ratings["rating"].eq(5.0), ["movieId", "rating"]]
    .groupby("movieId")
    .count()
    .rename(columns={"rating": "n_rating5"})
)

# Attach titles (the inner join keeps only movies with at least one 5.0 rating),
# rank by the number of 5.0 ratings, and show the top three.
(
    movies
    .merge(top_ratings, how="inner", on="movieId")
    .sort_values(by="n_rating5", ascending=False)
    .loc[:, ["title", "n_rating5"]]
    .head(n=3)
)

Unnamed: 0,title,n_rating5
174,"Shawshank Redemption, The (1994)",153
161,Pulp Fiction (1994),123
198,Forrest Gump (1994),116


In [90]:
# Of all movies that have been tagged as "time travel", "sci-fi", "space", "twist ending"; show
# which 5 are the lowest average rated? Show only the title and average movie rating for each.
# Hint: Check out the isin() function on your cheat sheet in the Subset Observations section.

tags_set = ["time travel", "sci-fi", "space", "twist ending"]

# Distinct movieIds that carry at least one of the tags of interest
# (isin() is an exact, case-sensitive match on the tag text).
tags_interest = (
    tags
    .loc[tags["tag"].isin(tags_set), ["movieId"]]
    .drop_duplicates()
)

# Average rating per movie, indexed by movieId.
mean_ratings = (
    ratings
    .groupby("movieId")[["rating"]]
    .mean()
    .rename(columns={"rating": "mean_rating"})
)

# Keep only the tagged movies, attach their average rating (left join, so a tagged movie
# without ratings would get NaN and sort last), and show the five lowest-rated titles.
(
    movies
    .merge(tags_interest, how="inner", on="movieId")
    .drop(columns=["genres"])
    .merge(mean_ratings, how="left", on="movieId")
    .sort_values(by="mean_rating")
    .drop(columns=["movieId"])
    .head(n=5)
)

Unnamed: 0,title,mean_rating
30,SpaceCamp (1986),2.166667
46,Green Lantern (2011),2.35
19,Armageddon (1998),3.054348
32,Bill & Ted's Bogus Journey (1991),3.075
43,Terminator Salvation (2009),3.25
