In [None]:
!pip3 install rs_datasets

In [115]:
from rs_datasets import MovieLens
import pandas as pd
import re

# MovieLens data exploration

The 10-million-ratings dataset provides ratings, users, movies, and tags dataframes.

The 1-million-ratings dataset does not include tags, so I will combine the two datasets, because tags will come in handy later.

In [47]:
# Load both MovieLens releases in one go: '10m' (used below for its tags)
# and '1m' (used below for movies, users and ratings).
df_large, df_small = (MovieLens(release) for release in ('10m', '1m'))

In [134]:
# Movies, users and ratings come from the 1M release; only the tags are
# taken from the 10M release.
movies, users, ratings = df_small.items, df_small.users, df_small.ratings
tags = df_large.tags

## Movies - preprocess

In [70]:
# Preview the first rows of the raw movies table (item_id, title, genres).
movies.head()

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [76]:
# Call the identifier column 'movie_id' instead of 'item_id'.
movies = movies.rename({'item_id': 'movie_id'}, axis='columns')

In [90]:
# Store the two text columns (title, genres) with the pandas 'string' dtype.
movies = movies.astype(dict.fromkeys(('title', 'genres'), 'string'))

In [81]:
# Separate the release year from the movie title.
#
# Titles look like "Toy Story (1995)": the year is the four characters inside
# the trailing parentheses, and the name is everything in front of them.
# Vectorised `.str` slicing replaces the per-row Python loop, which wrote into
# the columns one element at a time and assumed a 0..n-1 index.
# `.str.strip()` removes the space that cutting off "(YYYY)" leaves at the end
# of every name ("Toy Story " -> "Toy Story").
if 'year' not in movies.columns:  # guard: re-running must not slice already-cleaned titles
    full_titles = movies['title']
    movies['year'] = full_titles.str[-5:-1]
    movies['title'] = full_titles.str[:-6].str.strip()

In [88]:
# Normalise the genre strings: lower-case them and turn the '|' separators
# into spaces, e.g. "Comedy|Romance" -> "comedy romance".
# The `.str` accessor does this in one vectorised pass and tolerates missing
# values, unlike the element-wise lambdas.  regex=False is required because
# '|' is the regex alternation operator and must be matched literally.
movies['genres'] = movies['genres'].str.lower().str.replace('|', ' ', regex=False)

In [89]:
# Inspect the cleaned movies table: separate title and year columns,
# genres lower-cased and space-separated.
movies

Unnamed: 0,movie_id,title,genres,year
0,1,Toy Story,animation children's comedy,1995
1,2,Jumanji,adventure children's fantasy,1995
2,3,Grumpier Old Men,comedy romance,1995
3,4,Waiting to Exhale,comedy drama,1995
4,5,Father of the Bride Part II,comedy,1995
...,...,...,...,...
3878,3948,Meet the Parents,comedy,2000
3879,3949,Requiem for a Dream,drama,2000
3880,3950,Tigerland,drama,2000
3881,3951,Two Family House,drama,2000


## Users - preprocess

In [91]:
# Preview the first rows of the raw users table.
users.head()

Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [92]:
# Zip code and occupation are not used in this project, so discard them.
unused_columns = ['zip_code', 'occupation']
users = users.drop(unused_columns, axis='columns')

In [94]:
# Encode gender as a binary flag: 'M' -> 0, 'F' -> 1.
gender_codes = {'M': 0, 'F': 1}
users['gender'] = users['gender'].map(gender_codes)

In [96]:
# Check the result: only user_id, the 0/1 gender flag and age remain.
users.head()

Unnamed: 0,user_id,gender,age
0,1,1,1
1,2,0,56
2,3,0,25
3,4,0,45
4,5,0,25


## Ratings - preprocess

In [97]:
# Preview the first rows of the raw ratings table.
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [100]:
# Keep only user, movie and rating: the timestamp is discarded and
# 'item_id' is renamed to 'movie_id'.
ratings = (
    ratings
    .drop(columns=['timestamp'])
    .rename(columns={'item_id': 'movie_id'})
)

In [101]:
# Check the result: columns are now user_id, movie_id and rating.
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


## Tags - preprocess

In [102]:
# Preview the first rows of the raw tags table (one row per user/movie/tag).
tags.head()

Unnamed: 0,user_id,item_id,tag,timestamp
0,15,4973,excellent!,1215184630
1,20,1747,politics,1188263867
2,20,1747,satire,1188263867
3,20,2424,chick flick 212,1188263835
4,20,2424,hanks,1188263835


In [138]:
# Reduce the tag table to (movie_id, tag): drop the timestamp and the tagging
# user, rename the id column, store tags as pandas strings and discard rows
# with missing values.
tags = (
    tags
    .drop(columns=['timestamp', 'user_id'])
    .rename(columns={'item_id': 'movie_id'})
    .astype({'tag': 'string'})
    .dropna()
)

In [139]:
# (rows, columns) of the tag table once rows with missing values are dropped.
tags.shape

(95564, 2)

In [140]:
# Collapse the tag table to one row per movie, joining all of a movie's tags
# into a single space-separated string.
tags = tags.groupby(['movie_id'], as_index=False).agg({'tag': ' '.join})

# Normalise that text.  Digits are deleted first, then the remaining runs of
# word characters are re-joined with single spaces (which drops punctuation),
# and finally everything is lower-cased.  Deleting digits *before* tokenising
# means a purely numeric tag such as "212" leaves no stray double or trailing
# space behind.  The patterns are raw strings: '\d' and '\w' inside ordinary
# string literals are invalid escape sequences that newer Python versions
# warn about.
tags['tag'] = (
    tags['tag']
    .str.replace(r'\d+', '', regex=True)
    .str.findall(r'\w+')
    .str.join(' ')
    .str.lower()
)

In [142]:
# Right join on movie_id: every movie is kept, and movies that nobody tagged
# get a missing value in the 'tag' column.
tags = tags.merge(movies, how='right', on='movie_id')

In [143]:
# Inspect the merged table: one row per movie with its tag text, title,
# genres and year.
tags

Unnamed: 0,movie_id,tag,title,genres,year
0,1,pixar pixar pixar animation pixar animated fun...,Toy Story,animation children's comedy,1995
1,2,for children game animals joe johnston robin w...,Jumanji,adventure children's fantasy,1995
2,3,funniest movies comedinha de velhinhos engraã ...,Grumpier Old Men,comedy romance,1995
3,4,girl movie,Waiting to Exhale,comedy drama,1995
4,5,steve martin pregnancy remake steve martin fam...,Father of the Bride Part II,comedy,1995
...,...,...,...,...,...
3878,3948,ben stiller comedy hilarious owen wilson ben s...,Meet the Parents,comedy,2000
3879,3949,ass to ass heroin psychology depressing drugs ...,Requiem for a Dream,drama,2000
3880,3950,colin farrell,Tigerland,drama,2000
3881,3951,in netflix queue in netflix queue r,Two Family House,drama,2000


In [144]:
# Count missing values per column; after the right merge only 'tag' can be
# missing (movies that have no tags at all).
tags.isna().sum()

movie_id       0
tag         1078
title          0
genres         0
year           0
dtype: int64

In [145]:
# Start the combined 'tag+genre' text column as a copy of the tag text.
tags = tags.assign(**{'tag+genre': tags['tag']})

In [148]:
# Append each movie's genres to its tag text `n` times (presumably so that
# genre words carry more weight than a single user tag -- confirm against the
# downstream use of this column).
n = 5
repeated_genres = (' ' + tags['genres']).str.repeat(n)

# Movies without any tag hold a missing value after the merge.  Concatenating
# onto a missing value yields a missing value again, which would throw away
# the genres of those movies; treat "no tags" as empty text instead so that
# their text consists of the repeated genres alone.
tags['tag'] = (tags['tag+genre'].fillna('') + repeated_genres).str.strip()
tags = tags.drop(columns = ['tag+genre'])

In [152]:
# Rename the 'tag' column to 'tags'.
tags = tags.rename({'tag': 'tags'}, axis='columns')

In [153]:
# Preview the first rows of the final per-movie tags table.
tags.head()

Unnamed: 0,movie_id,tags,title,genres,year,tag+genre
0,1,pixar pixar pixar animation pixar animated fun...,Toy Story,animation children's comedy,1995,pixar pixar pixar animation pixar animated fun...
1,2,for children game animals joe johnston robin w...,Jumanji,adventure children's fantasy,1995,for children game animals joe johnston robin w...
2,3,funniest movies comedinha de velhinhos engraã ...,Grumpier Old Men,comedy romance,1995,funniest movies comedinha de velhinhos engraã ...
3,4,girl movie comedy drama comedy drama comedy dr...,Waiting to Exhale,comedy drama,1995,girl movie comedy drama comedy drama comedy dr...
4,5,steve martin pregnancy remake steve martin fam...,Father of the Bride Part II,comedy,1995,steve martin pregnancy remake steve martin fam...


## Save

In [154]:
from google.colab import files

# Write every prepared table to a CSV file in the working directory and pass
# each file to Colab's files.download, in the order movies, tags, users, ratings.
exports = {
    'movies.csv': movies,
    'tags.csv': tags,
    'users.csv': users,
    'ratings.csv': ratings,
}
for csv_name, frame in exports.items():
    frame.to_csv(csv_name)
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>