In [7]:
import os
import typing as T
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np

DATA_PATH = Path().cwd().parent / "data"
assert DATA_PATH.exists(), "Data directory doesn't exist or isn't found from this file."

In [2]:
ratings = pd.read_csv(DATA_PATH / "ml-latest-small" / "ratings.csv")
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [26]:
ratings.isnull().values.any(), tags.isnull().values.any()

(False, False)

In [4]:
tags = pd.read_csv(DATA_PATH / "ml-latest-small" / "tags.csv")
tags["movieId"] = tags["movieId"].astype(int)
tags["timestamp"] = pd.to_datetime(tags["timestamp"], unit="s")
tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,2015-10-24 19:29:54
1,2,60756,Highly quotable,2015-10-24 19:29:56
2,2,60756,will ferrell,2015-10-24 19:29:52
3,2,89774,Boxing story,2015-10-24 19:33:27
4,2,89774,MMA,2015-10-24 19:33:20


In [23]:
ratings[ratings[["userId", "movieId"]].duplicated()]

Unnamed: 0,userId,movieId,rating,timestamp


In [25]:
tags[tags[["userId", "movieId"]].duplicated()]

Unnamed: 0,userId,movieId,tag,timestamp


In [18]:
tags[tags.index.isin(tags[["userId", "movieId"]].duplicated())]

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,2015-10-24 19:29:54
1,2,60756,Highly quotable,2015-10-24 19:29:56


In [12]:
ratings.userId.nunique()

610

In [6]:
tags.isnull().values.any()

False

In [9]:
np.where(ratings.applymap(lambda x: x == ""))

(array([], dtype=int64), array([], dtype=int64))

In [50]:
tags["timestamp"].max(), tags["timestamp"].min(), tags["timestamp"].mean()

(Timestamp('2018-09-16 11:50:03'),
 Timestamp('2006-01-13 19:09:12'),
 Timestamp('2011-10-31 03:32:46.823784960'))

In [44]:
tags.describe()

Unnamed: 0,userId,movieId
count,3683.0,3683.0
mean,431.149335,27252.013576
std,158.472553,43490.558803
min,2.0,1.0
25%,424.0,1262.5
50%,474.0,4454.0
75%,477.0,39263.0
max,610.0,193565.0


In [36]:
movies = pd.read_csv(DATA_PATH / "ml-latest-small" / "movies.csv")
movies.sample(5)

Unnamed: 0,movieId,title,genres
6739,59220,Outsourced (2006),Comedy|Romance
8203,103366,Redemption (Hummingbird) (2013),Action|Crime|Thriller
6487,53123,Once (2006),Drama|Musical|Romance
3592,4926,Everybody's Famous! (Iedereen beroemd!) (2000),Comedy|Drama|Musical
6009,37739,"Greatest Game Ever Played, The (2005)",Drama


In [39]:
def get_all_genres(genres: list[str]) -> list[str]:
    """
    Given a list of strings containing genre's, 
    """
    all_genres = set()
    for entry in genres:
        if isinstance(entry, str):
            all_genres.update(entry.split("|"))
    
    return all_genres

all_genres = get_all_genres(movies["genres"].tolist())
print(all_genres)

In [34]:
df = df_ratings.merge(df_tags, how="left", on=["userId", "movieId"])

In [35]:
df.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp_x,tag,timestamp_y
63308,414,1476,4.0,961436216,,
28208,188,1091,3.0,962559903,,
26361,181,47,3.0,845469500,,
40113,274,2791,4.0,1171491716,,
61688,401,2291,3.5,1514347443,,


In [33]:
df.sample(5)

Unnamed: 0,userId_x,movieId,rating,timestamp_x,userId_y,tag,timestamp_y
178336,387,1732,3.5,1117415219,599.0,sarcasm,1498456000.0
32207,68,924,1.5,1158532482,599.0,music,1498457000.0
86077,198,1089,5.0,1034135243,477.0,religion,1242495000.0
67686,152,750,5.0,1450572506,567.0,dark comedy,1525288000.0
2492,6,986,4.0,845556475,474.0,Animal movie,1137182000.0


In [23]:
df_tags.head(5)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [None]:
df = 