In [49]:
import pandas as pd

In [50]:
users = pd.read_csv(
    "./datasets/ml-1m/users.dat",
    sep="::",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],)



In [51]:
ratings = pd.read_csv("./datasets/ml-1m/ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"])



In [52]:
movies = pd.read_csv("./datasets/ml-1m/movies.dat",
    sep="::",
    names=["movie_id", "title", "genres"])



In [53]:
users["user_id"] = users["user_id"].apply(lambda x: f"user_{x}")
users["age_group"] = users["age_group"].apply(lambda x: f"group_{x}")
users["occupation"] = users["occupation"].apply(lambda x: f"occupation_{x}")

movies["movie_id"] = movies["movie_id"].apply(lambda x: f"movie_{x}")

ratings["movie_id"] = ratings["movie_id"].apply(lambda x: f"movie_{x}")
ratings["user_id"] = ratings["user_id"].apply(lambda x: f"user_{x}")
ratings["rating"] = ratings["rating"].apply(lambda x: float(x))

In [54]:
users

Unnamed: 0,user_id,sex,age_group,occupation,zip_code
0,user_1,F,group_1,occupation_10,48067
1,user_2,M,group_56,occupation_16,70072
2,user_3,M,group_25,occupation_15,55117
3,user_4,M,group_45,occupation_7,02460
4,user_5,M,group_25,occupation_20,55455
...,...,...,...,...,...
6035,user_6036,F,group_25,occupation_15,32603
6036,user_6037,F,group_45,occupation_1,76006
6037,user_6038,F,group_56,occupation_1,14706
6038,user_6039,F,group_45,occupation_0,01060


In [55]:
genres = [
    "Action",
    "Adventure",
    "Animation",
    "Children's",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
]

for genre in genres:
    movies[genre] = movies["genres"].apply(
        lambda values: int(genre in values.split("|"))
    )

In [56]:
genre_vectors = movies[genres].to_numpy()
genre_vectors.shape

(3883, 18)

In [57]:
genres = movies[genres]
genres.to_csv("./datasets/genres.csv", index=False, sep="|", header=False)


In [58]:
movies

Unnamed: 0,movie_id,title,genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,movie_1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,movie_2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,movie_3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,movie_4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,movie_5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,movie_3948,Meet the Parents (2000),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,movie_3949,Requiem for a Dream (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,movie_3950,Tigerland (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,movie_3951,Two Family House (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
ratings

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,user_1,movie_1193,5.0,978300760
1,user_1,movie_661,3.0,978302109
2,user_1,movie_914,3.0,978301968
3,user_1,movie_3408,4.0,978300275
4,user_1,movie_2355,5.0,978824291
...,...,...,...,...
1000204,user_6040,movie_1091,1.0,956716541
1000205,user_6040,movie_1094,5.0,956704887
1000206,user_6040,movie_562,5.0,956704746
1000207,user_6040,movie_1096,4.0,956715648


In [60]:
ratings_sort = ratings.sort_values(by=["unix_timestamp"])

In [61]:
ratings_sort

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
1000138,user_6040,movie_858,4.0,956703932
1000153,user_6040,movie_2384,4.0,956703954
999873,user_6040,movie_593,5.0,956703954
1000007,user_6040,movie_1961,4.0,956703977
1000192,user_6040,movie_2019,5.0,956703977
...,...,...,...,...
825793,user_4958,movie_2399,1.0,1046454338
825438,user_4958,movie_1407,5.0,1046454443
825724,user_4958,movie_3264,4.0,1046454548
825731,user_4958,movie_2634,3.0,1046454548


In [62]:
ratings_group = ratings_sort.groupby(["user_id"])
print(ratings_group)

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fb87f169b90>


In [63]:
ratings_data = pd.DataFrame(
    data = {
        "user_id" : list(ratings_group.groups.keys()),
        "movie_ids" : list(ratings_group.movie_id.apply(list)),
        "ratings" : list(ratings_group.rating.apply(list)),
        "timestamps" : list(ratings_group.unix_timestamp.apply(list)),
    }
)

In [64]:
ratings_data

Unnamed: 0,user_id,movie_ids,ratings,timestamps
0,user_1,"[movie_3186, movie_1721, movie_1270, movie_102...","[4.0, 4.0, 5.0, 5.0, 3.0, 5.0, 4.0, 4.0, 5.0, ...","[978300019, 978300055, 978300055, 978300055, 9..."
1,user_10,"[movie_597, movie_858, movie_743, movie_1210, ...","[4.0, 3.0, 3.0, 4.0, 4.0, 5.0, 5.0, 5.0, 3.0, ...","[978224375, 978224375, 978224375, 978224400, 9..."
2,user_100,"[movie_260, movie_1676, movie_1198, movie_541,...","[4.0, 3.0, 4.0, 3.0, 4.0, 3.0, 1.0, 1.0, 5.0, ...","[977593595, 977593595, 977593607, 977593624, 9..."
3,user_1000,"[movie_971, movie_260, movie_2990, movie_2973,...","[4.0, 5.0, 4.0, 3.0, 5.0, 5.0, 2.0, 5.0, 5.0, ...","[975040566, 975040566, 975040566, 975040629, 9..."
4,user_1001,"[movie_1198, movie_1617, movie_2885, movie_390...","[4.0, 4.0, 4.0, 2.0, 2.0, 1.0, 4.0, 5.0, 5.0, ...","[975039591, 975039702, 975039702, 975039898, 9..."
...,...,...,...,...
6035,user_995,"[movie_1894, movie_260, movie_247, movie_433, ...","[2.0, 4.0, 5.0, 3.0, 3.0, 4.0, 4.0, 4.0, 3.0, ...","[975054785, 975054785, 975054785, 975054853, 9..."
6036,user_996,"[movie_1347, movie_2146, movie_1961, movie_274...","[4.0, 3.0, 5.0, 3.0, 5.0, 5.0, 5.0, 5.0, 4.0, ...","[975052132, 975052132, 975052195, 975052284, 9..."
6037,user_997,"[movie_1196, movie_2082, movie_3247, movie_244...","[4.0, 3.0, 3.0, 3.0, 2.0, 5.0, 5.0, 5.0, 4.0, ...","[975044235, 975044425, 975044426, 975044426, 9..."
6038,user_998,"[movie_2266, movie_1264, movie_1097, movie_164...","[3.0, 4.0, 5.0, 5.0, 4.0, 3.0, 4.0, 3.0, 4.0, ...","[975043499, 975043593, 975043593, 975043593, 9..."


In [65]:
def create_sequence(values, window_size, step_size):
    sequences = [] 
    start_index = 0 
    while True:
        end_index = start_index + window_size
        seq = values[start_index:end_index]
        if len(seq) < window_size:
            seq = values[-window_size:]
            if len(seq) == window_size:
                sequences.append(seq)
            break 
        sequences.append(seq)
        start_index += step_size
    return sequences


In [66]:
sequence_length = 4 
step_size = 2

ratings_data.movie_ids = ratings_data.movie_ids.apply(
    lambda ids: create_sequence(ids, sequence_length, step_size)
)

ratings_data.ratings = ratings_data.ratings.apply(
    lambda ids: create_sequence(ids, sequence_length, step_size)
)

del ratings_data["timestamps"]

In [67]:
ratings_data

Unnamed: 0,user_id,movie_ids,ratings
0,user_1,"[[movie_3186, movie_1721, movie_1270, movie_10...","[[4.0, 4.0, 5.0, 5.0], [5.0, 5.0, 3.0, 5.0], [..."
1,user_10,"[[movie_597, movie_858, movie_743, movie_1210]...","[[4.0, 3.0, 3.0, 4.0], [3.0, 4.0, 4.0, 5.0], [..."
2,user_100,"[[movie_260, movie_1676, movie_1198, movie_541...","[[4.0, 3.0, 4.0, 3.0], [4.0, 3.0, 4.0, 3.0], [..."
3,user_1000,"[[movie_971, movie_260, movie_2990, movie_2973...","[[4.0, 5.0, 4.0, 3.0], [4.0, 3.0, 5.0, 5.0], [..."
4,user_1001,"[[movie_1198, movie_1617, movie_2885, movie_39...","[[4.0, 4.0, 4.0, 2.0], [4.0, 2.0, 2.0, 1.0], [..."
...,...,...,...
6035,user_995,"[[movie_1894, movie_260, movie_247, movie_433]...","[[2.0, 4.0, 5.0, 3.0], [5.0, 3.0, 3.0, 4.0], [..."
6036,user_996,"[[movie_1347, movie_2146, movie_1961, movie_27...","[[4.0, 3.0, 5.0, 3.0], [5.0, 3.0, 5.0, 5.0], [..."
6037,user_997,"[[movie_1196, movie_2082, movie_3247, movie_24...","[[4.0, 3.0, 3.0, 3.0], [3.0, 3.0, 2.0, 5.0], [..."
6038,user_998,"[[movie_2266, movie_1264, movie_1097, movie_16...","[[3.0, 4.0, 5.0, 5.0], [5.0, 5.0, 4.0, 3.0], [..."


In [68]:
ratings_data_movies = ratings_data[["user_id", "movie_ids"]].explode("movie_ids", ignore_index=True)

In [69]:
ratings_data_movies

Unnamed: 0,user_id,movie_ids
0,user_1,"[movie_3186, movie_1721, movie_1270, movie_1022]"
1,user_1,"[movie_1270, movie_1022, movie_2340, movie_1836]"
2,user_1,"[movie_2340, movie_1836, movie_3408, movie_1207]"
3,user_1,"[movie_3408, movie_1207, movie_2804, movie_260]"
4,user_1,"[movie_2804, movie_260, movie_720, movie_1193]"
...,...,...
498618,user_999,"[movie_2676, movie_2540, movie_1363, movie_765]"
498619,user_999,"[movie_1363, movie_765, movie_3565, movie_1410]"
498620,user_999,"[movie_3565, movie_1410, movie_2269, movie_2504]"
498621,user_999,"[movie_2269, movie_2504, movie_455, movie_193]"


In [70]:
ratings_data_rating = ratings_data[["ratings"]].explode("ratings", ignore_index=True)

In [71]:
ratings_data_rating

Unnamed: 0,ratings
0,"[4.0, 4.0, 5.0, 5.0]"
1,"[5.0, 5.0, 3.0, 5.0]"
2,"[3.0, 5.0, 4.0, 4.0]"
3,"[4.0, 4.0, 5.0, 4.0]"
4,"[5.0, 4.0, 3.0, 5.0]"
...,...
498618,"[3.0, 2.0, 3.0, 3.0]"
498619,"[3.0, 3.0, 4.0, 2.0]"
498620,"[4.0, 2.0, 3.0, 3.0]"
498621,"[3.0, 3.0, 2.0, 2.0]"


In [72]:
ratings_data_transformed = pd.concat([ratings_data_movies, ratings_data_rating], axis=1)

In [73]:
ratings_data_transformed

Unnamed: 0,user_id,movie_ids,ratings
0,user_1,"[movie_3186, movie_1721, movie_1270, movie_1022]","[4.0, 4.0, 5.0, 5.0]"
1,user_1,"[movie_1270, movie_1022, movie_2340, movie_1836]","[5.0, 5.0, 3.0, 5.0]"
2,user_1,"[movie_2340, movie_1836, movie_3408, movie_1207]","[3.0, 5.0, 4.0, 4.0]"
3,user_1,"[movie_3408, movie_1207, movie_2804, movie_260]","[4.0, 4.0, 5.0, 4.0]"
4,user_1,"[movie_2804, movie_260, movie_720, movie_1193]","[5.0, 4.0, 3.0, 5.0]"
...,...,...,...
498618,user_999,"[movie_2676, movie_2540, movie_1363, movie_765]","[3.0, 2.0, 3.0, 3.0]"
498619,user_999,"[movie_1363, movie_765, movie_3565, movie_1410]","[3.0, 3.0, 4.0, 2.0]"
498620,user_999,"[movie_3565, movie_1410, movie_2269, movie_2504]","[4.0, 2.0, 3.0, 3.0]"
498621,user_999,"[movie_2269, movie_2504, movie_455, movie_193]","[3.0, 3.0, 2.0, 2.0]"


In [74]:
ratings_data_transformed = ratings_data_transformed.join( users.set_index("user_id"), on="user_id")

In [75]:
ratings_data_transformed

Unnamed: 0,user_id,movie_ids,ratings,sex,age_group,occupation,zip_code
0,user_1,"[movie_3186, movie_1721, movie_1270, movie_1022]","[4.0, 4.0, 5.0, 5.0]",F,group_1,occupation_10,48067
1,user_1,"[movie_1270, movie_1022, movie_2340, movie_1836]","[5.0, 5.0, 3.0, 5.0]",F,group_1,occupation_10,48067
2,user_1,"[movie_2340, movie_1836, movie_3408, movie_1207]","[3.0, 5.0, 4.0, 4.0]",F,group_1,occupation_10,48067
3,user_1,"[movie_3408, movie_1207, movie_2804, movie_260]","[4.0, 4.0, 5.0, 4.0]",F,group_1,occupation_10,48067
4,user_1,"[movie_2804, movie_260, movie_720, movie_1193]","[5.0, 4.0, 3.0, 5.0]",F,group_1,occupation_10,48067
...,...,...,...,...,...,...,...
498618,user_999,"[movie_2676, movie_2540, movie_1363, movie_765]","[3.0, 2.0, 3.0, 3.0]",M,group_25,occupation_15,62558
498619,user_999,"[movie_1363, movie_765, movie_3565, movie_1410]","[3.0, 3.0, 4.0, 2.0]",M,group_25,occupation_15,62558
498620,user_999,"[movie_3565, movie_1410, movie_2269, movie_2504]","[4.0, 2.0, 3.0, 3.0]",M,group_25,occupation_15,62558
498621,user_999,"[movie_2269, movie_2504, movie_455, movie_193]","[3.0, 3.0, 2.0, 2.0]",M,group_25,occupation_15,62558


In [76]:
ratings_data_transformed.movie_ids = ratings_data_transformed.movie_ids.apply( lambda x: ",".join(x))

In [77]:
ratings_data_transformed.ratings = ratings_data_transformed.ratings.apply( lambda x: ",".join([str(v) for v in x]))

In [78]:
del ratings_data_transformed['zip_code']

In [79]:
ratings_data_transformed.rename(columns={"movie_ids" : "sequence_movie_ids", "ratings": "sequence_ratings"}, inplace=True,)

In [80]:
ratings_data_transformed

Unnamed: 0,user_id,sequence_movie_ids,sequence_ratings,sex,age_group,occupation
0,user_1,"movie_3186,movie_1721,movie_1270,movie_1022","4.0,4.0,5.0,5.0",F,group_1,occupation_10
1,user_1,"movie_1270,movie_1022,movie_2340,movie_1836","5.0,5.0,3.0,5.0",F,group_1,occupation_10
2,user_1,"movie_2340,movie_1836,movie_3408,movie_1207","3.0,5.0,4.0,4.0",F,group_1,occupation_10
3,user_1,"movie_3408,movie_1207,movie_2804,movie_260","4.0,4.0,5.0,4.0",F,group_1,occupation_10
4,user_1,"movie_2804,movie_260,movie_720,movie_1193","5.0,4.0,3.0,5.0",F,group_1,occupation_10
...,...,...,...,...,...,...
498618,user_999,"movie_2676,movie_2540,movie_1363,movie_765","3.0,2.0,3.0,3.0",M,group_25,occupation_15
498619,user_999,"movie_1363,movie_765,movie_3565,movie_1410","3.0,3.0,4.0,2.0",M,group_25,occupation_15
498620,user_999,"movie_3565,movie_1410,movie_2269,movie_2504","4.0,2.0,3.0,3.0",M,group_25,occupation_15
498621,user_999,"movie_2269,movie_2504,movie_455,movie_193","3.0,3.0,2.0,2.0",M,group_25,occupation_15


In [81]:
import numpy as np

random_selection = np.random.rand(len(ratings_data_transformed.index)) <= 0.85
train_data = ratings_data_transformed[random_selection]
test_data = ratings_data_transformed[~random_selection]

train_data.to_csv("./datasets/train_data.csv", index=False, sep="|", header=False)
test_data.to_csv("./datasets/test_data.csv", index=False, sep="|", header=False)

In [82]:
CSV_HEADER = list(ratings_data_transformed.columns)
CATEGORICAL_FEATURES_WITH_VOCABULARY = {
    "user_id" : list(users.user_id.unique()),
    "movie_id" : list(movies.movie_id.unique()),
    "sex" : list(users.sex.unique()),
    "age_group" : list(users.age_group.unique()),
    "occupation" : list(users.occupation.unique())
}
USER_FEATURES = ["sex", "age_group", "occupation"]
MOVIE_FEATURES = ["genres"]

In [83]:
metadata = {
    "CSV_HEADER" : CSV_HEADER,
    "CATEGORICAL_FEATURES_WITH_VOCABULARY" : CATEGORICAL_FEATURES_WITH_VOCABULARY,
    "USER_FEATURES" : USER_FEATURES,
    "MOVIE_FEATURES" : MOVIE_FEATURES
}

In [84]:
import json 
with open('./configs/metadata.json', 'w') as fp:
    json.dump(metadata, fp)