In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
# Load the movie table and give every movie a dense integer id (m_id).
#
# Only enter the data directory when it is visible from the current working
# directory.  If this cell is re-run after it has already changed directory,
# an unconditional chdir would raise FileNotFoundError; with the guard the
# cell simply reads the file from where it already is.
if os.path.isdir('MovieLens-1m'):
    os.chdir('MovieLens-1m')
movie = pd.read_csv("movies1m.csv")
# factorize() labels the distinct values of "id" 0, 1, 2, ... in order of
# first appearance; [0] is the array of integer codes, one per row.
movie["m_id"] = movie["id"].factorize()[0]

In [None]:
# Load the ratings and attach the dense movie id (m_id) from `movie`.
rating = pd.read_csv("ratings1m.csv")
# validate="many_to_one" makes the merge raise if an id occurs more than once
# in `movie`; without it, duplicated ids would silently duplicate rating rows.
rating = pd.merge(
    rating,
    movie,
    left_on='movie',
    right_on='id',
    how='left',
    validate="many_to_one",
)
# A left merge leaves m_id as NaN for ratings whose movie is not in `movie`,
# which also turns the whole column into floats.  Fail here with a clear
# message instead of letting a float m_id be used as a list index later on.
unmatched = rating["m_id"].isna()
if unmatched.any():
    examples = rating.loc[unmatched, "movie"].unique()[:5].tolist()
    raise ValueError(
        f"{int(unmatched.sum())} ratings reference movies missing from the "
        f"movie table (e.g. movie ids {examples})"
    )
# Dense integer id per user, numbered in order of first appearance.
rating["u_id"] = rating["user"].factorize()[0]
rating['score'] = pd.Categorical(rating['score'], ordered=True)
# "timestamp" is interpreted as seconds since the Unix epoch, in UTC.
rating['time'] = pd.to_datetime(rating['timestamp'], unit='s', origin='1970-01-01', utc=True)
# Keep only the columns used by the rest of the notebook.
rating = rating[['u_id', 'm_id', 'score', 'time']]

In [None]:
from collections import deque
# Put the ratings in chronological order.  The default sort algorithm is not
# stable, so ratings that share a timestamp could end up in an arbitrary
# relative order; mergesort is stable and keeps such rows in their existing
# order, which also makes re-running this cell a no-op.
# The name is rebound instead of sorting in place.
rating = rating.sort_values("time", kind="mergesort")

In [None]:
# For every rating, record the state *before* that rating is applied:
#   - the user's current streak of positive / negative reviews
#   - how many of the movie's most recent reviews were positive / negative

# Sliding window holding the 30 most recent reviews of each movie
# (True = positive), indexed by m_id.
MOVIE_WINDOW_LEN = 30
movie_y = [deque(maxlen=MOVIE_WINDOW_LEN) for _ in range(len(movie))]

users = list(rating["u_id"].unique())

# Pull the needed columns out as plain Python lists once, instead of letting
# iterrows() build a Series for every single rating.
u_ids = rating["u_id"].tolist()
m_ids = rating["m_id"].tolist()
# A review is positive when its score is at least 4.  The score column is
# converted to float first so the comparison is a plain numeric one rather
# than a comparison against the categories of a categorical column.
is_positive = (rating["score"].astype(float) >= 4.0).tolist()

# Length of the current positive / negative review streak for each user.
# Plain dicts are used inside the loop because scalar reads and writes on a
# pandas Series are far slower.  dict.fromkeys keeps first-appearance order,
# the same order as `users`.
pos_streak = dict.fromkeys(u_ids, 0)
neg_streak = dict.fromkeys(u_ids, 0)

# One (user_pos_len, user_neg_len, movie_pos, movie_neg) tuple per rating,
# in the current row order of `rating`.
rating_var = []

for u, m, y in zip(u_ids, m_ids, is_positive):
    ym = movie_y[m]
    pm = sum(ym)          # positives currently in the movie's window
    nm = len(ym) - pm     # negatives currently in the movie's window

    rating_var.append((pos_streak[u], neg_streak[u], pm, nm))

    # Apply this rating: extend one streak and reset the other.
    if y:
        pos_streak[u] += 1
        neg_streak[u] = 0
    else:
        pos_streak[u] = 0
        neg_streak[u] += 1
    # The deque drops its oldest entry by itself once it is full.
    ym.append(y)

# Expose the final streak lengths as Series indexed by user id.
user_pos_len = pd.Series(list(pos_streak.values()), index=users, dtype="int64")
user_neg_len = pd.Series(list(neg_streak.values()), index=users, dtype="int64")

In [None]:
# Turn the collected tuples into a table that carries the same index as
# `rating`, then order the rows by that index.
feature_columns = ["user_pos_len", "user_neg_len", "movie_pos", "movie_neg"]
rating_features = pd.DataFrame(rating_var, columns=feature_columns, index=rating.index)
rating_features = rating_features.sort_index()
# Move up to the parent directory so the CSV is written there.
os.chdir('..')
rating_features.to_csv("rating-feature.csv")