In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime, timedelta
from scipy.sparse import coo_matrix, csr_matrix

# Select matplotlib's "ggplot" style sheet for any figures drawn after this point
plt.style.use("ggplot")

In [2]:
def process_file(filename):
    """Parse a user-grouped ratings file into a long-format DataFrame.

    The file is expected to contain user header lines, recognised by a ``|``
    (the text before the ``|`` is taken as the user id), each followed by
    tab-separated rating lines with four fields: item id, rating, day, time.
    Every rating line is attributed to the most recent header above it.
    Blank (empty or whitespace-only) lines are ignored.

    Parameters
    ----------
    filename : str or path-like
        Path of the text file to parse.

    Returns
    -------
    pandas.DataFrame
        One row per rating line with columns ``userId``, ``itemId``,
        ``rating``, ``day`` and ``time``. All values are the raw strings read
        from the file; no type conversion is performed here.

    Raises
    ------
    ValueError
        If a rating line appears before any user header, if a rating line
        does not have exactly four tab-separated fields, or if a header line
        contains more than one ``|``.
    """
    # Column buffers, filled in file order
    user_ids = []
    item_ids = []
    ratings = []
    days = []
    times = []

    # No header seen yet; used to detect rating lines that belong to nobody
    user_id = None

    # Iterate over the file object directly so the whole file is never held
    # in memory as one string plus a second list of lines.
    with open(filename, 'r') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.rstrip('\r\n')

            # Tolerate blank lines (e.g. a trailing empty line at end of file)
            if not line.strip():
                continue

            if '|' in line:
                # Header line: remember the user for the rating lines below it
                user_id, _ = line.split('|')
                continue

            if user_id is None:
                raise ValueError(
                    f"{filename}: line {line_number}: rating line found before any user header"
                )

            fields = line.split('\t')
            if len(fields) != 4:
                raise ValueError(
                    f"{filename}: line {line_number}: expected 4 tab-separated fields, "
                    f"got {len(fields)}"
                )

            item_id, rating, day, time = fields
            user_ids.append(user_id)
            item_ids.append(item_id)
            ratings.append(rating)
            days.append(day)
            times.append(time)

    return pd.DataFrame({
        'userId': user_ids,
        'itemId': item_ids,
        'rating': ratings,
        'day': days,
        'time': times
    })

In [3]:
# Parse the train / validation / test splits, which all live in the same directory
yahoo_dir = os.path.join("..", "..", "data", "yahoo")
train_df, validation_df, test_df = (
    process_file(os.path.join(yahoo_dir, split_file))
    for split_file in ("trainIdx1.txt", "validationIdx1.txt", "testIdx1.txt")
)

# Stack the three splits into one frame with a fresh 0..n-1 index
ratings = pd.concat([train_df, validation_df, test_df], ignore_index=True)

In [None]:
# Cast the day numbers to integers, then shift them so the earliest day becomes 0
day_numbers = ratings['day'].astype(int)
ratings['day'] = day_numbers - day_numbers.min()

In [None]:
# Convert the 'day' column (a number of days) to timedelta
ratings['day'] = pd.to_timedelta(ratings['day'], unit='D')

# Convert the 'time' column to timedelta
ratings['time'] = pd.to_timedelta(ratings['time'])

# Add the 'day' and 'time' offsets to the base date to get the 'timestamp' column
# 1999-11-11 is the launch date of Yahoo!Music
ratings['timestamp'] = pd.to_datetime('1999-11-11') + ratings['day'] + ratings['time']

# Store the timestamp as an integer epoch value (nanoseconds when the column is
# datetime64[ns]). The 64-bit dtype is spelled out because plain `int` can
# resolve to a 32-bit type on some platforms (e.g. Windows with NumPy < 2),
# and pandas refuses to cast datetimes to a 32-bit integer.
ratings["timestamp"] = ratings["timestamp"].astype("int64")
ratings["rating"] = ratings["rating"].astype(float)

# Drop the intermediate 'day' and 'time' columns; they are now folded into 'timestamp'
ratings = ratings.drop(columns=['day', 'time'])

In [None]:
# Write the preprocessed ratings to a Parquet file next to the raw data
parquet_path = os.path.join("..", "..", "data", "yahoo", "ratings.parquet")
ratings.to_parquet(parquet_path, index=False)