# Imports

In [1]:
import pandas as pd
import matplotlib.colors
import os
from glob import glob
from datetime import datetime

# Files

In [2]:
src_path = os.path.join("/", "Volumes", "owen", "datas", "rplace")
src_glob = os.path.join(src_path, "2022_place_canvas_history-*.csv")
dest_path = os.path.join(src_path)

In [3]:
files = glob(src_glob)
files = sorted(files, key=lambda f: f[-6:-4])
files

['/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000000.csv',
 '/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000001.csv',
 '/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000002.csv',
 '/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000003.csv',
 '/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000004.csv',
 '/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000005.csv',
 '/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000006.csv',
 '/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000007.csv',
 '/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000008.csv',
 '/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000009.csv',
 '/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000010.csv',
 '/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000011.csv',
 '/Volumes/owen/datas/rplace/2022_place_canvas_history-000000000012.csv',
 '/Volumes/owen/datas/rplace/2022_plac

# Data processing

In [4]:
def open_dataset(file: str):
    def date_parser(d: str):
        if "." in d:
            return datetime.strptime(d, "%Y-%m-%d %H:%M:%S.%f %Z")
        return datetime.strptime(d, "%Y-%m-%d %H:%M:%S %Z")

    df = pd.read_csv(
        file,
        parse_dates=["timestamp"],
        date_parser=date_parser
    )
    
    df = df.rename({"timestamp": "date"}, axis=1)
    df["timestamp"] = df["date"].apply(lambda x: x.timestamp())
    df = df.sort_values(by="timestamp", ascending=False).reset_index(drop=True)
    df["rgb"] = df["pixel_color"].apply(lambda x: matplotlib.colors.to_rgb(x))

    return df

In [5]:
def merge_datasets(files: str, to_csv_all=False, to_csv_step=False, to_csv_each=False):
    df_main = pd.DataFrame()

    for i in range(len(files)):
        print(f"Adding {files[i]} ({i / len(files) * 100}%)")
        
        filename = os.path.basename(files[i])
        df = open_dataset(files[i])

        if to_csv_all or to_csv_step:
            df_main = df_main.append(df)
            df_main = df_main.sort_values(by="timestamp", ascending=False).reset_index(drop=True)

        if to_csv_step:
            path = os.path.join(dest_path, "steps", filename)
            print(f"Saving step: {path}")
            df_main.to_csv(path)

        if to_csv_each:
            path = os.path.join(dest_path, "processed", filename)
            print(f"Saving processed: {path}")
            df.to_csv(path)

    df_main = df_main.sort_values(by="timestamp", ascending=False).reset_index(drop=True)

    if to_csv_all:
        path = os.path.join(dest_path, "all.csv")
        print(f"Saving all: {path}")
        df_main.to_csv(path)

    return df_main

In [6]:
df = merge_datasets(files[57:], False, False, True)
df

Adding /Volumes/owen/datas/rplace/2022_place_canvas_history-000000000057.csv (0.0%)
Saving processed: /Volumes/owen/datas/rplace/processed/2022_place_canvas_history-000000000057.csv
Adding /Volumes/owen/datas/rplace/2022_place_canvas_history-000000000058.csv (4.545454545454546%)
Saving processed: /Volumes/owen/datas/rplace/processed/2022_place_canvas_history-000000000058.csv
Adding /Volumes/owen/datas/rplace/2022_place_canvas_history-000000000059.csv (9.090909090909092%)
Saving processed: /Volumes/owen/datas/rplace/processed/2022_place_canvas_history-000000000059.csv
Adding /Volumes/owen/datas/rplace/2022_place_canvas_history-000000000060.csv (13.636363636363635%)
Saving processed: /Volumes/owen/datas/rplace/processed/2022_place_canvas_history-000000000060.csv
Adding /Volumes/owen/datas/rplace/2022_place_canvas_history-000000000061.csv (18.181818181818183%)
Saving processed: /Volumes/owen/datas/rplace/processed/2022_place_canvas_history-000000000061.csv
Adding /Volumes/owen/datas/rplac

KeyError: 'timestamp'