In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm.std import tqdm
import pandas as pd

# Print floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)

In [None]:
def process_single_line(line, id):
    """Turn one comma-separated sample into a per-observation table.

    The text holds integers only: 11 equally sized groups (ten spectral bands
    followed by the day of year), then a single trailing label.

    Parameters
    ----------
    line : str
        Raw text of one sample; surrounding whitespace is stripped first.
    id : int
        Identifier copied into the first column of every output row.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_observations, 14)`` whose columns are, in order:
        id, observation index (0 .. num_observations - 1), label, blue, green,
        red, red_edge_1, red_edge_2, red_edge_3, nir, red_edge_4, swir_1,
        swir_2 and doy. The ten band columns are divided by 10000; doy is
        left as read.

    Raises
    ------
    ValueError
        If a field is not an integer, or if the number of values (label
        excluded) is not a multiple of 11.
    """
    fields = [int(token) for token in line.strip().split(",")]
    label = fields.pop()
    values = np.array(fields)

    # The values must split evenly into the 11 groups (10 bands + doy).
    num_observations, leftover = divmod(values.shape[0], 11)
    if leftover != 0:
        raise ValueError("Number of observations is not an integer")

    # One row per group: rows 0-9 are the bands in file order, row 10 is doy.
    groups = values.reshape(11, num_observations)
    bands = groups[:10] / 10000
    doy = groups[10]

    columns = [
        np.repeat(id, num_observations),
        np.arange(num_observations),
        np.repeat(label, num_observations),
        *bands,
        doy,
    ]
    return np.column_stack(columns)

In [None]:
def convert_to_df(filepath, use_value, start_id):
    """Load a labelled time-series text file into a long-format DataFrame.

    Every non-blank line of the file is parsed with ``process_single_line``
    and receives a consecutive id, beginning at ``start_id``.

    Parameters
    ----------
    filepath : str or path-like
        Text file with one comma-separated sample per line.
    use_value : scalar
        Value written into the ``use_bert`` column of every row.
    start_id : int
        Id given to the first sample; later samples count up from it.

    Returns
    -------
    pandas.DataFrame
        One row per observation with the columns id, time, label, blue, green,
        red, red_edge_1, red_edge_2, red_edge_3, nir, red_edge_4, swir_1,
        swir_2, doy and use_bert. The id, time, doy and label columns are cast
        to int.

    Raises
    ------
    ValueError
        If the file holds no samples, or if a line cannot be parsed.
    """
    column_names = [
        "id", "time", "label",
        "blue", "green", "red",
        "red_edge_1", "red_edge_2", "red_edge_3",
        "nir", "red_edge_4", "swir_1", "swir_2",
        "doy",
    ]
    integer_columns = ["id", "time", "doy", "label"]

    with open(filepath, "r") as file:
        # Blank lines (e.g. extra newlines at the end of the file) carry no
        # sample and would otherwise fail inside int("").
        lines = [line for line in file if line.strip()]

    if not lines:
        # np.vstack([]) would raise a ValueError that does not mention the file.
        raise ValueError(f"No samples found in {filepath!r}")

    samples = [
        process_single_line(line, start_id + offset)
        for offset, line in enumerate(tqdm(lines))
    ]

    # Stacking mixes int and float columns into one float array, so the
    # integer-valued columns are cast back afterwards.
    data = pd.DataFrame(np.vstack(samples), columns=column_names)
    data = data.astype({name: int for name in integer_columns})

    data["use_bert"] = use_value

    return data

In [None]:
# Build one frame per split. The second argument ends up in the `use_bert`
# column (0 for Train.csv, 1 for Validate.csv, 2 for Test.csv). Each split's
# ids start right after the previous split's largest id, so the order of these
# three calls matters.
train_df = convert_to_df("data/California-Labeled/Train.csv", 0, 0)
val_df = convert_to_df("data/California-Labeled/Validate.csv", 1, train_df.id.max() + 1)
test_df = convert_to_df("data/California-Labeled/Test.csv", 2, val_df.id.max() + 1)

In [None]:
# Stack the three splits into a single frame with a fresh row index and save it
# as a brotli-compressed parquet file.
pd.concat([train_df, val_df, test_df], ignore_index=True).to_parquet("data/california_sits_bert_original.parquet", compression="brotli")