In [34]:

import pandas as pd
import numpy as np
from datasets import Dataset, Image as dsImage, Value as scalar

# Load the stage-3 table (image paths plus annotation and prediction columns).
stage3_path = "./data/stage3.feather"
df = pd.read_feather(stage3_path)
df  # bare last expression: rich preview of the frame

Unnamed: 0,path,volcano_certainty,day_night,has_fume,is_explosion,predicted,certainty
0,images\2006\jun\p0622063.jpg,10,5,4,0,UNK,0.998998
1,images\2023\jul\p0704235.jpg,0,5,4,0,UNK,0.998997
2,images\2013\sep\p0926132.jpg,0,5,4,0,UNK,0.998995
3,images\2019\ene\p0120193.jpg,10,0,0,0,UNK,0.998994
4,images\2018\jul\p0723181.jpg,10,10,4,0,UNK,0.998969
...,...,...,...,...,...,...,...
19345,images\2015\may\p0530155.jpg,0,10,4,4,EXP,0.489222
19346,images\2016\oct\p1028164.jpg,0,10,4,4,EXP,0.461512
19347,images\2018\dic\p1231184.jpg,0,10,4,4,EXP,0.459877
19348,images\2018\jun\p0603184.jpg,0,10,4,4,EXP,0.421862


In [35]:
# Rows predicted "UNK" with a certainty of exactly 0 mark images that cannot be loaded.
unloadable = (df["predicted"] == "UNK") & (df["certainty"] == 0)
df.drop(index=df.loc[unloadable].index, inplace=True)

In [36]:
# Drop cached images from jupyter (paths containing ".ipynb").
# regex=False matches ".ipynb" literally; as a regular expression the "."
# would match any character and could remove unrelated paths.
in_notebook_cache = df["path"].str.contains(".ipynb", regex=False)
df.drop(index=df.loc[in_notebook_cache].index, inplace=True)

In [37]:
# Remove rows whose volcano_certainty is below 5, i.e. where there is no volcano.
no_volcano = df["volcano_certainty"] < 5
df.drop(index=df.loc[no_volcano].index, inplace=True)

In [38]:
# These columns are not prediction targets; drop them so only the path and
# the three label columns remain.
non_target_columns = ["predicted", "certainty", "volcano_certainty"]
df.drop(columns=non_target_columns, inplace=True)

In [39]:
def norm(x):
    """Min-max scale ``x`` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    x : numpy array or pandas Series of numbers.

    Returns
    -------
    Same kind of object as ``x - np.min(x)``, with values in [0, 1].
    A constant input (max == min) returns all zeros instead of the NaNs a
    0/0 division would produce.
    """
    lo = np.min(x)
    span = np.max(x) - lo
    if span == 0:
        # Constant input: every value equals the minimum, so dividing by 1
        # leaves the zeros from (x - lo) untouched and avoids 0/0.
        span = 1
    return (x - lo) / span

In [40]:
# Rescale each label column independently with norm (applied column by column).
label_columns = ["day_night", "has_fume", "is_explosion"]
df[label_columns] = df[label_columns].apply(norm)

In [41]:
# Renumber the rows 0..n-1 after the drops; the old index labels are discarded.
df = df.reset_index(drop=True)

In [42]:
# Preview the first five rows of the cleaned, normalized frame.
df.head(n=5)

Unnamed: 0,path,day_night,has_fume,is_explosion
0,images\2006\jun\p0622063.jpg,0.5,0.4,0.0
1,images\2019\ene\p0120193.jpg,0.0,0.0,0.0
2,images\2018\jul\p0723181.jpg,1.0,0.4,0.0
3,images\2015\mar\p0329153.jpg,1.0,0.4,0.0
4,images\2003\oct\p1021031.jpg,1.0,0.4,0.0


In [43]:
# Show (rows, columns) of the frame after filtering.
df.shape

(18908, 4)

In [44]:
# Persist the cleaned frame as the stage-4 checkpoint.
stage4_path = "./data/stage4.feather"
df.to_feather(stage4_path)

In [45]:
def gen():
    """Yield one example dict per row of the module-level ``df``.

    Each dict holds the image path under "pixel_values" (backslashes replaced
    by forward slashes) and the three label columns under "light_level",
    "fume_strength" and "explosion_strength".
    """
    # Walk the needed columns in lockstep rather than DataFrame.iterrows(),
    # which builds a throwaway Series per row and yields an index nobody uses.
    columns = (df["path"], df["day_night"], df["has_fume"], df["is_explosion"])
    for path, light, fume, explosion in zip(*columns):
        yield {
            "pixel_values": path.replace("\\", "/"),  # linux compatibility with / paths.
            "light_level": light,
            "fume_strength": fume,
            "explosion_strength": explosion,
        }

In [46]:
# Build the dataset from gen, then give each column its feature type:
# the path column becomes an image feature, the three labels become float32.
dataset = Dataset.from_generator(gen).cast_column("pixel_values", dsImage())
for label_name in ("light_level", "fume_strength", "explosion_strength"):
    dataset = dataset.cast_column(label_name, scalar("float32"))

# Hold out 10% of the examples as the test split (fixed seed).
dataset = dataset.train_test_split(test_size=0.1, seed=36918)

In [47]:
# Keep a shuffled subset of at most 1000 examples per split.
# min() covers a split holding fewer than 1000 rows, where
# select(range(1000)) would fail with an out-of-range index.
for split in ("train", "test"):
    shuffled = dataset[split].shuffle(seed=42)
    dataset[split] = shuffled.select(range(min(1000, len(shuffled))))

In [None]:
import torchvision.transforms as transforms
from PIL import ImageFile

# Let PIL load image files that are truncated instead of raising on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Pipeline with a single step: PIL image -> tensor.
transform = transforms.Compose([transforms.PILToTensor()])

def totensor(ex):
    """Replace ``ex["pixel_values"]`` with the module-level ``transform`` applied to it.

    The example is modified in place and also returned.
    """
    converted = transform(ex["pixel_values"])
    ex["pixel_values"] = converted
    return ex
# Run totensor over every example of both splits; only the train call passes num_proc=1.
for split, map_options in (("train", {"num_proc": 1}), ("test", {})):
    dataset[split] = dataset[split].map(totensor, **map_options)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [32]:
# Look at the first training example to check its fields and value types.
first_train_example = dataset["train"][0]
first_train_example

{'pixel_values': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=704x480 at 0x7FB0525861A0>,
 'light_level': 1.0,
 'fume_strength': 1.0,
 'explosion_strength': 1.0}

In [33]:
# Write both splits under data/dataset/ using 12 worker processes and an 80MB shard size limit.
dataset.save_to_disk(
    "data/dataset/",
    max_shard_size="80MB",
    num_proc=12,
)

Saving the dataset (0/12 shards):   0%|          | 0/17017 [00:00<?, ? examples/s]

Saving the dataset (0/12 shards):   0%|          | 0/1891 [00:00<?, ? examples/s]