In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset, Image as dsImage, Value as scalar, ClassLabel 
from PIL import Image
import torchvision

In [2]:
# Load the stage-3 table and discard the "time" column, which is not
# referenced anywhere else in this notebook.
df=pd.read_feather("./data/stage3.feather").drop(columns=["time"])
df

Unnamed: 0,image,labels
0.0,images\1999\ene\p0131991.jpg,FUMAROLA
1.0,images\1999\ene\p0131992.jpg,FUMAROLA
2.0,images\1999\ene\p0131993.jpg,FUMAROLA
3.0,images\1999\ene\p0131994.jpg,FUMAROLA
4.0,images\1999\ene\p0131995.jpg,FUMAROLA
...,...,...
20164.0,images\2024\image_96.jpg,NO_VOLCAN
20165.0,images\2024\image_97.jpg,FUMAROLA
20166.0,images\2024\image_98.jpg,FUMAROLA
20167.0,images\2024\image_99.jpg,FUMAROLA


In [3]:
# Remove every row that has a missing value in any column.
df.dropna(axis="index", how="any", inplace=True)

In [4]:
# Clean up rows that cannot be used for training

In [5]:
# Rows labelled NO_VOLCAN are excluded: cannot load these images / no volcano.
is_no_volcano=df["labels"]=="NO_VOLCAN"
df.drop(index=df.index[is_no_volcano], inplace=True)

In [6]:
# Drop cached images from jupyter: any path containing the literal text ".ipynb".
# regex=False matters here: by default `str.contains` treats the pattern as a
# regular expression, in which "." would match any character.
df.drop(df[df["image"].str.contains(".ipynb", regex=False)].index,inplace=True)

In [7]:
# Keep only the first row for each image path.
df.drop_duplicates(subset=["image"], keep="first", inplace=True)

In [8]:
# Convert Windows-style backslashes in the image paths to forward slashes.
# regex=False forces a literal replacement: the default for `regex` differs
# between pandas versions, and a lone backslash is not a valid regex pattern.
df["image"]=df["image"].str.replace("\\","/",regex=False)

In [9]:
# Consolidate: rebuild a contiguous index after the row drops above

In [10]:
# Renumber the rows 0..n-1; drop=True discards the old index labels instead of
# keeping them as a new column.
df.reset_index(drop=True, inplace=True)

In [11]:
# Check that every image can be opened and decoded as RGB.
# `faults` collects the index labels of the rows that fail; a later cell drops them.
faults=[]
for idx, path in df["image"].items():
    try:
        # First check: Pillow must be able to open the file and report mode "RGB".
        with Image.open(path) as img:
            mode=img.mode
        if mode!="RGB":
            # Report the real cause (colour mode) together with the offending value.
            print(f"FAULT AT {path}: Unsupported image mode {mode!r} (expected RGB)")
            faults.append(idx)
            continue
        # Second check: force a full decode so corrupt or truncated files raise here.
        # Only the success or failure matters, so the decoded tensor is discarded.
        torchvision.io.read_image(
            path,
            mode=torchvision.io.ImageReadMode.RGB,
        )
    except Exception as E:
        # Best effort by design: any failure to open or decode marks the row as faulty.
        print(f"FAULT AT {path}: {E}")
        faults.append(idx)

FAULT AT images/2001/nov/p1105011.jpg: Unsupported image file. Only jpeg, png and gif are currently supported.
FAULT AT images/2001/nov/p1107011.jpg: Unsupported image file. Only jpeg, png and gif are currently supported.
FAULT AT images/2010/nov/p1102101.jpg: Unsupported image file. Only jpeg, png and gif are currently supported.
FAULT AT images/2011/ene/p0126112.jpg: Image is incomplete or truncated
FAULT AT images/2016/jul/p0703161.jpg: Image is incomplete or truncated
FAULT AT images/2016/sep/p0913161.jpg: Image is incomplete or truncated
FAULT AT images/2018/jun/p0602181.jpg: Image is incomplete or truncated
FAULT AT images/2018/may/p0528183.jpg: Image is incomplete or truncated
FAULT AT images/2019/jun/p0612197.png: Invalid dimensions
FAULT AT images/2019/mar/p0307195.png: Invalid dimensions
FAULT AT images/2021/dic/p1209211.png: Invalid dimensions
FAULT AT images/2021/dic/p1210211.png: Invalid dimensions
FAULT AT images/2021/nov/p1117212.png: Invalid dimensions
FAULT AT images/2

In [12]:
# Remove the rows whose index labels were collected in `faults`.
df.drop(index=faults, inplace=True)

In [13]:
# Per-class row counts, followed by the total number of rows.
df["labels"].value_counts(), len(df)

(labels
 FUMAROLA    8364
 ERUPCION    6789
 INACTIVO    1144
 Name: count, dtype: int64,
 16297)

In [14]:
# Summary of the frame: index range, columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16297 entries, 0 to 16336
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   16297 non-null  object
 1   labels  16297 non-null  object
dtypes: object(2)
memory usage: 382.0+ KB


In [15]:
# (rows, columns) of the cleaned frame.
df.shape

(16297, 2)

In [16]:
# WEIGHTS FOR CLASSIFICATION LOSS

In [17]:
# Number of rows per class, most frequent class first.
df["labels"].value_counts()

labels
FUMAROLA    8364
ERUPCION    6789
INACTIVO    1144
Name: count, dtype: int64

In [18]:
# Class weights: for each class take (total rows - class count) cubed, then
# normalise so the weights sum to 1. Rarer classes therefore get larger weights.
# NOTE(review): the result is ordered by descending class count (value_counts
# order), not by the integer class ids used later - align by label name before
# passing these to a loss function.
n_rows=df.shape[0]
label_counts=df["labels"].value_counts()
unnormalised=(n_rows-label_counts).abs()**3
freq=unnormalised/unnormalised.sum()
freq

labels
FUMAROLA    0.103190
ERUPCION    0.177661
INACTIVO    0.719150
Name: count, dtype: float64

In [19]:
# Persist the cleaned table for the next stage.
# Rows were dropped after the last index reset, so the index has gaps. Older
# pandas versions refuse to write a non-default index to feather, and newer ones
# would store the stale labels. Write a copy with a fresh 0..n-1 index instead;
# `df` itself is left untouched.
df.reset_index(drop=True).to_feather("./data/stage4.feather")

In [20]:

def gen():
    """Yield one ``{"image": ..., "labels": ...}`` record per row of the global ``df``.

    Rows are produced in the frame's current order; "image" carries the value
    of the ``image`` column and "labels" the value of the ``labels`` column.
    """
    for path, label in zip(df["image"], df["labels"]):
        yield {"image": path, "labels": label}

In [21]:
# Sorted array of the distinct label values still present in the frame.
np.unique(df["labels"])

array(['ERUPCION', 'FUMAROLA', 'INACTIVO'], dtype=object)

In [22]:
# Label names in the order passed to ClassLabel below; the position in this
# list becomes the integer class id (INACTIVO=0, FUMAROLA=1, ERUPCION=2).
class_names=['INACTIVO', 'FUMAROLA', 'ERUPCION']

In [23]:
# Build the Hugging Face dataset from the cleaned frame:
#   1. materialise the rows yielded by `gen` and request torch-formatted output,
#   2. cast the path column to an Image feature and the labels to ClassLabel ids,
#   3. make a 90/10 train/test split, stratified by label, with a fixed seed.
dataset=(
    Dataset.from_generator(gen)
    .with_format("torch")
    .cast_column("image", dsImage())
    .cast_column("labels", ClassLabel(num_classes=len(class_names), names=class_names))
    .train_test_split(test_size=.1, seed=36918, stratify_by_column="labels")
)

In [24]:
# Show the resulting splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['image', 'labels'],
        num_rows: 14667
    })
    test: Dataset({
        features: ['image', 'labels'],
        num_rows: 1630
    })
})

In [25]:
# Write both splits to data/dataset/, using 12 worker processes and a
# requested maximum shard size of 80MB.
dataset.save_to_disk("data/dataset/",max_shard_size="80MB",num_proc=12)

Saving the dataset (0/12 shards):   0%|          | 0/14667 [00:00<?, ? examples/s]

Saving the dataset (0/12 shards):   0%|          | 0/1630 [00:00<?, ? examples/s]

In [26]:
# Upload both splits to the Hugging Face Hub repository
# "mirluvams/popocatepetl"; private=True requests a private repository.
dataset.push_to_hub("mirluvams/popocatepetl", private=True)

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/7334 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Map:   0%|          | 0/7333 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/1630 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/17 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/537 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mirluvams/popocatepetl/commit/d466d2607e8359062f08a0487bad8822360d8276', commit_message='Upload dataset', commit_description='', oid='d466d2607e8359062f08a0487bad8822360d8276', pr_url=None, pr_revision=None, pr_num=None)