In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset, Image as dsImage, Value as scalar, ClassLabel 

In [2]:
# Load the stage-3 table from its Feather file and display it.
df=pd.read_feather("./data/stage3.feather")
df

Unnamed: 0,path,volcano_certainty,day_night,has_fume,is_explosion,predicted,certainty
0,images\2006\jun\p0622063.jpg,10,5,4,0,UNK,0.998998
1,images\2023\jul\p0704235.jpg,0,5,4,0,UNK,0.998997
2,images\2013\sep\p0926132.jpg,0,5,4,0,UNK,0.998995
3,images\2019\ene\p0120193.jpg,10,0,0,0,UNK,0.998994
4,images\2018\jul\p0723181.jpg,10,10,4,0,UNK,0.998969
...,...,...,...,...,...,...,...
19345,images\2015\may\p0530155.jpg,0,10,4,4,EXP,0.489222
19346,images\2016\oct\p1028164.jpg,0,10,4,4,EXP,0.461512
19347,images\2018\dic\p1231184.jpg,0,10,4,4,EXP,0.459877
19348,images\2018\jun\p0603184.jpg,0,10,4,4,EXP,0.421862


In [3]:
# Clean up: remove unusable rows and columns that are no longer needed

In [4]:
# Rows predicted "UNK" with a certainty of exactly 0 are images that could not be loaded.
unloadable = (df["predicted"] == "UNK") & (df["certainty"] == 0)
df.drop(index=df.index[unloadable], inplace=True)

In [5]:
# Drop images cached by Jupyter: any path containing the literal text ".ipynb".
# regex=False is required for a literal match; with the default regex matching,
# the "." would match any character (e.g. "xipynb" would also be dropped).
jupyter_cached = df["path"].str.contains(".ipynb", regex=False)
df.drop(index=df.index[jupyter_cached], inplace=True)

In [6]:
# Rows with volcano_certainty below 5 are treated as showing no volcano.
no_volcano = df["volcano_certainty"] < 5
df.drop(index=df.index[no_volcano], inplace=True)

In [7]:
# These columns were only needed for the filters above; drop them so that
# only the path and the predictor columns remain.
unused_columns = ["predicted", "certainty", "volcano_certainty"]
df.drop(columns=unused_columns, inplace=True)

In [8]:
# One row per image: keep the first occurrence of every path.
df.drop_duplicates(subset=["path"], keep="first", inplace=True)

In [9]:
# Convert Windows-style backslash separators to forward slashes.
# regex=False makes this a literal replacement on every pandas version: the
# default of `regex` changed in pandas 2.0, and a lone backslash is not a
# valid regular expression.
df["path"]=df["path"].str.replace("\\","/", regex=False)

In [10]:
#normalize predictors

In [11]:
def norm(x):
    """Min-max scale ``x`` so that its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    x : np.ndarray or pd.Series
        Numeric values to rescale.

    Returns
    -------
    Same kind of object as ``x``, holding ``(x - min) / (max - min)``.
    A constant input (max == min) yields all zeros instead of the NaNs a
    0/0 division would produce.
    """
    lo = np.min(x)
    span = np.max(x) - lo
    # Guard only the scalar zero-range case; every other input follows the
    # plain formula unchanged.
    if np.ndim(span) == 0 and span == 0:
        return (x - lo) * 0.0
    return (x - lo) / span

In [12]:
# Rescale every predictor column independently with norm().
predictor_columns = ("day_night", "has_fume", "is_explosion")
for predictor in predictor_columns:
    df[predictor] = norm(df[predictor])

In [13]:
# Show (rows, columns) of df after the cleanup and normalization cells.
df.shape

(18908, 4)

In [14]:
# Number of distinct paths that occur more than once (0 means all paths are unique).
path_counts = df["path"].value_counts()
path_counts.gt(1).sum()

0

In [15]:
# Join the previous labels for a 1-to-1 comparison and, as an aside, derive the new classes from the predictors

In [16]:
# Load the earlier classification table that the new labels are compared against.
# NOTE(review): this path lives under the user's home directory, so the
# notebook is not portable as written.
old_data=pd.read_feather("~/MiriTesis/Reclassified.feather")

In [17]:
# The "y" and "my" columns are not used in the comparison.
old_data.drop(["y", "my"], axis="columns", inplace=True)

In [18]:
# Keep a single row per "Path" (the first one) so the join below cannot multiply rows.
old_data.drop_duplicates(subset=["Path"], keep="first", inplace=True)

In [19]:
# Left join: every row of df is kept, with the old label attached where one exists.
merged = df.merge(old_data, how="left", left_on="path", right_on="Path")
# "Path" duplicates "path" after the join, so it is dropped.
joint = merged.drop(columns="Path")

In [20]:
# The left join must neither add nor remove rows.
len(joint) == len(df)

True

In [21]:
# Move the previous label column "ny" to the name "old_class"
# (pop removes "ny" and the assignment appends it back as the last column).
joint["old_class"] = joint.pop("ny")

In [22]:
def reg_to_class(x, fume_threshold=.75, explosion_threshold=.1): #[b, 3]
    """Map regression scores to class names.

    Parameters
    ----------
    x : np.ndarray of shape (b, 3)
        Column 1 is compared against ``fume_threshold`` and column 2 against
        ``explosion_threshold``. Column 0 is not used.
    fume_threshold : float, default 0.75
        Column 1 must be strictly greater than this to count as fume.
    explosion_threshold : float, default 0.1
        Column 2 must be strictly greater than this to count as explosion.

    Returns
    -------
    np.ndarray of str, shape (b,)
        "EXP+FUM" (both above threshold), "FUM" (only column 1), "EXP"
        (only column 2), otherwise "INA". Rows where either score is NaN
        also map to "INA", because every comparison with NaN is False.
    """
    fume_score = x[:, 1]
    explosion_score = x[:, 2]

    # Both sides of each threshold are spelled out as explicit comparisons
    # (instead of negating a mask) so that NaN rows match none of the
    # conditions and fall through to the default.
    fume_hi = fume_score > fume_threshold
    fume_lo = fume_score <= fume_threshold
    explosion_hi = explosion_score > explosion_threshold
    explosion_lo = explosion_score <= explosion_threshold

    return np.select(
        [fume_hi & explosion_hi, fume_hi & explosion_lo, fume_lo & explosion_hi],
        ["EXP+FUM", "FUM", "EXP"],
        default="INA",
    )

In [23]:
# Predictor columns as a plain 2-D array; reg_to_class reads columns 1 and 2 of it.
x = joint[["day_night", "has_fume", "is_explosion"]].to_numpy()

In [24]:
# Store the class name derived from the predictor scores as a new column.
joint["new_class"]=reg_to_class(x)

In [25]:
# Summary statistics of the numeric columns.
joint.describe()

Unnamed: 0,day_night,has_fume,is_explosion
count,18908.0,18908.0,18908.0
mean,0.905633,0.751296,0.370087
std,0.27197,0.378503,0.460627
min,0.0,0.0,0.0
25%,1.0,0.4,0.0
50%,1.0,1.0,0.0
75%,1.0,1.0,1.0
max,1.0,1.0,1.0


In [26]:
# Images without a previous label get the placeholder class "UNK".
joint["old_class"] = joint["old_class"].fillna("UNK")

In [27]:
# From here on df refers to the merged frame (the same object as joint, not a copy).
df=joint

In [28]:
# Consolidate the row order: reset the index to a contiguous range

In [29]:
# Replace the index in place with a fresh 0..n-1 range, discarding the old labels.
df.index = pd.RangeIndex(len(df))

In [30]:
# Preview the first rows of the final frame.
df.head()

Unnamed: 0,path,day_night,has_fume,is_explosion,old_class,new_class
0,images/2006/jun/p0622063.jpg,0.5,0.4,0.0,UNK,INA
1,images/2019/ene/p0120193.jpg,0.0,0.0,0.0,UNK,INA
2,images/2018/jul/p0723181.jpg,1.0,0.4,0.0,UNK,INA
3,images/2015/mar/p0329153.jpg,1.0,0.4,0.0,UNK,INA
4,images/2003/oct/p1021031.jpg,1.0,0.4,0.0,UNK,INA


In [31]:
# Column dtypes and non-null counts of the final frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18908 entries, 0 to 18907
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   path          18908 non-null  object 
 1   day_night     18908 non-null  float64
 2   has_fume      18908 non-null  float64
 3   is_explosion  18908 non-null  float64
 4   old_class     18908 non-null  object 
 5   new_class     18908 non-null  object 
dtypes: float64(3), object(3)
memory usage: 886.4+ KB


In [32]:
# Final (rows, columns) of the frame that is written out below.
df.shape

(18908, 6)

In [33]:
# Persist the merged, labelled frame as the stage-4 Feather file.
df.to_feather("./data/stage4.feather")

In [34]:
def gen():
    """Yield one example dict per row of the module-level ``df``.

    Each dict renames the frame's columns to the dataset's field names:
    day_night -> light_level, has_fume -> fume_strength,
    is_explosion -> explosion_strength, path -> image,
    old_class -> old_class, new_class -> class.
    """
    source_columns = ("day_night", "has_fume", "is_explosion", "path", "old_class", "new_class")
    output_keys = ("light_level", "fume_strength", "explosion_strength", "image", "old_class", "class")
    # Walk the selected columns in lockstep, one tuple of values per row.
    for row_values in zip(*(df[name] for name in source_columns)):
        yield dict(zip(output_keys, row_values))

In [36]:
# Label vocabulary passed as `names` to ClassLabel for the "old_class" and "class" columns.
class_names=["UNK","INA","FUM","EXP","EXP+FUM"]

In [37]:
# Build the dataset from the row generator, formatted for torch.
dataset = Dataset.from_generator(gen).with_format("torch")

# The "image" column holds file paths; cast it to the Image feature.
dataset = dataset.cast_column("image", dsImage())

# The three predictor scores are stored as float32.
for float_column in ("light_level", "fume_strength", "explosion_strength"):
    dataset = dataset.cast_column(float_column, scalar("float32"))

# Both label columns share the same class vocabulary.
for label_column in ("old_class", "class"):
    dataset = dataset.cast_column(
        label_column,
        ClassLabel(num_classes=len(class_names), names=class_names),
    )

# 90/10 split, stratified on the new class, with a fixed seed.
dataset = dataset.train_test_split(test_size=.1, seed=36918, stratify_by_column="class")

Casting the dataset:   0%|          | 0/18908 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/18908 [00:00<?, ? examples/s]

In [38]:
# Write both splits under data/dataset/, in shards of at most 80MB, with num_proc=12.
dataset.save_to_disk("data/dataset/",max_shard_size="80MB",num_proc=12)

Saving the dataset (0/12 shards):   0%|          | 0/17017 [00:00<?, ? examples/s]

Saving the dataset (0/12 shards):   0%|          | 0/1891 [00:00<?, ? examples/s]

In [None]:
# Push the dataset to the Hub repository "mirluvams/popocatepetl" as a private repo.
dataset.push_to_hub("mirluvams/popocatepetl", private=True)

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/8509 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/86 [00:00<?, ?ba/s]