In [3]:
import os 
import pandas as pd
import numpy as np
import rasterio
from rasterio.plot import show
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import warnings

# Globally silence rasterio's NotGeoreferencedWarning for the rest of the
# session. NOTE(review): presumably the dataset's TIFFs carry no
# georeferencing metadata, which would trigger this warning on every
# rasterio.open call — confirm against the data.
warnings.filterwarnings('ignore', category=rasterio.errors.NotGeoreferencedWarning)

def create_df_from_tif(path):
    """Build a DataFrame of flattened RGB channels from class-labelled TIFFs.

    ``path`` is scanned for sub-directories. Each sub-directory name is used
    as the class label of the ``.tif``/``.tiff`` files it directly contains
    (the extension check is case-insensitive). The first three raster bands
    of every image are flattened to 1-D arrays and stored under the
    ``red_channel``, ``green_channel`` and ``blue_channel`` keys.

    Args:
        path: Root directory containing one sub-directory per class.

    Returns:
        A DataFrame with one row per image and the columns ``class``
        (integer code assigned by ``LabelEncoder``), ``red_channel``,
        ``green_channel`` and ``blue_channel``. When no image is found the
        frame is empty but still carries these four columns.

    Side effects:
        Prints the class-name -> integer-code mapping.
    """
    columns = ['class', 'red_channel', 'green_channel', 'blue_channel']
    data = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the resulting frame reproducible between runs and machines.
    for class_folder in sorted(os.listdir(path)):
        class_path = os.path.join(path, class_folder)
        if not os.path.isdir(class_path):
            continue  # stray files next to the class folders are ignored

        for filename in sorted(os.listdir(class_path)):
            if not filename.lower().endswith(('.tif', '.tiff')):
                continue

            file_path = os.path.join(class_path, filename)
            with rasterio.open(file_path) as src:
                # Only bands 1-3 (rasterio band indexes are 1-based) are
                # needed, so avoid loading any additional bands.
                red, green, blue = src.read([1, 2, 3])

            data.append({
                'class': class_folder,
                'red_channel': red.flatten(),
                'green_channel': green.flatten(),
                'blue_channel': blue.flatten(),
            })

    # Passing the column names explicitly keeps the schema stable even when
    # no image was found (otherwise the frame would have no columns at all
    # and the 'class' lookup below would raise KeyError).
    df = pd.DataFrame(data, columns=columns)

    if df.empty:
        class_mapping = {}
    else:
        le = LabelEncoder()
        df['class'] = le.fit_transform(df['class'])
        # Cast to plain int so the printed mapping reads {'Cloud': 0, ...}
        # regardless of how the installed NumPy version reprs its scalars.
        class_mapping = {
            name: int(code)
            for name, code in zip(le.classes_, le.transform(le.classes_))
        }
    print(f'--- CLASS MAPPING --- \n{class_mapping}')

    return df

# Dataset root (one sub-directory per class) and the folder that receives
# the generated parquet file.
path = '/Users/pimpijnenburg/Desktop/Thesis/USTC_SmokeRS_dataset/data/USTC_SmokeRS'
output_dir = '/Users/pimpijnenburg/Desktop/Thesis/USTC_SmokeRS_dataset/data/created_data/smoke_data'

df = create_df_from_tif(path)

# Make sure the target folder exists before writing, so a fresh checkout
# does not fail on the parquet export after all images were already read.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'smoke_data.parquet')
df.to_parquet(output_file)

print(f"Data saved to: {output_dir}")

--- CLASS MAPPING --- 
{'Cloud': 0, 'Dust': 1, 'Haze': 2, 'Land': 3, 'Seaside': 4, 'Smoke': 5}
Data saved to: /Users/pimpijnenburg/Desktop/Thesis/USTC_SmokeRS_dataset/data/created_data/smoke_data
