In [None]:
import os 
import pandas as pd
import numpy as np
import rasterio
from rasterio.plot import show
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings

# Silence rasterio's NotGeoreferencedWarning so it does not flood the cell
# output while the images are being read.
warnings.filterwarnings(
    action='ignore',
    category=rasterio.errors.NotGeoreferencedWarning,
)

def create_df_from_tif(path):
    """Load every TIFF image found under ``path/<class_folder>/`` into a DataFrame.

    Each immediate sub-directory of ``path`` is treated as one class; its name
    is used as the class label. Plain files directly inside ``path`` are
    ignored, as are files in a class folder that do not have a ``.tif`` or
    ``.tiff`` extension (matched case-insensitively).

    Parameters
    ----------
    path : str
        Root directory that contains one sub-directory per class.

    Returns
    -------
    df : pandas.DataFrame
        One row per image with the columns ``class`` (integer label produced
        by ``LabelEncoder``), ``red_channel``, ``green_channel`` and
        ``blue_channel``. The channel columns hold the first three bands of
        the image, each flattened to a 1-D array.
    class_mapping : dict
        Maps each integer label (``int``) to its class folder name (``str``).

    Raises
    ------
    ValueError
        If no TIFF file is found in any class folder under ``path``.
    """
    records = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the resulting DataFrame reproducible across machines and runs.
    for class_folder in sorted(os.listdir(path)):
        class_path = os.path.join(path, class_folder)
        if not os.path.isdir(class_path):
            continue  # skip stray files next to the class folders

        for filename in sorted(os.listdir(class_path)):
            # Case-insensitive so that '.TIF' / '.tiff' files are not silently dropped.
            if not filename.lower().endswith(('.tif', '.tiff')):
                continue

            file_path = os.path.join(class_path, filename)
            with rasterio.open(file_path) as src:
                image = src.read()

            records.append({
                'class': class_folder,
                'red_channel': image[0].flatten(),
                'green_channel': image[1].flatten(),
                'blue_channel': image[2].flatten(),
            })

    if not records:
        # Without this guard the empty frame has no 'class' column and the
        # label-encoding step below fails with an opaque KeyError.
        raise ValueError(
            f"No .tif/.tiff files found in any class folder under: {path!r}"
        )

    df = pd.DataFrame(records)

    le = LabelEncoder()
    df['class'] = le.fit_transform(df['class'])
    # The label of a class is its position in le.classes_. Plain int/str keep
    # the mapping readable when printed and behave the same as dictionary keys.
    class_mapping = {int(code): str(name) for code, name in enumerate(le.classes_)}
    print(f'--- CLASS MAPPING --- \n{class_mapping}')

    return df, class_mapping

# NOTE: these are absolute, machine-specific locations; adjust them when
# running the notebook on another machine.
path = '/Users/pimpijnenburg/Desktop/Thesis/USTC_SmokeRS_dataset/data/USTC_SmokeRS'
output_dir = '/Users/pimpijnenburg/Desktop/Thesis/USTC_SmokeRS_dataset/data/created_data/smoke_data'

# Read all images into a DataFrame and keep the label -> class-name mapping.
df, classes = create_df_from_tif(path)

# to_parquet does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'smoke_data.parquet')
df.to_parquet(output_file)

# Report the file that was actually written, not just its folder.
print(f"Data saved to: {output_file}")

In [None]:
# Count the images per encoded label, then replace each label with its
# class name (via the mapping returned by create_df_from_tif) for the x axis.
class_counts = df['class'].value_counts().sort_index()
class_counts_named = pd.Series(class_counts.values, index=[classes[i] for i in class_counts.index])

sns.set_style('darkgrid')
plt.rcParams.update({'font.size': 20})

# Explicit figure/axes handles so the figure that is saved below is
# unambiguously the one drawn here.
fig, ax = plt.subplots(figsize=(18, 8))
sns.barplot(x=class_counts_named.index, y=class_counts_named, estimator=np.sum, ax=ax)
ax.set_title('Class distribution', fontsize=20, pad=20)
ax.set_xlabel('Class', fontsize=16)
ax.set_ylabel('Count')

image_output_dir = '/Users/pimpijnenburg/Desktop/Thesis/USTC_SmokeRS_dataset/data/generated_images'
# savefig does not create missing directories; create the folder up front.
os.makedirs(image_output_dir, exist_ok=True)
plot_output = os.path.join(image_output_dir, 'class_distribution.png')
fig.tight_layout()
fig.savefig(plot_output)
print(f"Class distribution plot saved to: {plot_output}")

plt.show()