<a href="https://colab.research.google.com/github/niklashaffert/LandUseClassification/blob/main/01_data_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

This notebook downloads the project data from Kaggle, builds a pandas DataFrame for each of the training, validation and test sets, and saves each DataFrame to Google Drive.

# Settings

## Libraries

In [6]:
from google.colab import drive
from kagglehub import dataset_download
from numpy import array
from os import path
from pandas import read_csv, DataFrame
from pickle import dump
from PIL import Image

## Drive Settings

In [7]:
# Mount Google Drive into the Colab file system at /content/drive.
drive.mount('/content/drive')
# Project folder on the mounted drive; used as the base path when storing the data frames.
drive_path = '/content/drive/MyDrive/LandUseClassification'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Load Data from Kaggle

In [8]:
# Fetch the dataset through kagglehub. The returned value is used by load_data
# as the base directory for the split CSV files and the 'images/' folder.
path_datasets = dataset_download('apollo2506/landuse-scene-classification')

# Generate Data Frames with Images

In [9]:
def load_data(name_csv, fixed_size=(224, 224)):
    """Load the images listed in a split CSV into a pandas DataFrame.

    The CSV is read from ``path_datasets``; every image it lists is opened
    from the ``images/`` sub-folder, resized and stored together with its
    metadata. Images that cannot be loaded are reported and skipped, and a
    summary of successes and failures is printed at the end.

    Parameters
    ----------
    name_csv : str
        File name of the CSV, relative to ``path_datasets``. Its columns are
        read by position: 1 = file name, 2 = label, 3 = class name.
    fixed_size : tuple of int, optional
        Target size handed to ``Image.resize``. Defaults to ``(224, 224)``,
        chosen to match the ResNet-18 input size.

    Returns
    -------
    pandas.DataFrame
        One row per successfully loaded image with the columns ``Image``,
        ``Filename``, ``Label``, ``ClassName`` and ``ImageArray`` (the
        flattened pixel array of the resized image).
    """
    # NOTE: this is a plain module-level function. A stray @staticmethod here
    # makes the name non-callable on Python versions before 3.10.
    columns = ['Image', 'Filename', 'Label', 'ClassName', 'ImageArray']

    data_csv = read_csv(path.join(path_datasets, name_csv))

    # Records are collected in a list and turned into a DataFrame once.
    data = []
    failure_count = 0

    # Columns 1..3 are selected by position once, instead of one iloc lookup
    # per cell inside the loop.
    rows = data_csv.iloc[:, 1:4].itertuples(index=False, name=None)
    for filename, label, class_name in rows:
        image_path = path.join(path_datasets, 'images/', filename)
        try:
            # Image.open is lazy and keeps the file open; the context manager
            # releases the handle. resize() returns a new in-memory image that
            # stays valid after the source has been closed.
            with Image.open(image_path) as source:
                img = source.resize(fixed_size)
            data.append({
                'Image': img,
                'Filename': filename,
                'Label': label,
                'ClassName': class_name,
                'ImageArray': array(img).flatten()
            })
        except Exception as e:
            # Best effort: report the broken image and continue with the rest.
            failure_count += 1
            print(f"Failed to load image {image_path}: {e}")

    print(f"Successfully loaded {len(data)} images into DataFrame.")
    print(f"Failed to load {failure_count} images.")

    # Passing the column list keeps the schema stable even if nothing loaded.
    return DataFrame(data, columns=columns)

In [10]:
# Build one DataFrame per split from its CSV listing.
# Each call prints how many images were loaded and how many failed.
df_train = load_data('train.csv')
df_validation = load_data('validation.csv')
df_test = load_data('test.csv')

Successfully loaded 7350 images into DataFrame.
Failed to load 0 images.
Successfully loaded 2100 images into DataFrame.
Failed to load 0 images.
Successfully loaded 1050 images into DataFrame.
Failed to load 0 images.


# Store Data in Drive

In [11]:
def store_drive(object, path):
    """Pickle ``object`` to the file at ``path``.

    Parameters
    ----------
    object : Any
        Picklable Python object to store (for example a pandas DataFrame).
    path : str
        Destination file path. Missing parent directories are created, and an
        existing file at this path is overwritten.

    Notes
    -----
    Pickle files must only be loaded again from trusted locations, because
    unpickling can execute arbitrary code.
    """
    # The parameter name ``path`` hides the module-level ``os.path`` import
    # inside this function, so ``os`` is imported locally.
    import os

    # NOTE: this is a plain module-level function. A stray @staticmethod here
    # makes the name non-callable on Python versions before 3.10.
    parent = os.path.dirname(path)
    if parent:
        # Without this, open() raises FileNotFoundError when the target
        # folder does not exist yet.
        os.makedirs(parent, exist_ok=True)

    with open(path, 'wb') as f:
        dump(object, f)

In [12]:
# Each split is written to its own file so that no split overwrites another.
store_drive(df_train, path.join(drive_path, 'Data/df_train.pkl'))
store_drive(df_validation, path.join(drive_path, 'Data/df_validation.pkl'))
store_drive(df_test, path.join(drive_path, 'Data/df_test.pkl'))