# Land Cover Classification using Deep Learning Techniques and EuroSAT

## Mounting google drive for dataset access and saving models

## Libraries

In [1]:
!pip install ipython-autotime
%load_ext autotime

time: 0 ns (started: 2022-07-18 17:00:04 -07:00)


In [2]:
!pip install torchvision

time: 3.25 s (started: 2022-07-18 17:00:04 -07:00)


In [3]:
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from tqdm.notebook import tqdm
import seaborn as sns
import random


from sklearn.metrics import confusion_matrix
from sklearn import model_selection
from sklearn.model_selection import train_test_split


import torch
import torchvision.models as models
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision.transforms import transforms
from torchvision.utils import make_grid
import torch.nn.functional as F

import matplotlib
matplotlib.rcParams['axes.grid'] = False

time: 3.25 s (started: 2022-07-18 17:00:07 -07:00)


## Config

In [4]:
ROOT_PATH = './'
BASE_PATH = '../data/EuroSat_RGB/2750/'
FULL_DATA_DF = os.path.join(ROOT_PATH, 'dataset_label.csv')

IDX_CLASS_LABELS = {
    0: 'AnnualCrop',
    1: 'Forest', 
    2: 'HerbaceousVegetation',
    3: 'Highway',
    4: 'Industrial',
    5: 'Pasture',
    6: 'PermanentCrop',
    7: 'Residential',
    8: 'River',
    9: 'SeaLake'
}
CLASSES = ['AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway', 'Industrial', 'Pasture','PermanentCrop','Residential','River', 'SeaLake']
CLASS_IDX_LABELS = dict()
for key, val in IDX_CLASS_LABELS.items():
  CLASS_IDX_LABELS[val] = key

NUM_CLASSES = len(IDX_CLASS_LABELS.items())
torch.manual_seed(1021)
TEST_SIZE = 0.25

approach = "Resnet"

time: 0 ns (started: 2022-07-18 17:00:11 -07:00)


## Utils

In [5]:
## Give idx of each class name
def encode_label(label):
    idx = CLASS_IDX_LABELS[label] 
    return idx

## Take in idx and return the class name
def decode_target(target, text_labels=True):
    result = []
    if text_labels:
        return IDX_CLASS_LABELS[target]
    else:
        return target

## Show batches of images
def show_batch(dl):
    for images, labels in dl:
        fig, ax = plt.subplots(figsize=(16, 8))
        ax.set_xticks([]); ax.set_yticks([])
        ax.imshow(make_grid(images, nrow=16).permute(1, 2, 0))
        break

time: 0 ns (started: 2022-07-18 17:00:11 -07:00)


In [6]:
## Example for decoding and encoding
print(encode_label('Forest'))
print(decode_target(1))
print(decode_target(2, text_labels=True))

1
Forest
HerbaceousVegetation
time: 0 ns (started: 2022-07-18 17:00:11 -07:00)


## Pre-Processing data folders to train and test
Run this code cells only if you don't have the required dataframe

In [7]:
print(f"Total number of images in the dataset: {os.listdir(BASE_PATH)}")

Total number of images in the dataset: ['AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway', 'Industrial', 'Pasture', 'PermanentCrop', 'Residential', 'River', 'SeaLake']
time: 16 ms (started: 2022-07-18 17:00:11 -07:00)


In [8]:
def read_data(data_directory = "data/",label_filter=[]):
    """
    Reads the data from the files, creates labels by file name

    Parameters: 
        data_dir (string): The name of a directory in which image data is located
        label_filters (list or array): Data labels to load, should match sub-folder names
                                     - default load all subdirectories
        
    Output: 
        Tupel of lists (images, labels) where the former (images) is a list of matrix representations of the 
        scaled image data and the latter (labels) is a list of strings of the corresponding label of each 
        entry in the former (images) based on the sub-folder the image was loaded from. 
    """
    images = []
    labels = []
    loaded = {}
    
    df = pd.DataFrame(columns = ['image_id', 'label'])
    print(df)
    
    filter_labels = ( len(label_filter) >0) # if we didn't pass in anything in the filter read everything

    #Crawl the data directory files and sub-directories...
        
    print(data_directory)
    for dir_name, sub_dir_list, file_list in os.walk(data_directory):

            #Ignore anything in the actual directory itself (all images are in the sub-directories).
            if dir_name == data_directory:
                continue

            #The label will match the sub-folder name (root data folder ignored above.)
            label = os.path.basename(dir_name)
            
            if not filter_labels and not label in label_filter:
                label_filter.append(label)
            
            if label in label_filter: # don't process directory if not in filter
                  
                print(f'Processing label: {label}')

                for file_name in file_list:


                    #Add label to loaded file dictinoary if it's not there already
                    #(To keep track of how may files of this label have been loaded.)
                    if label not in loaded: 
                         loaded[label] = 0

                    #Start actual loading process.
                    file_path = os.path.join(dir_name, file_name)

                    df =df.append({'image_id': file_path, 'label' : label},ignore_index=True)
                    # open in with structure to avoid memory leaks
                    #with Image.open(file_path) as f:
                        # copy impage into np array
                     #   image = np.array(f)

                        # append to output
                      #  images.append(image)

                        # set label to be the index of the label string in the label_filter list
                       # label_index = label_filter.index(label)
                        #labels.append(label_index)

                    loaded[label] += 1 

    #Print a summary of what's been loaded by label.
    for label in loaded: 
        print('%s: %s' % (label, loaded[label]))
    print('Total: %d' % sum(list(loaded.values())))

    return(df)

from os import walk

i = 0
DATA_DF = read_data(BASE_PATH)

Empty DataFrame
Columns: [image_id, label]
Index: []
../data/EuroSat_RGB/2750/
Processing label: AnnualCrop
Processing label: Forest
Processing label: HerbaceousVegetation
Processing label: Highway
Processing label: Industrial
Processing label: Pasture
Processing label: PermanentCrop
Processing label: Residential
Processing label: River
Processing label: SeaLake
AnnualCrop: 3000
Forest: 3000
HerbaceousVegetation: 3000
Highway: 2500
Industrial: 2500
Pasture: 2000
PermanentCrop: 2500
Residential: 3000
River: 2500
SeaLake: 3000
Total: 27000
time: 35.8 s (started: 2022-07-18 17:00:11 -07:00)


In [9]:
# for (dirpath, dirname, filename) in walk(BASE_PATH):
#   for each_file in filename:

#for each_file in os.listdir(BASE_PATH):
#    DATA_DF.loc[i] = [each_file, each_file.split('_')[0]]
#    i += 1
#print(i)

print(FULL_DATA_DF)
DATA_DF.to_csv(FULL_DATA_DF, index=False)
DATA_DF.head()

./dataset_label.csv


Unnamed: 0,image_id,label
0,../data/EuroSat_RGB/2750/AnnualCrop\AnnualCrop...,AnnualCrop
1,../data/EuroSat_RGB/2750/AnnualCrop\AnnualCrop...,AnnualCrop
2,../data/EuroSat_RGB/2750/AnnualCrop\AnnualCrop...,AnnualCrop
3,../data/EuroSat_RGB/2750/AnnualCrop\AnnualCrop...,AnnualCrop
4,../data/EuroSat_RGB/2750/AnnualCrop\AnnualCrop...,AnnualCrop


time: 62 ms (started: 2022-07-18 17:00:47 -07:00)


In [10]:
print(f"Uniques values for labels are...")
for ind, each in enumerate(DATA_DF.label.unique()):
  print(ind+1, ":", each)

Uniques values for labels are...
1 : AnnualCrop
2 : Forest
3 : HerbaceousVegetation
4 : Highway
5 : Industrial
6 : Pasture
7 : PermanentCrop
8 : Residential
9 : River
10 : SeaLake
time: 0 ns (started: 2022-07-18 17:00:47 -07:00)


## Processing data folders to train and test


In [11]:
DATA_DF = pd.read_csv(FULL_DATA_DF)
print(DATA_DF.shape)
DATA_DF = DATA_DF.sample(frac = 1, random_state=48) 
TRAIN_DF = DATA_DF[:-int(len(DATA_DF)*TEST_SIZE)]
TEST_DF = DATA_DF[-int(len(DATA_DF)*TEST_SIZE) :]

(27000, 2)
time: 46 ms (started: 2022-07-18 17:00:47 -07:00)


In [12]:
TRAIN_DF.reset_index(inplace = True) 
TRAIN_DF.head()

Unnamed: 0,index,image_id,label
0,19127,../data/EuroSat_RGB/2750/Residential\Residenti...,Residential
1,3197,../data/EuroSat_RGB/2750/Forest\Forest_1176.jpg,Forest
2,24557,../data/EuroSat_RGB/2750/SeaLake\SeaLake_150.jpg,SeaLake
3,17042,../data/EuroSat_RGB/2750/PermanentCrop\Permane...,PermanentCrop
4,3697,../data/EuroSat_RGB/2750/Forest\Forest_1626.jpg,Forest


time: 0 ns (started: 2022-07-18 17:00:47 -07:00)


In [13]:
TEST_DF.reset_index(inplace = True) 
TEST_DF.head()

Unnamed: 0,index,image_id,label
0,6839,../data/EuroSat_RGB/2750/HerbaceousVegetation\...,HerbaceousVegetation
1,14751,../data/EuroSat_RGB/2750/Pasture\Pasture_1675.jpg,Pasture
2,8651,../data/EuroSat_RGB/2750/HerbaceousVegetation\...,HerbaceousVegetation
3,17692,../data/EuroSat_RGB/2750/PermanentCrop\Permane...,PermanentCrop
4,19476,../data/EuroSat_RGB/2750/Residential\Residenti...,Residential


time: 0 ns (started: 2022-07-18 17:00:47 -07:00)


In [14]:
len(TRAIN_DF.index), len(TEST_DF.index)

(20250, 6750)

time: 0 ns (started: 2022-07-18 17:00:47 -07:00)


## Creating Dataset and Dataloaders

Now we will Create custom dataset by extending pytorch Dataset class. We also add provision for adding transformers for data augmentation.

In [None]:
class EuroSAT(Dataset):
    def __init__(self, df, dirc, transform=None):
        self.train_dir = dirc
        self.train_df = df
        self.transform = transform
        
    def __len__(self):
        return len(self.train_df)
    
    def __getitem__(self, idx):
        row = self.train_df.loc[idx]
        img_id, label = row['image_id'], row['label']
        img = Image.open(img_id)
        if self.transform:
            img = self.transform(img)
        return img, encode_label(label)

### Transformations and Datasets

In [16]:
## Dataset and transformations
train_data_transform = transforms.Compose([
                                transforms.Resize(size=(224, 224)),
#                                 transforms.RandomHorizontalFlip(p=0.5),
#                                 transforms.RandomRotation(20),
#                                 transforms.RandomVerticalFlip(p=0.5),
                                transforms.ToTensor(),
                                ])

test_data_transform = transforms.Compose([
                                transforms.Resize(size=(224, 224)),
                                transforms.ToTensor(),
                                ])

time: 0 ns (started: 2022-07-18 17:00:47 -07:00)


In [17]:
train_ds = EuroSAT(TRAIN_DF, BASE_PATH, train_data_transform)
test_ds = EuroSAT(TEST_DF, BASE_PATH, test_data_transform)
print(len(train_ds), len(test_ds))

20250 6750
time: 0 ns (started: 2022-07-18 17:00:47 -07:00)


In [None]:
## Data loaders and showing batch of data
batch_size = 64
train_dl = DataLoader(train_ds, batch_size, shuffle=True, num_workers=2, pin_memory=True)
test_dl = DataLoader(test_ds, batch_size, shuffle=False, num_workers=2, pin_memory=True)
show_batch(train_dl)

## Model

In [None]:
def accuracy(outputs, labels):
    _, preds = torch.max(outputs, dim = 1)
    return torch.tensor(torch.sum(preds==labels).item() / len(preds))

class MulticlassClassifierBase(nn.Module):
    def training_step(self, batch):
        img, label = batch
        out = self(img)
        loss = criterion(out, label)
        accu = accuracy(out, label)
        return accu ,loss

    def validation_step(self, batch):
        img, label = batch
        out = self(img)
        loss = criterion(out, label)
        accu = accuracy(out, label)
        return {"val_loss": loss.detach(), "val_acc": accu}
    
    def validation_epoch_ends(self, outputs):
        batch_loss = [x['val_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_loss).mean()
        batch_acc = [x['val_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_acc).mean()
        return {"val_loss":epoch_loss.item(), "val_acc":epoch_acc.item()}
    
    def epoch_end(self, epoch, result):
        print("Epoch [{}],train_accu: {:.4f}, learning_rate: {:.4f}, train_loss: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}".format(
            epoch,result['train_accu'], result['lrs'][-1], result['train_loss'], result['val_loss'], result['val_acc']))

In [None]:
class LULC_Model(MulticlassClassifierBase):
    def __init__(self):
        super().__init__()
        self.network = models.wide_resnet50_2(pretrained=True)
        n_inputs = self.network.fc.in_features
        self.network.fc = nn.Sequential(
                              nn.Linear(n_inputs, 256),
                              nn.ReLU(),
                              nn.Dropout(0.5),
                              nn.Linear(256, NUM_CLASSES),
                              nn.LogSoftmax(dim=1)
                                )
    def forward(self, xb):
        return self.network(xb)
    
    def freeze(self):
        for param in self.network.parameters():
            param.require_grad=False
        for param in self.network.fc.parameters():
            param.require_grad=True
    def unfreeze(self):
        for param in self.network.parameters():
            param.require_grad=True

In [None]:
model = LULC_Model()

In [None]:
## Running through the data loader => Model => Output
def try_batch(dl):
    for images, labels in dl:  
        print(images.shape)
        out = model(images)
        print(out.shape)
        print(out[0])
        break
        
try_batch(train_dl)

## Training and Evaluating

In [None]:
@torch.no_grad()
def evaluate(model, valid_loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in valid_loader]
    return model.validation_epoch_ends(outputs)

def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']
    
def fit(epochs, max_lr,  model, train_loader, valid_loader, weight_decay=0,\
                grad_clip=None,opt_func=torch.optim.SGD, max_epochs_stop=3):
  
    history = []
    valid_loss_min = np.Inf
    valid_acc_max = 0
    model_file_name = 'lulc.pth'
    model_file_name2 = 'lulc_max_acc.pth'
    epochs_no_improve =  0
    optimizer = opt_func(model.parameters(), lr=max_lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=2, factor=0.1)
                         
    for epoch in range(epochs):
        model.train()
        train_loss = []
        train_accu = []
        lrs = []
        for batch in tqdm(train_loader):
            
            accu, loss = model.training_step(batch)
            train_loss.append(loss)
            train_accu.append(accu)
            loss.backward()
            ## Gradient Clipping
            if grad_clip:
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)
            
            optimizer.step()
            optimizer.zero_grad()
            
            lrs.append(get_lr(optimizer))
            
            
            
        result = evaluate(model, valid_loader)
        scheduler.step(result['val_loss'])
        ########### Early Stopping ##############                                         
        valid_loss = result['val_loss']
        valid_acc = result['val_acc']
        if valid_acc > valid_acc_max:
            torch.save(model.state_dict(), model_file_name2)
            valid_acc_max = valid_acc
        if valid_loss<valid_loss_min:
            torch.save(model.state_dict(), model_file_name)
            valid_loss_min = valid_loss                                  
            epochs_no_improve = 0          
        else:
            epochs_no_improve += 1
            if epochs_no_improve > max_epochs_stop:
                result["train_loss"] = torch.stack(train_loss).mean().item()
                result["train_accu"] = torch.stack(train_accu).mean().item()
                result["lrs"] = lrs
                model.epoch_end(epoch, result)
                history.append(result)
                print("Early Stopping............................")
                return history                                
                                                 
        result["train_loss"] = torch.stack(train_loss).mean().item()
        result["train_accu"] = torch.stack(train_accu).mean().item()
        result["lrs"] = lrs
        model.epoch_end(epoch, result)
        history.append(result)
    print("VAL LOSS MIN {}".format(valid_loss_min))
    print("VAL ACC MAX {}".format(valid_acc_max))
    return history

## Initializing Device also Loading Data and Model to device¶


In [None]:
def get_device():
    print(torch.cuda.is_available())
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
        
def to_device(data, device):
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        for b in self.dl:
            yield to_device(b, self.device)
            
    def __len__(self):
        return len(self.dl)

In [None]:
device = get_device()
## Loading data to devide
train_dl = DeviceDataLoader(train_dl, device)
test_dl = DeviceDataLoader(test_dl, device)
## Loading model to device
model = to_device(LULC_Model(), device)
## lets try passing a batch to model again
try_batch(train_dl)

In [None]:
torch.cuda.empty_cache()

## Training

In [None]:
## Hyper Parameters
max_epochs_stop = 5
max_lr = 1e-4
grad_clip = 0.1
weight_decay = 0
batch_size = 64
criterion = nn.CrossEntropyLoss()
epochs = 25
opt_func = torch.optim.Adam
## Evaluating with non-trained model
evaluate(model, test_dl)

In [None]:
## Freezing except last layer
model.freeze()
## Training
history = fit(epochs, max_lr, model, train_dl, test_dl, weight_decay, grad_clip, opt_func, max_epochs_stop)

## Reports

### Training Reports

In [None]:
def plot_losses(history):
    train_losses = [x.get('train_loss') for x in history]
    val_losses = [x['val_loss'] for x in history]
    plt.plot(train_losses, '-bx')
    plt.plot(val_losses, '-rx')
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['Training', 'Validation'])
    plt.title('Loss vs. No. of epochs');
    
plot_losses(history)

In [None]:
def plot_accuracy(history):
        
    train_accu = [x.get('train_accu') for x in history]
    val_accu = [x['val_acc'] for x in history]
    plt.plot(train_accu, '-bx')
    plt.plot(val_accu, '-rx')
    plt.xlabel('epoch')
    plt.ylabel('Accuracy')
    plt.legend(['Training', 'Validation'])
    plt.title('Accuracy vs. No. of epochs');
plot_accuracy(history)

In [None]:
def plot_lrs(history):
    lrs = np.concatenate([x.get('lrs', []) for x in history])
    plt.plot(lrs)
    plt.xlabel('Batch no.')
    plt.ylabel('Learning rate')
    plt.title('Learning Rate vs. Batch no.');
    
plot_lrs(history)

## Evaluation on test dataset

In [None]:
## Evaluation with best validation accuracy model
model = LULC_Model()
model.load_state_dict(torch.load("./lulc_max_acc.pth"))
model.eval()
model = to_device(model, device)

In [None]:
@torch.no_grad()
def predict_dl(dl, model):
    torch.cuda.empty_cache()
    batch_pred, labels = [], []
    for xb, label in tqdm(dl):
        probs = model(xb)
        _, pred = torch.max(probs.cpu().detach(), dim=1)
        batch_pred.append(pred.cpu().detach())     
        labels.append(label)   
    return [x for x in batch_pred], \
            [x for x in labels]

In [None]:
device = get_device()
## Loading data to devide
batch_size =1
test_dl = DataLoader(test_ds, batch_size, shuffle=False, num_workers=2, pin_memory=True)
test_dl = DeviceDataLoader(test_dl, device)

In [None]:
predictions, labels = predict_dl(test_dl, model)
preds = [x.item() for x in predictions]
labs = [x.item() for x in labels]

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(labs, preds)
cm = cm / cm.astype(np.float).sum(axis=1)
cm = np.round(cm, 4) * 100
acc = accuracy_score(labs, preds)
print(f'''Accuracy of test data is: {acc*100:.2f} %''' )

In [None]:
plt.figure(figsize = (15,8))
ax = sns.heatmap(cm, annot=True, fmt='g');
ax.set_title(f'Confusion Matrix for {approach} Approach');
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual');
plt.yticks(rotation=30) 
plt.xticks(rotation=30) 
## For the Tick Labels, the labels should be in Alphabetical order
ax.xaxis.set_ticklabels(CLASSES)
ax.yaxis.set_ticklabels(CLASSES)
plt.show()

### Prediction Reports

In [None]:
### Predict Single Images
def predict_single(image):
    xb = image.unsqueeze(0)
    xb = to_device(xb, device)
    preds = model(xb)
    _, prediction = torch.max(preds.cpu().detach(), dim=1)
    return decode_target(int(prediction), text_labels=True)

def plot_samples(img_dir, rows=3, cols=3):
    plt.figure()
    f, axs = plt.subplots(rows, cols, figsize=(16, 18))
    k = 120
    for i in range(rows):
        for j in range(cols):
            predicted = predict_single(test_ds[k][0])
            actual = decode_target(test_ds[k][1], text_labels=True)
            text = f"Actual: {actual} \n Predicted: {predicted}"
            axs[i, j].title.set_text(text)
            axs[i, j].imshow(test_ds[k][0].permute(1, 2, 0))
            
            k += 1

plot_samples(BASE_PATH)

## Evaluation on overall dataset

In [None]:
device = get_device()
## Loading data to devide
batch_size =1
test_ds = EuroSAT(DATA_DF, BASE_PATH, train_data_transform)
test_dl = DataLoader(test_ds, batch_size, shuffle=False, num_workers=2, pin_memory=True)
test_dl = DeviceDataLoader(test_dl, device)

In [None]:
predictions, labels = predict_dl(test_dl, model)
preds = [x.item() for x in predictions]
labs = [x.item() for x in labels]

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(labs, preds)
cm = cm / cm.astype(np.float).sum(axis=1)
cm = np.round(cm, 4) * 100
acc = accuracy_score(labs, preds)
print(f'''Accuracy of overall data is: {acc*100:.2f} %''' )

In [None]:
plt.figure(figsize = (15,8))
ax = sns.heatmap(cm, annot=True, fmt='g');
ax.set_title(f'Confusion Matrix for {approach} Approach');
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual');
plt.yticks(rotation=30) 
plt.xticks(rotation=30) 
## For the Tick Labels, the labels should be in Alphabetical order
ax.xaxis.set_ticklabels(CLASSES)
ax.yaxis.set_ticklabels(CLASSES)
plt.show()