### Use Case: Calculating Cloud Coverage from a Sky-Cam Image | Domain: Climate Patterns

### Set up

In [None]:
!pip install git+https://github.com/openai/CLIP.git

In [None]:
# !pip install git+https://github.com/openai/CLIP.git

import os, cv2, torch, clip, timm, pickle
import numpy as np
import pandas as pd
from PIL import Image

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

from torch import nn
from tqdm.autonotebook import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertConfig, DistilBertTokenizer

### Importing Data

In [None]:
# Read the metadata CSV and keep only the image file name and the opaque
# cloud cover column, then give both columns the short names used below.
df = pd.read_csv(
    "/kaggle/input/sky-image-recent-dataset/cloud_data_cleaned1.csv"
)[['image_name', 'opaque_clouds']]
df.columns = ['image', 'cloudcover']
print("Total Records: ", df.shape[0])
df.head()

### Train Test Split

In [None]:
x, y = df['image'], df['cloudcover']

# First hold out 25% of the records as the test split ...
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=48
)
# ... then carve 30% of the remaining records off as the validation split.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.30, random_state=48
)

print(
    tuple(part.shape for part in (x_train, x_val, x_test)),
    tuple(part.shape for part in (y_train, y_val, y_test)),
)

### Finetuned CLIP Model Loading

In [None]:
class CFG:
    """Settings for the fine-tuned CLIP model used in this notebook.

    The model classes defined below read several of these values as default
    arguments (encoder names, embedding sizes, projection size, dropout,
    temperature).
    """

    # --- run / data loading ---
    debug = False
    captions_path = "."
    batch_size = 64
    num_workers = 4
    # Use the GPU when one is visible to torch, otherwise fall back to CPU.
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # --- optimisation ---
    head_lr = 1e-3
    image_encoder_lr = 1e-4
    text_encoder_lr = 1e-5
    weight_decay = 1e-3
    patience = 1
    factor = 0.8
    epochs = 12

    # --- image encoder ---
    model_name = 'resnet50'
    image_embedding = 2048
    size = 224

    # --- text encoder ---
    text_encoder_model = "distilbert-base-uncased"
    text_tokenizer = "distilbert-base-uncased"
    text_embedding = 768
    max_length = 200

    # --- shared by both encoders ---
    pretrained = True
    trainable = True
    temperature = 1.0

    # --- projection head (used for both image and text encoders) ---
    num_projection_layers = 1
    projection_dim = 256
    dropout = 0.1

In [None]:
class CLIPModel(nn.Module):
    """Dual-encoder CLIP-style model: image and text encoders plus projection heads.

    Args:
        temperature: Scalar that divides the text/image logits and multiplies
            the averaged self-similarities when building the soft targets.
        image_embedding: Size of the vectors produced by the image encoder.
        text_embedding: Size of the vectors produced by the text encoder.
    """

    def __init__(
        self,
        temperature=CFG.temperature,
        image_embedding=CFG.image_embedding,
        text_embedding=CFG.text_embedding,
    ):
        super().__init__()
        self.image_encoder = ImageEncoder()
        self.text_encoder = TextEncoder()
        self.image_projection = ProjectionHead(embedding_dim=image_embedding)
        self.text_projection = ProjectionHead(embedding_dim=text_embedding)
        self.temperature = temperature

    @staticmethod
    def _soft_cross_entropy(logits, targets):
        """Return the per-row cross entropy of ``logits`` against soft ``targets``."""
        return (-targets * torch.log_softmax(logits, dim=-1)).sum(dim=1)

    def forward(self, batch):
        """Compute the symmetric contrastive loss for one batch.

        Args:
            batch: Mapping with the keys ``"image"``, ``"input_ids"`` and
                ``"attention_mask"``.

        Returns:
            Scalar tensor: the mean over the batch of the averaged
            image-side and text-side losses.
        """
        # Getting Image and Text Features
        image_features = self.image_encoder(batch["image"])
        text_features = self.text_encoder(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        )
        # Getting Image and Text Embeddings (with same dimension)
        image_embeddings = self.image_projection(image_features)
        text_embeddings = self.text_projection(text_features)

        # Calculating the Loss
        logits = (text_embeddings @ image_embeddings.T) / self.temperature
        images_similarity = image_embeddings @ image_embeddings.T
        texts_similarity = text_embeddings @ text_embeddings.T
        # The targets are a probability distribution per row (soft labels), so
        # the loss below is a soft-target cross entropy. ``torch`` functions are
        # used directly because no ``F`` alias or ``cross_entropy`` helper is
        # defined in this notebook.
        targets = torch.softmax(
            (images_similarity + texts_similarity) / 2 * self.temperature, dim=-1
        )
        texts_loss = self._soft_cross_entropy(logits, targets)
        images_loss = self._soft_cross_entropy(logits.T, targets.T)
        loss = (images_loss + texts_loss) / 2.0  # shape: (batch_size)
        return loss.mean()
    
    
    
class ImageEncoder(nn.Module):
    """Encode images to a fixed size vector.

    The backbone comes from ``timm.create_model`` and is requested with
    ``num_classes=0`` and ``global_pool="avg"``.
    """

    def __init__(self, model_name=CFG.model_name, pretrained=CFG.pretrained, trainable=CFG.trainable):
        super().__init__()
        self.model = timm.create_model(
            model_name,
            pretrained,
            num_classes=0,
            global_pool="avg",
        )
        # Enable or disable gradients for every backbone parameter at once.
        self.model.requires_grad_(trainable)

    def forward(self, x):
        encoded = self.model(x)
        return encoded
    
    
    
class TextEncoder(nn.Module):
    """DistilBERT wrapper that returns one hidden-state vector per sequence."""

    def __init__(self, model_name=CFG.text_encoder_model, pretrained=CFG.pretrained, trainable=CFG.trainable):
        super().__init__()
        # Either load the named checkpoint or build a model from the default config.
        self.model = (
            DistilBertModel.from_pretrained(model_name)
            if pretrained
            else DistilBertModel(config=DistilBertConfig())
        )
        # Enable or disable gradients for every encoder parameter at once.
        self.model.requires_grad_(trainable)

        # We are using the CLS token hidden representation as the sentence's embedding
        self.target_token_idx = 0

    def forward(self, input_ids, attention_mask):
        hidden_states = self.model(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        return hidden_states[:, self.target_token_idx, :]
    

    
class ProjectionHead(nn.Module):
    """Project encoder outputs to ``projection_dim`` with a residual refinement.

    A linear projection is followed by a GELU -> Linear -> Dropout branch whose
    output is added back to the projection and passed through LayerNorm.
    """

    def __init__(
        self,
        embedding_dim,
        projection_dim=CFG.projection_dim,
        dropout=CFG.dropout
    ):
        super().__init__()
        # Attribute names are kept as-is: they are the keys of the saved state dict.
        self.projection = nn.Linear(embedding_dim, projection_dim)
        self.gelu = nn.GELU()
        self.fc = nn.Linear(projection_dim, projection_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(projection_dim)

    def forward(self, x):
        residual = self.projection(x)
        refined = self.dropout(self.fc(self.gelu(residual)))
        return self.layer_norm(refined + residual)

In [None]:
# Build the CLIP model on the configured device and restore the saved weights
# from best.pt (tensors are mapped onto the same device while loading).
model = CLIPModel().to(CFG.device)
model.load_state_dict(torch.load("/kaggle/input/sky-image-recent-dataset/best.pt", map_location = CFG.device))
# Evaluation mode: the model is only used for feature extraction from here on.
model.eval()

### Dataset Preparation For Regression Model


In [None]:
class SkyImage(Dataset):
    """Dataset yielding ``(image, label)`` pairs read from disk with OpenCV.

    Args:
        img_dir: Sequence of image file names, resolved against ``root``.
        labels: Sequence of targets aligned index-by-index with ``img_dir``.
        root: Directory that contains the image files. Defaults to the Kaggle
            dataset folder used by this notebook.
        size: ``(width, height)`` passed to ``cv2.resize``.
            NOTE(review): the default is 244 while ``CFG.size`` is 224 -
            confirm which value the CLIP checkpoint was trained with.

    Raises:
        FileNotFoundError: from ``__getitem__`` when an image cannot be read.
    """

    DEFAULT_ROOT = "/kaggle/input/sky-image-recent-dataset/Extracted Images/Extracted Images"

    def __init__(self, img_dir, labels, root=DEFAULT_ROOT, size=(244, 244)):
        self.img_dir = img_dir
        self.img_labels = labels
        self.root = root
        self.size = size

    def __len__(self):
        return len(self.img_dir)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root, self.img_dir[idx])
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None;
            # fail here with the offending path instead of inside cvtColor.
            raise FileNotFoundError("Could not read image: {}".format(img_path))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, self.size)
        # Move the channel axis first (H, W, C) -> (C, H, W).
        image = np.moveaxis(image, -1, 0)

        label = self.img_labels[idx]
        return image, label

In [None]:
# One SkyImage dataset per split, built from plain Python lists of
# file names and targets (train, validation, test - in that order).
train_images, valid_images, test_images = (
    SkyImage(names.to_list(), targets.to_list())
    for names, targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

### Building Features

In [None]:
def get_features(dataset):
    """Run ``dataset`` through the global CLIP ``model`` and collect its outputs.

    Args:
        dataset: Dataset yielding ``(image, label)`` pairs; it is iterated in
            batches of 64.

    Returns:
        Tuple ``(features, labels, embeddings)`` of tensors concatenated over
        all batches: image-encoder outputs, the labels, and the projected
        image embeddings. All three live on the device the model is on.
    """
    # Follow the device the model actually lives on instead of assuming CUDA,
    # so the function also works on CPU-only machines.
    device = next(model.parameters()).device

    all_features = []
    all_labels = []
    all_embeddings = []

    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size = 64)):
            # The DataLoader normally hands back an already-batched tensor;
            # only stack manually when it does not.
            if not torch.is_tensor(images):
                images = torch.as_tensor(np.stack(images))
            image_input = images.to(device).float()
            image_features = model.image_encoder(image_input)
            image_embeddings = model.image_projection(image_features)
            all_features.append(image_features)
            all_labels.append(labels)
            all_embeddings.append(image_embeddings)

    return torch.cat(all_features), torch.cat(all_labels).to(device), torch.cat(all_embeddings)

In [None]:
# Encoder features, labels and projected embeddings for the validation and test splits.
valid_features, valid_labels, valid_embeddings = get_features(valid_images)
test_features, test_labels, test_embeddings = get_features(test_images)

In [None]:
# Same extraction for the training split (kept in its own cell).
train_features, train_labels, train_embeddings = get_features(train_images)

### Data Validation

In [None]:
# One line per split: True when it has as many feature rows as labels.
print(
    *(
        len(split_features) == len(split_labels)
        for split_features, split_labels in (
            (train_features, train_labels),
            (valid_features, valid_labels),
            (test_features, test_labels),
        )
    ),
    sep="\n",
)

# Number of samples in the train, validation and test splits.
print(*map(len, (train_features, valid_features, test_features)))

### Evaluation Metrics

In [None]:
def evaluate(name, x, y, n, p): #p: features, #n: no of observations
    """Print MAE, RMSE, MSE and R2 for one data split.

    Args:
        name: Label printed in front of every metric (e.g. ``"Train"``).
        x: Ground-truth targets.
        y: Predicted values.
        n: Number of observations. Currently unused; kept so existing calls
            keep working (it was meant for an adjusted-R2 computation).
        p: Number of features. Currently unused, same reason as ``n``.
    """
    # RMSE is derived from the MSE directly: the ``squared=False`` flag of
    # ``mean_squared_error`` is deprecated and missing from newer scikit-learn.
    mse = mean_squared_error(x, y)
    rmse = float(np.sqrt(mse))

    print("---------------------------------------------------")
    print("{} MAE: {}".format(name, mean_absolute_error(x, y)))
    print("{} RMSE: {}".format(name, rmse))
    print("{} MSE: {}".format(name, mse))
    r2 = r2_score(x, y)
    print("{} R2: {}".format(name, r2))
    print("---------------------------------------------------")

### Catboost Model

In [None]:
# CatBoost regressor on the CLIP image-encoder features: 700 iterations,
# learning rate 0.1, tree depth 8, RMSE as evaluation metric, fixed seed.
CB_model = CatBoostRegressor(iterations = 700, learning_rate = 0.1, max_depth = 8, eval_metric = 'RMSE', random_seed = 48)

# Features and labels are moved to CPU and converted to NumPy before fitting;
# the validation split is supplied as eval_set with use_best_model enabled.
CB_model.fit(train_features.cpu().numpy(), train_labels.cpu().numpy(), 
             eval_set = (valid_features.cpu().numpy(), valid_labels.cpu().numpy()), 
             use_best_model = True, plot = True, verbose = 50)

In [None]:
# Predict every split with the fitted regressor (train, validation, test).
cbt_train_pred, cbt_valid_pred, cbt_test_pred = (
    CB_model.predict(split_features.cpu().numpy())
    for split_features in (train_features, valid_features, test_features)
)

In [None]:
# print(min(train_labels.cpu()), max(train_labels.cpu()))
# print(min(cbt_train_pred), max(cbt_train_pred))

# print(min(valid_labels.cpu()), max(valid_labels.cpu()))
# print(min(cbt_valid_pred), max(cbt_valid_pred))

# print(min(test_labels.cpu()), max(test_labels.cpu()))
# print(min(cbt_test_pred), max(cbt_test_pred))

In [None]:
# Print MAE / RMSE / MSE / R2 for each split. The last two arguments (n, p)
# are accepted by evaluate() but do not affect the printed metrics.
evaluate("Train", train_labels.cpu(), cbt_train_pred, len(cbt_train_pred), 1)
evaluate("Valid", valid_labels.cpu(), cbt_valid_pred, len(cbt_valid_pred), 1)
evaluate("Test", test_labels.cpu(), cbt_test_pred, len(cbt_test_pred), 1)

### Saving Model

In [None]:
# Persist the fitted regressor. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('cbr_featured_model.sav', 'wb') as model_file:
    pickle.dump(CB_model, model_file)

### Testing & Inference

In [None]:
class CFG:
    """Settings for the fine-tuned CLIP model in the inference section.

    Same values as the training-time configuration except that ``device`` is
    pinned to the CPU. The model classes defined below read several of these
    values as default arguments.
    """

    # --- run / data loading ---
    debug = False
    captions_path = "."
    batch_size = 64
    num_workers = 4
    device = "cpu"

    # --- optimisation ---
    head_lr = 1e-3
    image_encoder_lr = 1e-4
    text_encoder_lr = 1e-5
    weight_decay = 1e-3
    patience = 1
    factor = 0.8
    epochs = 12

    # --- image encoder ---
    model_name = 'resnet50'
    image_embedding = 2048
    size = 224

    # --- text encoder ---
    text_encoder_model = "distilbert-base-uncased"
    text_tokenizer = "distilbert-base-uncased"
    text_embedding = 768
    max_length = 200

    # --- shared by both encoders ---
    pretrained = True
    trainable = True
    temperature = 1.0

    # --- projection head (used for both image and text encoders) ---
    num_projection_layers = 1
    projection_dim = 256
    dropout = 0.1

In [None]:
class CLIPModel(nn.Module):
    """Dual-encoder CLIP-style model: image and text encoders plus projection heads.

    Args:
        temperature: Scalar that divides the text/image logits and multiplies
            the averaged self-similarities when building the soft targets.
        image_embedding: Size of the vectors produced by the image encoder.
        text_embedding: Size of the vectors produced by the text encoder.
    """

    def __init__(
        self,
        temperature=CFG.temperature,
        image_embedding=CFG.image_embedding,
        text_embedding=CFG.text_embedding,
    ):
        super().__init__()
        self.image_encoder = ImageEncoder()
        self.text_encoder = TextEncoder()
        self.image_projection = ProjectionHead(embedding_dim=image_embedding)
        self.text_projection = ProjectionHead(embedding_dim=text_embedding)
        self.temperature = temperature

    @staticmethod
    def _soft_cross_entropy(logits, targets):
        """Return the per-row cross entropy of ``logits`` against soft ``targets``."""
        return (-targets * torch.log_softmax(logits, dim=-1)).sum(dim=1)

    def forward(self, batch):
        """Compute the symmetric contrastive loss for one batch.

        Args:
            batch: Mapping with the keys ``"image"``, ``"input_ids"`` and
                ``"attention_mask"``.

        Returns:
            Scalar tensor: the mean over the batch of the averaged
            image-side and text-side losses.
        """
        # Getting Image and Text Features
        image_features = self.image_encoder(batch["image"])
        text_features = self.text_encoder(
            input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]
        )
        # Getting Image and Text Embeddings (with same dimension)
        image_embeddings = self.image_projection(image_features)
        text_embeddings = self.text_projection(text_features)

        # Calculating the Loss
        logits = (text_embeddings @ image_embeddings.T) / self.temperature
        images_similarity = image_embeddings @ image_embeddings.T
        texts_similarity = text_embeddings @ text_embeddings.T
        # The targets are a probability distribution per row (soft labels), so
        # the loss below is a soft-target cross entropy. ``torch`` functions are
        # used directly because no ``F`` alias or ``cross_entropy`` helper is
        # defined in this notebook.
        targets = torch.softmax(
            (images_similarity + texts_similarity) / 2 * self.temperature, dim=-1
        )
        texts_loss = self._soft_cross_entropy(logits, targets)
        images_loss = self._soft_cross_entropy(logits.T, targets.T)
        loss = (images_loss + texts_loss) / 2.0  # shape: (batch_size)
        return loss.mean()

In [None]:
class ImageEncoder(nn.Module):
    """Encode images to a fixed size vector.

    The backbone comes from ``timm.create_model`` and is requested with
    ``num_classes=0`` and ``global_pool="avg"``.
    """

    def __init__(self, model_name=CFG.model_name, pretrained=CFG.pretrained, trainable=CFG.trainable):
        super().__init__()
        self.model = timm.create_model(
            model_name,
            pretrained,
            num_classes=0,
            global_pool="avg",
        )
        # Enable or disable gradients for every backbone parameter at once.
        self.model.requires_grad_(trainable)

    def forward(self, x):
        encoded = self.model(x)
        return encoded
    
    
    
class TextEncoder(nn.Module):
    """DistilBERT wrapper that returns one hidden-state vector per sequence."""

    def __init__(self, model_name=CFG.text_encoder_model, pretrained=CFG.pretrained, trainable=CFG.trainable):
        super().__init__()
        # Either load the named checkpoint or build a model from the default config.
        self.model = (
            DistilBertModel.from_pretrained(model_name)
            if pretrained
            else DistilBertModel(config=DistilBertConfig())
        )
        # Enable or disable gradients for every encoder parameter at once.
        self.model.requires_grad_(trainable)

        # We are using the CLS token hidden representation as the sentence's embedding
        self.target_token_idx = 0

    def forward(self, input_ids, attention_mask):
        hidden_states = self.model(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        return hidden_states[:, self.target_token_idx, :]
    

    
class ProjectionHead(nn.Module):
    """Project encoder outputs to ``projection_dim`` with a residual refinement.

    A linear projection is followed by a GELU -> Linear -> Dropout branch whose
    output is added back to the projection and passed through LayerNorm.
    """

    def __init__(
        self,
        embedding_dim,
        projection_dim=CFG.projection_dim,
        dropout=CFG.dropout
    ):
        super().__init__()
        # Attribute names are kept as-is: they are the keys of the saved state dict.
        self.projection = nn.Linear(embedding_dim, projection_dim)
        self.gelu = nn.GELU()
        self.fc = nn.Linear(projection_dim, projection_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(projection_dim)

    def forward(self, x):
        residual = self.projection(x)
        refined = self.dropout(self.fc(self.gelu(residual)))
        return self.layer_norm(refined + residual)

In [None]:
# Custom Dataset (wrapped in a DataLoader by get_features)
class SkyImage(Dataset):
    """Dataset yielding ``(image, label)`` pairs read from disk with OpenCV.

    Args:
        img: Sequence of image file names or paths. Each entry is joined onto
            ``root`` with ``os.path.join``, so an absolute path is used as-is.
        label: Sequence of targets aligned index-by-index with ``img``.
        root: Directory that contains the image files. Defaults to the Kaggle
            dataset folder used by this notebook.
        size: ``(width, height)`` passed to ``cv2.resize``.
            NOTE(review): the default is 244 while ``CFG.size`` is 224 -
            confirm which value the CLIP checkpoint was trained with.

    Raises:
        FileNotFoundError: from ``__getitem__`` when an image cannot be read.
    """

    DEFAULT_ROOT = "/kaggle/input/sky-image-recent-dataset/Extracted Images/Extracted Images"

    def __init__(self, img, label, root=DEFAULT_ROOT, size=(244, 244)):
        self.img = img
        self.img_label = label
        self.root = root
        self.size = size

    def __len__(self):
        return len(self.img)

    def __getitem__(self, idx):
        img_path = os.path.join(self.root, self.img[idx])
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread reports a missing/unreadable file by returning None;
            # fail here with the offending path instead of inside cvtColor.
            raise FileNotFoundError("Could not read image: {}".format(img_path))
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, self.size)
        # Move the channel axis first (H, W, C) -> (C, H, W).
        image = np.moveaxis(image, -1, 0)
        label = self.img_label[idx]
        return image, label


# Generate Features using Custom Pretrained Clip
def get_features(clip_model, dataset):
    """Run ``dataset`` through ``clip_model`` and collect its image outputs.

    Args:
        clip_model: Model exposing ``image_encoder`` and ``image_projection``.
        dataset: Dataset yielding ``(image, label)`` pairs; it is iterated in
            batches of 64.

    Returns:
        Tuple ``(features, labels, embeddings)`` of tensors concatenated over
        all batches. Labels are returned on the CPU; features and embeddings
        stay on the device the model is on.
    """
    # Feed the inputs to whatever device the model lives on; a hard-coded CPU
    # input would fail as soon as the model is moved to a GPU.
    device = next(clip_model.parameters()).device

    features, label, embeddings = [], [], []
    with torch.no_grad():
        for images, labels in tqdm(DataLoader(dataset, batch_size = 64)):
            # The DataLoader normally hands back an already-batched tensor;
            # only stack manually when it does not.
            if not torch.is_tensor(images):
                images = torch.as_tensor(np.stack(images))
            image_input = images.to(device).float()
            image_features = clip_model.image_encoder(image_input)
            image_embeddings = clip_model.image_projection(image_features)
            features.append(image_features)
            label.append(labels)
            embeddings.append(image_embeddings)
    return torch.cat(features), torch.cat(label).cpu(), torch.cat(embeddings)

In [None]:
# Loading Clip Model & CatBoost Model

# NOTE: unpickling executes arbitrary code - only load files written by this
# notebook. The context manager closes the file handle after loading.
with open("/kaggle/working/cbr_featured_model.sav", 'rb') as model_file:
    CTBR_model = pickle.load(model_file)
clip_model = CLIPModel().to(CFG.device)
clip_model.load_state_dict(torch.load("/kaggle/input/sky-image-recent-dataset/best.pt", map_location = CFG.device))
clip_model.eval()

In [None]:
# Validation Dataset Predictions

# Convert the series once instead of rebuilding the lists on every iteration.
val_names, val_targets = x_val.to_list(), y_val.to_list()

# Show at most 250 samples; never index past the end of a smaller split.
for i in range(min(250, len(val_names))):
    # SkyImage joins the file name onto the image directory itself.
    img_path = [val_names[i]]
    lbl = [val_targets[i]]

    valid_image = SkyImage(img_path, lbl)
    valid_features, valid_labels, valid_embeddings = get_features(clip_model, valid_image)

    # Prediction on a sample image, clamped to the valid 0-100 range.
    # np.clip keeps the array shape, so the [0] indexing below stays valid
    # (replacing the array with a bare float made it fail).
    yp_cbt = np.clip(CTBR_model.predict(valid_features.cpu().numpy()), 0.0, 100.0)

    # Prediction vs Actual
    print("CatBoost: Actual Cloud Coverage: {} | Predicted Cloud Coverage:{}".format(round(lbl[0], 2), round(yp_cbt[0], 2)))

In [None]:
# Test Dataset Predictions

# Convert the series once instead of rebuilding the lists on every iteration.
test_names, test_targets = x_test.to_list(), y_test.to_list()

# Show at most 250 samples; never index past the end of a smaller split.
for i in range(min(250, len(test_names))):
    # SkyImage joins the file name onto the image directory itself.
    img_path = [test_names[i]]
    lbl = [test_targets[i]]

    # Processing Image and generating features using Clip
    test_image = SkyImage(img_path, lbl)
    test_features, test_labels, test_embeddings = get_features(clip_model, test_image)

    # Prediction on a sample image, clamped to the valid 0-100 range.
    # np.clip keeps the array shape, so the [0] indexing below stays valid
    # (replacing the array with a bare float made it fail).
    yp_cbt = np.clip(CTBR_model.predict(test_features.cpu().numpy()), 0.0, 100.0)

    # Prediction vs Actual
    print("CatBoost: Actual Cloud Coverage: {} | Predicted Cloud Coverage:{}".format(round(lbl[0], 2), round(yp_cbt[0], 2)))