In [None]:
#@title Connect to google drive

# Mount Google Drive into the runtime filesystem at /content/drive.
# NOTE(review): `google.colab` is presumably only importable on a Colab
# runtime - confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title Installs
!pip install torch torchvision transformers open_clip_torch

In [None]:
#@title Material Descriptions and CSV Loading

# pandas is imported here because this cell runs before the notebook's
# "Imports" cell; without it a fresh-kernel Run All fails with NameError on `pd`.
import pandas as pd

# open_clip architecture name used by the evaluation cells below.
model_name = 'ViT-L-14' # ['RN50', 'ViT-B-32', 'ViT-L-14']

# Load test CSV (the evaluation cells read its "image" column).
csv_path = "test/roof_dataset_clip_prompts_test.csv"  # Adjust path as needed
df = pd.read_csv(csv_path)

# Class prompts: class name -> text prompt used for zero-shot classification.
# The insertion order of this dict defines the class indices used downstream.
material_descriptions = {
    "Thatch": "thatch roof",
    "GreenVegetative": "roof with vegetation on it",
    "StoneSlates": "dark stone slate roof",
    "ClayTiles": "clay tile / ceramic tile roof",
    "AsphaltTiles": "asphalt shingle pitched roof",
    "ConcreteTiles": "tiled concrete / tiled cement roof",
    "WoodTiles": "wood shingle roof",
    "MetalSheetMaterials": "corrugated or tiled metal roof (silver / dark / painted)",
    "PolycarbonateSheetMaterials": "polycarbonate roof",
    "GlassSheetMaterials": "glass roof (clear or mirrored)",
    "AmorphousConcrete": "flat concrete / cement roof",
    "AmorphousAsphalt": "asphalt-coated roof (bitumen layer or rolled roofing)",
    "AmorphousMembrane": "membrane roof (bright EPDM / TPO)",
    "AmorphousFabric": "tensile fabric roof (PVC / PTFE / canvas)",
    "Unknown": "unknown material, image may be too low resolution or obstructed"
}
classes = list(material_descriptions.keys())
prompts = list(material_descriptions.values())


In [None]:
#@title Imports
import os
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
import open_clip
from torchvision import transforms
import torch
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from huggingface_hub import hf_hub_download


In [None]:
#@title Test RoofNet Fine-tuned RemoteCLIP Model
# Zero-shot evaluation of the fine-tuned checkpoint: every test image is
# assigned the class whose text prompt has the highest cosine similarity
# with the image embedding.
model, preprocess_train, preprocess = open_clip.create_model_and_transforms(model_name)
tokenizer = open_clip.get_tokenizer(model_name)

# map_location="cpu" lets a checkpoint saved from a GPU run load on a
# CPU-only runtime; the model is moved to the selected device below.
ckpt = torch.load('models/roofnetxclip_model_subset.pth', map_location="cpu")
message = model.load_state_dict(ckpt)
print(message)


# Move the model to the available device and switch to inference mode
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()
classes = list(material_descriptions.keys())
prompts = list(material_descriptions.values())
class_to_idx = {cls: i for i, cls in enumerate(classes)}

# Encode prompts once; L2-normalised so the dot product is cosine similarity
with torch.no_grad():
    text_tokens = tokenizer(prompts).to(device)
    text_features = model.encode_text(text_tokens)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Evaluate
true_labels = []
pred_labels = []
missing_images = 0  # rows whose image file is absent (skipped, but reported)
for _, row in tqdm(df.iterrows(), total=len(df)):
    img_path = os.path.join("test", row["image"])  # <-- Replace with your actual path
    # The ground-truth class is the first path component of the "image" column.
    true_class = row["image"].split("/")[0]

    if not os.path.exists(img_path):
        missing_images += 1
        continue

    # Context manager closes the file handle as soon as the pixels are decoded.
    with Image.open(img_path) as img:
        image = preprocess(img.convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        sims = image_features @ text_features.T
        pred_class = classes[sims.argmax(dim=-1).item()]

    true_labels.append(class_to_idx[true_class])
    pred_labels.append(class_to_idx[pred_class])

# === Compute Accuracy ===
correct = sum(t == p for t, p in zip(true_labels, pred_labels))
total = len(true_labels)
if missing_images:
    print(f"Skipped {missing_images} row(s) whose image file was not found.")
if total == 0:
    # Fail with an actionable message instead of a bare ZeroDivisionError.
    raise RuntimeError("No test images were found; check csv_path and the 'test' image directory.")
print(f"\n✅ CLIP Top-1 Accuracy: {correct / total * 100:.2f}% ({correct}/{total})")

# === Generate and Plot Confusion Matrix ===
cm = confusion_matrix(true_labels, pred_labels, labels=list(range(len(classes))))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
fig, ax = plt.subplots(figsize=(12, 12))
disp.plot(xticks_rotation=90, ax=ax, cmap='viridis')
plt.title("CLIP Material Classification Confusion Matrix")
plt.tight_layout()
plt.show()

In [None]:
#@title Test RoofNet Fine-tuned RemoteCLIP Model with Class-Imbalanced Training
# NOTE(review): the title says "Class-Imbalanced" but the checkpoint file name
# ends in "_balanced" - confirm which training setup this cell evaluates.
# Zero-shot evaluation: every test image is assigned the class whose text
# prompt has the highest cosine similarity with the image embedding.
model, preprocess_train, preprocess = open_clip.create_model_and_transforms(model_name)
tokenizer = open_clip.get_tokenizer(model_name)

# map_location="cpu" lets a checkpoint saved from a GPU run load on a
# CPU-only runtime; the model is moved to the selected device below.
ckpt = torch.load('models/roofnetxclip_model_subset_balanced.pth', map_location="cpu")
message = model.load_state_dict(ckpt)
print(message)

# Move the model to the available device and switch to inference mode
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()
classes = list(material_descriptions.keys())
prompts = list(material_descriptions.values())
class_to_idx = {cls: i for i, cls in enumerate(classes)}

# Encode prompts once; L2-normalised so the dot product is cosine similarity
with torch.no_grad():
    text_tokens = tokenizer(prompts).to(device)
    text_features = model.encode_text(text_tokens)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Evaluate
true_labels = []
pred_labels = []
missing_images = 0  # rows whose image file is absent (skipped, but reported)
for _, row in tqdm(df.iterrows(), total=len(df)):
    img_path = os.path.join("test", row["image"])  # <-- Replace with your actual path
    # The ground-truth class is the first path component of the "image" column.
    true_class = row["image"].split("/")[0]

    if not os.path.exists(img_path):
        missing_images += 1
        continue

    # Context manager closes the file handle as soon as the pixels are decoded.
    with Image.open(img_path) as img:
        image = preprocess(img.convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        sims = image_features @ text_features.T
        pred_class = classes[sims.argmax(dim=-1).item()]

    true_labels.append(class_to_idx[true_class])
    pred_labels.append(class_to_idx[pred_class])

# === Compute Accuracy ===
correct = sum(t == p for t, p in zip(true_labels, pred_labels))
total = len(true_labels)
if missing_images:
    print(f"Skipped {missing_images} row(s) whose image file was not found.")
if total == 0:
    # Fail with an actionable message instead of a bare ZeroDivisionError.
    raise RuntimeError("No test images were found; check csv_path and the 'test' image directory.")
print(f"\n✅ CLIP Top-1 Accuracy: {correct / total * 100:.2f}% ({correct}/{total})")

# === Generate and Plot Confusion Matrix ===
cm = confusion_matrix(true_labels, pred_labels, labels=list(range(len(classes))))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
fig, ax = plt.subplots(figsize=(12, 12))
disp.plot(xticks_rotation=90, ax=ax, cmap='viridis')
plt.title("CLIP Material Classification Confusion Matrix")
plt.tight_layout()
plt.show()

In [None]:
#@title Test CLIP OOB ViT-L/14

# Build the out-of-the-box CLIP model explicitly. Without this, the cell would
# silently reuse whichever `model`, `tokenizer` and `preprocess` the previously
# executed cell left in the kernel (i.e. a fine-tuned checkpoint), and the
# reported "OOB" numbers would not belong to the pretrained CLIP baseline.
model, preprocess_train, preprocess = open_clip.create_model_and_transforms(model_name, pretrained='openai')
tokenizer = open_clip.get_tokenizer(model_name)

# Move the model to the available device and switch to inference mode
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()
classes = list(material_descriptions.keys())
prompts = list(material_descriptions.values())
class_to_idx = {cls: i for i, cls in enumerate(classes)}

# Encode prompts once; L2-normalised so the dot product is cosine similarity
with torch.no_grad():
    text_tokens = tokenizer(prompts).to(device)
    text_features = model.encode_text(text_tokens)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Evaluate
true_labels = []
pred_labels = []
missing_images = 0  # rows whose image file is absent (skipped, but reported)
for _, row in tqdm(df.iterrows(), total=len(df)):
    img_path = os.path.join("test", row["image"])  # <-- Replace with your actual path
    # The ground-truth class is the first path component of the "image" column.
    true_class = row["image"].split("/")[0]

    if not os.path.exists(img_path):
        missing_images += 1
        continue

    # Context manager closes the file handle as soon as the pixels are decoded.
    with Image.open(img_path) as img:
        image = preprocess(img.convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        sims = image_features @ text_features.T
        pred_class = classes[sims.argmax(dim=-1).item()]

    true_labels.append(class_to_idx[true_class])
    pred_labels.append(class_to_idx[pred_class])

# === Compute Accuracy ===
correct = sum(t == p for t, p in zip(true_labels, pred_labels))
total = len(true_labels)
if missing_images:
    print(f"Skipped {missing_images} row(s) whose image file was not found.")
if total == 0:
    # Fail with an actionable message instead of a bare ZeroDivisionError.
    raise RuntimeError("No test images were found; check csv_path and the 'test' image directory.")
print(f"\n✅ CLIP Top-1 Accuracy: {correct / total * 100:.2f}% ({correct}/{total})")

# === Generate and Plot Confusion Matrix ===
cm = confusion_matrix(true_labels, pred_labels, labels=list(range(len(classes))))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
fig, ax = plt.subplots(figsize=(12, 12))
disp.plot(xticks_rotation=90, ax=ax, cmap='viridis')
plt.title("CLIP Material Classification Confusion Matrix")
plt.tight_layout()
plt.show()

In [None]:
#@title Test OOB RemoteCLIP Model

# Download (or reuse from the local cache) the RemoteCLIP weights and evaluate
# them zero-shot: every test image is assigned the class whose text prompt has
# the highest cosine similarity with the image embedding.
model_name = 'ViT-L-14'
checkpoint_path = hf_hub_download("chendelong/RemoteCLIP", f"RemoteCLIP-{model_name}.pt", cache_dir='checkpoints')
print(f'{model_name} is downloaded to {checkpoint_path}.')
model, preprocess_train, preprocess = open_clip.create_model_and_transforms(model_name)
tokenizer = open_clip.get_tokenizer(model_name)

# Load from the path returned by hf_hub_download rather than a hard-coded
# snapshot hash, so the cell keeps working when the hub revision changes.
ckpt = torch.load(checkpoint_path, map_location="cpu")
message = model.load_state_dict(ckpt)
print(message)

# Move the model to the available device and switch to inference mode
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()
classes = list(material_descriptions.keys())
prompts = list(material_descriptions.values())
class_to_idx = {cls: i for i, cls in enumerate(classes)}

# Encode prompts once; L2-normalised so the dot product is cosine similarity
with torch.no_grad():
    text_tokens = tokenizer(prompts).to(device)
    text_features = model.encode_text(text_tokens)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Evaluate
true_labels = []
pred_labels = []
missing_images = 0  # rows whose image file is absent (skipped, but reported)
for _, row in tqdm(df.iterrows(), total=len(df)):
    img_path = os.path.join("test", row["image"])  # <-- Replace with your actual path
    # The ground-truth class is the first path component of the "image" column.
    true_class = row["image"].split("/")[0]

    if not os.path.exists(img_path):
        missing_images += 1
        continue

    # Context manager closes the file handle as soon as the pixels are decoded.
    with Image.open(img_path) as img:
        image = preprocess(img.convert("RGB")).unsqueeze(0).to(device)
    with torch.no_grad():
        image_features = model.encode_image(image)
        image_features /= image_features.norm(dim=-1, keepdim=True)

        sims = image_features @ text_features.T
        pred_class = classes[sims.argmax(dim=-1).item()]

    true_labels.append(class_to_idx[true_class])
    pred_labels.append(class_to_idx[pred_class])

# === Compute Accuracy ===
correct = sum(t == p for t, p in zip(true_labels, pred_labels))
total = len(true_labels)
if missing_images:
    print(f"Skipped {missing_images} row(s) whose image file was not found.")
if total == 0:
    # Fail with an actionable message instead of a bare ZeroDivisionError.
    raise RuntimeError("No test images were found; check csv_path and the 'test' image directory.")
print(f"\n✅ CLIP Top-1 Accuracy: {correct / total * 100:.2f}% ({correct}/{total})")

# === Generate and Plot Confusion Matrix ===
cm = confusion_matrix(true_labels, pred_labels, labels=list(range(len(classes))))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classes)
fig, ax = plt.subplots(figsize=(12, 12))
disp.plot(xticks_rotation=90, ax=ax, cmap='viridis')
plt.title("CLIP Material Classification Confusion Matrix")
plt.tight_layout()
plt.show()