In [2]:
import os
import torch
import torchvision
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import ImageFolder
from torchvision.models import resnet18
from torchvision.utils import make_grid
import matplotlib.pyplot as plt
from PIL import Image

# Intel Image Dataset Classification Section

In [3]:
data_dir= '/kaggle/input/intel-image-classification'
transform=transforms.Compose([
transforms.Resize((224,224)),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485,0.456,0.406],
                    std=[0.229,0.224,0.225])
])
train_dataset=ImageFolder(root=os.path.join(data_dir,'seg_train'),transform =transform)
test_dataset=ImageFolder(root=os.path.join(data_dir,'seg_test'),transform =transform)

train_loader=DataLoader(train_dataset,batch_size=32,shuffle=True)
test_loader=DataLoader(test_dataset,batch_size=32)

In [4]:
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=resnet18(pretrained=True)
for param in model.parameters():
    param.required_grad=False

model.fc=nn.Linear(model.fc.in_features,2)
model=model.to(device)

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 204MB/s]


In [5]:
criterion =nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.fc.parameters(),lr=1e-3)

In [1]:
import torch
print("✅ CUDA available:", torch.cuda.is_available())
print("🖥️ Current device:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")


✅ CUDA available: True
🖥️ Current device: Tesla P100-PCIE-16GB


In [6]:
def train_model(model, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        correct = 0
        total = 0
        for batch_idx, (images, labels) in enumerate(train_loader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

            if (batch_idx + 1) % 10 == 0 or (batch_idx + 1) == len(train_loader):
                print(f"  Batch {batch_idx+1}/{len(train_loader)} | Loss: {loss.item():.4f}")

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {100 * correct / total:.2f}%")
train_model(model, epochs=5)

  Batch 10/439 | Loss: 0.0057
  Batch 20/439 | Loss: 0.0006
  Batch 30/439 | Loss: 0.0003
  Batch 40/439 | Loss: 0.0003
  Batch 50/439 | Loss: 0.0002
  Batch 60/439 | Loss: 0.0004
  Batch 70/439 | Loss: 0.0002
  Batch 80/439 | Loss: 0.0002
  Batch 90/439 | Loss: 0.0002
  Batch 100/439 | Loss: 0.0004
  Batch 110/439 | Loss: 0.0003
  Batch 120/439 | Loss: 0.0002
  Batch 130/439 | Loss: 0.0002
  Batch 140/439 | Loss: 0.0001
  Batch 150/439 | Loss: 0.0002
  Batch 160/439 | Loss: 0.0002
  Batch 170/439 | Loss: 0.0002
  Batch 180/439 | Loss: 0.0001
  Batch 190/439 | Loss: 0.0001
  Batch 200/439 | Loss: 0.0007
  Batch 210/439 | Loss: 0.0001
  Batch 220/439 | Loss: 0.0001
  Batch 230/439 | Loss: 0.0001
  Batch 240/439 | Loss: 0.0001
  Batch 250/439 | Loss: 0.0001
  Batch 260/439 | Loss: 0.0001
  Batch 270/439 | Loss: 0.0001
  Batch 280/439 | Loss: 0.0002
  Batch 290/439 | Loss: 0.0001
  Batch 300/439 | Loss: 0.0001
  Batch 310/439 | Loss: 0.0001
  Batch 320/439 | Loss: 0.0002
  Batch 330/439 |

In [7]:
def evaluate(model):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    print(f"Test Accuracy: {100 * correct / total:.2f}%")

evaluate(model)


Test Accuracy: 100.00%


In [9]:
model_save_path = "/kaggle/working/resnet_disaster_clean.pt"
torch.save(model.state_dict(), model_save_path)

In [10]:
import os
print(os.path.exists("/kaggle/working/resnet_disaster_clean.pt"))  # Should print True


True


# Xview2 Model

In [2]:
import os
import json
import pandas as pd

# Path to the folder containing JSON files
json_dir = "/kaggle/input/labels/labels"

data = []

# Loop through all JSON files
for filename in os.listdir(json_dir):
    if filename.endswith(".json"):
        filepath = os.path.join(json_dir, filename)

        with open(filepath, 'r') as f:
            try:
                label_data = json.load(f)
                
                # You can customize this extraction as needed
                image_id = filename.replace('_post_disaster.json', '')
                features = label_data.get("features", {}).get("xy", [])
                label_type = label_data.get("metadata", {}).get("disaster_type", "unknown")

                data.append({
                    "image_id": image_id,
                    "num_features": len(features),
                    "disaster_type": label_type
                })
            except Exception as e:
                print(f"Error processing {filename}: {e}")

# Create a DataFrame and save as CSV
df = pd.DataFrame(data)
df.to_csv("/kaggle/working/train_labels.csv", index=False)

print("✅ CSV saved to /kaggle/working/train_labels.csv")


✅ CSV saved to /kaggle/working/train_labels.csv


In [1]:
!pip install -q gdown

In [16]:
import os
import tarfile
import torch
import torch.nn as nn
from torchvision import transforms
from torch.utils.data import Dataset,DataLoader
from PIL import Image
import pandas as pd
import glob
import shutil

# setting up device
device= torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loading pretrained model
from torchvision.models import resnet18
model =resnet18(weights=None)
model.fc= nn.Linear(model.fc.in_features,2)
model.load_state_dict(torch.load("/kaggle/working/resnet_disaster_clean.pt",map_location=device))
model = model.to(device)

# Training hyperparameters
criterion =nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.parameters(),lr=1e-4)
EPOCHS=5
BATCH_SIZE=16

transform=transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor()
])

# Label mapping and loading the label CSV
import pandas as pd

# Load your JSON-to-CSV converted file
label_df = pd.read_csv("/kaggle/working/train_labels.csv")

# Convert to proper format for classification
# image_id becomes image_name with "_post_disaster" suffix
label_df["image_name"] = label_df["image_id"] + "_post_disaster"

# Define label based on `num_features` as a proxy for damage level
# This is arbitrary and can be changed if needed
def map_damage(num_features):
    if num_features == 0:
        return 0  # no-damage
    elif num_features <= 2:
        return 1  # minor
    elif num_features <= 5:
        return 2  # major
    else:
        return 3  # destroyed

label_df["label"] = label_df["num_features"].apply(map_damage)

# Final format required by dataset class
label_df = label_df[["image_name", "label"]]
label_dict = dict(zip(label_df.image_name, label_df.label))


In [18]:
# Preparing a Custom dataset for 50 pairs
class XView2Dataset(Dataset):
    def __init__(self, image_paths, transform=None):
        self.image_paths = image_paths
        self.transform = transform
        self.labels = [label_dict.get(os.path.basename(p).replace(".tif", ""), 0) for p in image_paths]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = Image.open(self.image_paths[idx]).convert("RGB")
        label = self.labels[idx]
        if self.transform:
            image = self.transform(image)
        return image, label



In [19]:
# Training Function
def train_model(model, dataloader, optimizer, criterion, epochs=1):
    model.train()
    for epoch in range(epochs):
        total_loss, correct, total = 0, 0, 0
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

        print(f"Epoch {epoch+1}/{epochs} - Loss: {total_loss:.4f} - Acc: {100 * correct / total:.2f}%")


In [24]:
# Making a loop to process .tgz files parts in chunks
import tarfile
import io
from PIL import Image

def process_tgz_in_chunks(tgz_path, extract_dir, chunk_size=50):
    with tarfile.open(tgz_path, "r:gz") as tar:
        # Filter for post-disaster images
        post_members = [m for m in tar.getmembers() if "_post_disaster.tif" in m.name and m.isfile()]
        print(f"Total post-disaster images in archive: {len(post_members)}")

        for i in range(0, len(post_members), chunk_size):
            chunk_members = post_members[i:i+chunk_size]
            chunk_paths = []

            print(f"\nExtracting chunk {i}-{i+len(chunk_members)-1}...")

            for member in chunk_members:
                member_path = os.path.join(extract_dir, os.path.basename(member.name))
                chunk_paths.append(member_path)

                # Extract image to memory then save to disk
                f = tar.extractfile(member)
                if f is not None:
                    image = Image.open(io.BytesIO(f.read()))
                    image.save(member_path)

            # Train on this chunk
            dataset = XView2Dataset(chunk_paths, transform=transform)
            dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

            print(f"Training on {len(chunk_paths)} images...")
            train_model(model, dataloader, optimizer, criterion, epochs=EPOCHS)

            # Delete extracted images to free space
            for path in chunk_paths:
                if os.path.exists(path):
                    os.remove(path)

    print("✅ Finished processing archive chunk-by-chunk.")



In [25]:
file_ids = {
    "aa": "12t_PXcgNjAuxHDPi_pVtDT4YUQe3Piy0",
    "ab": "1_N24_6Gj2zBfxTOPaOYUJz6zcucK4kzo",
    "ac": "1-xluFLB5YgbsZw8W4JDtNXNDukInWn5m",
    "ad": "1VKV9g3U8pS6dLByi3dKTj-Orc388v3mI",
    "ae": "1ylemTUM1_r6GM11sYXiS7ivawtLtdgxo",
    "af": "1QZlwXVKw8BUAnT1yL2W1Lu-Wynl3n0uQ",
}
#https://drive.google.com/file/d/12t_PXcgNjAuxHDPi_pVtDT4YUQe3Piy0/view?usp=drive_link
#https://drive.google.com/file/d/1_N24_6Gj2zBfxTOPaOYUJz6zcucK4kzo/view?usp=sharing
#https://drive.google.com/file/d/1-xluFLB5YgbsZw8W4JDtNXNDukInWn5m/view?usp=sharing
#https://drive.google.com/file/d/1VKV9g3U8pS6dLByi3dKTj-Orc388v3mI/view?usp=sharing
#https://drive.google.com/file/d/1ylemTUM1_r6GM11sYXiS7ivawtLtdgxo/view?usp=sharing
#https://drive.google.com/file/d/1QZlwXVKw8BUAnT1yL2W1Lu-Wynl3n0uQ/view?usp=sharing

for part, file_id in file_ids.items():
    tgz_name = f"xview2_geotiff_part_{part}.tgz"
    extract_path = "/kaggle/working/tmp_extract"

    if os.path.exists(extract_path):
        shutil.rmtree(extract_path)
    os.makedirs(extract_path, exist_ok=True)

    print(f"\n📥 Downloading part {part}...")
    os.system(f"gdown https://drive.google.com/uc?id={file_id} -O {tgz_name}")

    process_tgz_in_chunks(tgz_name, extract_path, chunk_size=50)

    checkpoint_path = f"/kaggle/working/resnet_checkpoint_part_{part}.pt"
    torch.save(model.state_dict(), checkpoint_path)
    print(f"💾 Checkpoint saved: {checkpoint_path}")

# Final model save
torch.save(model.state_dict(), "/kaggle/working/resnet_disaster_final.pt")
print("✅ Final model saved.")


📥 Downloading part aa...


Downloading...
From (original): https://drive.google.com/uc?id=12t_PXcgNjAuxHDPi_pVtDT4YUQe3Piy0
From (redirected): https://drive.google.com/uc?id=12t_PXcgNjAuxHDPi_pVtDT4YUQe3Piy0&confirm=t&uuid=5a4e420e-f9f3-448c-986a-1c97c733ffd6
To: /kaggle/working/xview2_geotiff_part_aa.tgz
100%|██████████| 10.5G/10.5G [00:40<00:00, 259MB/s]


EOFError: Compressed file ended before the end-of-stream marker was reached

In [28]:
import os
corrupted_file = "/kaggle/working/xview2_geotiff_part_aa.tgz"
if os.path.exists(corrupted_file):
    os.remove(corrupted_file)
    print("🧹 Corrupted .tgz deleted. Ready to re-download.")


In [29]:
!gdown --fuzzy --id 12t_PXcgNjAuxHDPi_pVtDT4YUQe3Piy0 -O /kaggle/working/xview2_geotiff_part_aa.tgz


Failed to retrieve file url:

	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses.
	Check FAQ in https://github.com/wkentaro/gdown?tab=readme-ov-file#faq.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=12t_PXcgNjAuxHDPi_pVtDT4YUQe3Piy0

but Gdown can't. Please check connections and permissions.


In [26]:
import os
import shutil
import glob

# Paths to preserve
keep_files = {
    "/kaggle/working/train_labels.csv",
    "/kaggle/working/resnet_disaster_clean.pt"
}

# Delete all files and folders in /kaggle/working except the important ones
for item in os.listdir("/kaggle/working"):
    item_path = os.path.join("/kaggle/working", item)
    if item_path not in keep_files:
        try:
            if os.path.isfile(item_path) or os.path.islink(item_path):
                os.remove(item_path)
            elif os.path.isdir(item_path):
                shutil.rmtree(item_path)
            print(f"🧹 Deleted: {item_path}")
        except Exception as e:
            print(f"❌ Could not delete {item_path}: {e}")

print("✅ Cleanup complete. Only train_labels.csv and resnet_disaster_clean.pt are kept.")


🧹 Deleted: /kaggle/working/tmp_extract
🧹 Deleted: /kaggle/working/xview2_geotiff_part_aa.tgz
✅ Cleanup complete. Only train_labels.csv and resnet_disaster_clean.pt are kept.


In [1]:
import shutil

# Adjust the input path to match your dataset's mount path
shutil.copy("/kaggle/input/resnet-disaster-clean/resnet_disaster_clean.pt", "/kaggle/working/")


'/kaggle/working/resnet_disaster_clean.pt'

In [3]:
import shutil

# Adjust the input path to match your dataset's mount path
shutil.copy("/kaggle/input/labels/labels.zip", "/kaggle/working/")


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/labels/labels.zip'

In [None]:
file_ids = {
    "aa": "12t_PXcgNjAuxHDPi_pVtDT4YUQe3Piy0",
    "ab": "1_N24_6Gj2zBfxTOPaOYUJz6zcucK4kzo",
    "ac": "1-xluFLB5YgbsZw8W4JDtNXNDukInWn5m",
    "ad": "1VKV9g3U8pS6dLByi3dKTj-Orc388v3mI",
    "ae": "1ylemTUM1_r6GM11sYXiS7ivawtLtdgxo",
    "af": "1QZlwXVKw8BUAnT1yL2W1Lu-Wynl3n0uQ",
}
#https://drive.google.com/file/d/12t_PXcgNjAuxHDPi_pVtDT4YUQe3Piy0/view?usp=drive_link
#https://drive.google.com/file/d/1_N24_6Gj2zBfxTOPaOYUJz6zcucK4kzo/view?usp=sharing
#https://drive.google.com/file/d/1-xluFLB5YgbsZw8W4JDtNXNDukInWn5m/view?usp=sharing
#https://drive.google.com/file/d/1VKV9g3U8pS6dLByi3dKTj-Orc388v3mI/view?usp=sharing
#https://drive.google.com/file/d/1ylemTUM1_r6GM11sYXiS7ivawtLtdgxo/view?usp=sharing
#https://drive.google.com/file/d/1QZlwXVKw8BUAnT1yL2W1Lu-Wynl3n0uQ/view?usp=sharing

import torch
import torch.nn as nn
from torchvision.models import resnet18

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model =resnet18(pretrained =False)
model.fc=nn.Linear(model.fc.in_features,2)
model.load_state_dict(torch.load("/kaggle/working/resnet_disaster_clean.pt",map_location=device))
model=model.to(device)
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.Adam(model.fc.parameters(),lr=1e-4)

from torch.utils.data import DataLoader

def train_model(model,train_loader,epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss=0
        correct=0
        total=0
        for batch_idx,(images,labels) in emumerate(train_loader):
            images,labels = images.to(device),labels.to(device)
            optimizer.zero_grad()
            outputs=model(images)
            loss=criterion(ouputs,labels)
            loss.backward()
            optimizer.step()
            total_loss +=loss.item()
            _,predicted=torch.max(outputs,1)
            correct += (predicted == lables).sum().item()
            total += labels.size(0)
    
            if(batch_idx +1)%10==0 or (batch_idx +1)==len(train_loader):
                print(f"  Batch {batch_idx+1}/{len(train_loader)} | Loss: {loss.item():.4f}")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {100 * correct / total:.2f}%")


from torchvision import datasets, transforms
transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor()
])


import os
for part,file_id in file_ids.items():
    tgz_filename= f"xview2_geotiff_part_{part}.tgz"
    extract_path= "/kaggle/working/xview2_data"

    !rm -rf {extract_path}
    os.makedirs(extract_path,exist_ok =True)
    print(f"\n Downloading {tgz_filename}..")
    !gdown --id {file_id} --output {tgz_filename}

    print(f" Extracting {tgz_filename}..")
    !tar -xvzf {tgz_filename} -C {extract_path}

    train_dataset=datasets.Imagefolder(root=extract_path,transform =transform)
    train_loader=DataLoader(train_dataset,batch_size=32,shuffle= True)

    print(f"Training on part {part}..")
    train_model(model,train_loader,epochs=5)

    checkpoint_path=f"/kaggle/working/resnet_finetuned_part_{part}.pth"
    torch.save(model.state_dict(),checkpoint_path)
    print(f"checkpoint saved : {checkpoint_path}")


torch.save(model.state_dict(),"/kaggle/working/final_resnet_disaster.pt")
print("Final model Saved")

In [9]:
import os
import tarfile
import shutil
from glob import glob
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from tqdm import tqdm
import pandas as pd

# ========== CONFIG ========== #
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
labels_csv_path = "/kaggle/working/train_labels.csv"  # Update if your file path is different
tgz_paths = [
    "/kaggle/input/xview2-geotiff-part-aa/xview2_geotiff_part_aa.tgz",
    "/kaggle/input/xview2-geotiff-part-ab1/xview2_geotiff_part_ab.tgz",
    "/kaggle/input/xview2-geotiff-part-ac/xview2_geotiff_part_ac.tgz",
    "/kaggle/input/xview2-geotiff-part-ad/xview2_geotiff_part_ad.tgz",
    "/kaggle/input/xview2-geotiff-part-ae/xview2_geotiff_part_ae.tgz",
    "/kaggle/input/xview2-geotiff-part-af/xview2_geotiff_part_af.tgz"
]
extract_dir = "/kaggle/temp"
chunk_size = 50

# ========== MODEL ========== #
class SatelliteDataset(Dataset):
    def __init__(self, image_paths, labels_df, transform=None):
        self.image_paths = image_paths
        self.labels_df = labels_df
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        path = self.image_paths[idx]
        image = Image.open(path).convert('RGB')
        image_id = os.path.basename(path).replace("_post_disaster.tif", "")
        label = self.labels_df.loc[self.labels_df["image_id"] == image_id, "label"].values[0]
        if self.transform:
            image = self.transform(image)
        return image, label

def get_model(num_classes):
    model = models.resnet18(pretrained=True)
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model.to(device)

# ========== TRAIN FUNCTION ========== #
def train_one_epoch(model, dataloader, criterion, optimizer):
    model.train()
    total_loss, correct = 0, 0
    for images, labels in tqdm(dataloader, leave=False):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()
    return total_loss / len(dataloader), correct / len(dataloader.dataset)

# ========== PROCESS TGZ ========== #
def process_tgz_in_chunks(tgz_path, extract_base, chunk_size):
    with tarfile.open(tgz_path, "r:gz") as tar:
        post_members = [m for m in tar.getmembers() if "_post_disaster.tif" in m.name and m.isfile()]
        print(f"Found {len(post_members)} post-disaster images.")
        for i in range(0, len(post_members), chunk_size):
            chunk = post_members[i:i+chunk_size]
            chunk_dir = os.path.join(extract_base, f"chunk_{i}")
            os.makedirs(chunk_dir, exist_ok=True)
            for member in chunk:
                member.name = os.path.basename(member.name)  # Avoid nested folders
                tar.extract(member, path=chunk_dir)
            yield chunk_dir
            shutil.rmtree(chunk_dir)  # Clean up

# ========== LABEL MAPPING ========== #
labels_df = pd.read_csv(labels_csv_path)
# Define your own label logic if needed
def map_features_to_label(num_features):
    if num_features == 0:
        return 0
    elif num_features <= 5:
        return 1
    elif num_features <= 15:
        return 2
    else:
        return 3
labels_df["label"] = labels_df["num_features"].apply(map_features_to_label)

# ========== TRANSFORM ========== #
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# ========== LOOP THROUGH TGZ FILES ========== #
model = get_model(num_classes=4)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

for part_idx, tgz_path in enumerate(tgz_paths):
    print(f"\n==> Processing {os.path.basename(tgz_path)}")

    for chunk_dir in process_tgz_in_chunks(tgz_path, extract_dir, chunk_size):
        image_paths = sorted(glob(os.path.join(chunk_dir, "*.tif")))
        dataset = SatelliteDataset(image_paths, labels_df, transform)
        dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

        loss, acc = train_one_epoch(model, dataloader, criterion, optimizer)
        print(f"Chunk: Loss = {loss:.4f}, Accuracy = {acc:.4f}")

    checkpoint_path = f"/kaggle/working/resnet_checkpoint_part_{part_idx+1}.pt"
    torch.save(model.state_dict(), checkpoint_path)
    print(f"✅ Saved checkpoint: {checkpoint_path}")


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 186MB/s]



==> Processing xview2_geotiff_part_aa.tgz


EOFError: Compressed file ended before the end-of-stream marker was reached

In [10]:
import tarfile
from pathlib import Path

# List of your dataset files on Kaggle
dataset_paths = [
    "/kaggle/input/xview2-geotiff-part-aa/xview2_geotiff_part_aa.tgz",
    "/kaggle/input/xview2-geotiff-part-ab/xview2_geotiff_part_ab.tgz",
    "/kaggle/input/xview2-geotiff-part-ac/xview2_geotiff_part_ac.tgz",
    "/kaggle/input/xview2-geotiff-part-ad/xview2_geotiff_part_ad.tgz",
    "/kaggle/input/xview2-geotiff-part-ae/xview2_geotiff_part_ae.tgz",
    "/kaggle/input/xview2-geotiff-part-af/xview2_geotiff_part_af.tgz"
]

for path in dataset_paths:
    print(f"🔍 Testing {path}...")
    try:
        with tarfile.open(path, "r:gz") as tar:
            tar.getmembers()  # just list contents
        print("✅ File is OK.\n")
    except Exception as e:
        print(f"❌ File is BROKEN: {e}\n")


🔍 Testing /kaggle/input/xview2-geotiff-part-aa/xview2_geotiff_part_aa.tgz...
❌ File is BROKEN: Compressed file ended before the end-of-stream marker was reached

🔍 Testing /kaggle/input/xview2-geotiff-part-ab/xview2_geotiff_part_ab.tgz...
❌ File is BROKEN: [Errno 2] No such file or directory: '/kaggle/input/xview2-geotiff-part-ab/xview2_geotiff_part_ab.tgz'

🔍 Testing /kaggle/input/xview2-geotiff-part-ac/xview2_geotiff_part_ac.tgz...
❌ File is BROKEN: not a gzip file

🔍 Testing /kaggle/input/xview2-geotiff-part-ad/xview2_geotiff_part_ad.tgz...
❌ File is BROKEN: not a gzip file

🔍 Testing /kaggle/input/xview2-geotiff-part-ae/xview2_geotiff_part_ae.tgz...
❌ File is BROKEN: not a gzip file

🔍 Testing /kaggle/input/xview2-geotiff-part-af/xview2_geotiff_part_af.tgz...
❌ File is BROKEN: [Errno 2] No such file or directory: '/kaggle/input/xview2-geotiff-part-af/xview2_geotiff_part_af.tgz'



In [4]:
import pandas as pd

# Load your CSV
labels_df = pd.read_csv("/kaggle/working/train_labels.csv")

# Display first few rows
print("📄 Preview of labels_train.csv:")
print(labels_df.head())

# Display column names
print("\n🧩 Columns in CSV:")
print(labels_df.columns.tolist())

# Check for missing values
print("\n❗ Missing values check:")
print(labels_df.isnull().sum())

# Check data types
print("\n🔍 Data types:")
print(labels_df.dtypes)

# Count rows
print(f"\n🔢 Total rows: {len(labels_df)}")


📄 Preview of labels_train.csv:
                                         image_id  num_features disaster_type
0  santa-rosa-wildfire_00000138_pre_disaster.json           258          fire
1     hurricane-harvey_00000041_pre_disaster.json             3      flooding
2                      hurricane-matthew_00000295             6          wind
3                             socal-fire_00000723             7          fire
4    hurricane-michael_00000020_pre_disaster.json             2          wind

🧩 Columns in CSV:
['image_id', 'num_features', 'disaster_type']

❗ Missing values check:
image_id         0
num_features     0
disaster_type    0
dtype: int64

🔍 Data types:
image_id         object
num_features      int64
disaster_type    object
dtype: object

🔢 Total rows: 5598


In [5]:
# Apply this immediately after reading the CSV
labels_df["image_id"] = labels_df["image_id"].apply(
    lambda x: x.replace("_pre_disaster.json", "_post_disaster.tif")
)


In [6]:
print(labels_df.head())


                                         image_id  num_features disaster_type
0  santa-rosa-wildfire_00000138_post_disaster.tif           258          fire
1     hurricane-harvey_00000041_post_disaster.tif             3      flooding
2                      hurricane-matthew_00000295             6          wind
3                             socal-fire_00000723             7          fire
4    hurricane-michael_00000020_post_disaster.tif             2          wind
