In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, name) for name in files]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/cs-480-2024-spring/data/sample_submission.csv
/kaggle/input/cs-480-2024-spring/data/target_name_meta.tsv
/kaggle/input/cs-480-2024-spring/data/train.csv
/kaggle/input/cs-480-2024-spring/data/test.csv


In [None]:
# define dataset and model
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as torch_models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image


# Custom Dataset
class PlantDataset(Dataset):
    """In-memory dataset pairing images with ancillary features and targets.

    Every image is opened, converted to RGB and (if a transform is given)
    transformed once, inside ``__init__``; the results are cached in
    ``self.images``. Because the transform runs exactly once per image here,
    any randomness inside it is fixed for the lifetime of the dataset.

    Parameters:
    - image_paths: sequence of image file paths, one per row of ``ancillary_data``.
    - ancillary_data: pandas DataFrame containing an "id" column plus feature columns.
    - targets: pandas DataFrame of targets, or any other value (e.g. None) when
      there are no targets.
    - transform: optional callable applied to each RGB image.
    """

    def __init__(self, image_paths, ancillary_data, targets, transform=None):
        self.transform = transform
        self.images = [self._load_image(path) for path in image_paths]

        # DataFrame targets are stored as a plain numpy array; anything else is kept as given.
        self.dfType = type(pd.DataFrame([]))
        self.targets = targets.values if type(targets) == self.dfType else targets

        # The id column is split off so that only feature columns remain in ancillary_data.
        id_column = "id"
        self.id_data = ancillary_data[id_column].values
        self.ancillary_data = ancillary_data.drop(columns=[id_column]).values
        self.npType = type(self.ancillary_data)

    def _load_image(self, path):
        """Open ``path`` as RGB and apply the transform when one is set."""
        rgb_image = Image.open(path).convert('RGB')
        return self.transform(rgb_image) if self.transform else rgb_image

    def __len__(self):
        """Number of cached images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return ``(image, image_id, ancillary_tensor, target_tensor)`` for ``idx``.

        The target tensor is empty when the stored targets are not a numpy array.
        """
        has_targets = type(self.targets) == self.npType
        target = self.targets[idx] if has_targets else []
        return (
            self.images[idx],
            self.id_data[idx],
            torch.tensor(self.ancillary_data[idx], dtype=torch.float32),
            torch.tensor(target, dtype=torch.float32),
        )
    

class PlantDatasetUnloaded(Dataset):
    """Lazy dataset: images are opened from disk on every ``__getitem__`` call.

    Only the image paths are stored at construction time, so memory use does
    not grow with the number of images.

    Parameters:
    - image_paths: sequence of image file paths, one per row of ``ancillary_data``.
    - ancillary_data: pandas DataFrame containing an "id" column plus feature columns.
    - targets: pandas DataFrame of targets, or any other value (e.g. None) when
      there are no targets.
    - transform: optional callable applied to each RGB image when it is loaded.
    """

    def __init__(self, image_paths, ancillary_data, targets, transform=None):
        self.transform = transform
        self.image_paths = image_paths

        self.ancillary_data = ancillary_data
        self.targets = targets
        # DataFrame targets are stored as a plain numpy array; anything else is kept as given.
        df = pd.DataFrame([])
        self.dfType = type(df)
        if type(self.targets) == self.dfType:
            self.targets = targets.values
        # The id column is split off so that only feature columns remain in ancillary_data.
        id_column = "id"
        self.id_data = self.ancillary_data[id_column]
        self.id_data = self.id_data.values
        self.ancillary_data = self.ancillary_data.drop(columns=[id_column])
        self.ancillary_data = self.ancillary_data.values
        self.npType = type(self.ancillary_data)

    def __len__(self):
        """Number of image paths."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, image_id, ancillary_tensor, target_tensor)`` for ``idx``.

        The target tensor is empty when the stored targets are not a numpy array.
        """
        # Index with the requested ``idx``. The previous code used an undefined
        # local ``i``, which either raised NameError or silently picked up an
        # unrelated module-level ``i`` and returned the same image for every item.
        image = Image.open(self.image_paths[idx]).convert('RGB')
        if self.transform:
            image = self.transform(image)

        ancillary = self.ancillary_data[idx]
        target = []
        if type(self.targets) == self.npType:
            target = self.targets[idx]
        image_id = self.id_data[idx]

        return image, image_id, torch.tensor(ancillary, dtype=torch.float32), torch.tensor(target, dtype=torch.float32)

        
# Model Definition
class PlantModel(nn.Module):
    """Regression network that fuses ResNet-50 image features with ancillary features.

    The ResNet-50 backbone has its final fully connected layer replaced so it
    emits ``resnet_feature_size`` values per image. Those values are
    concatenated with the ancillary vector and passed through three hidden
    layers (512 -> 256 -> 128, each followed by ReLU and dropout) and a final
    linear layer of width ``output_size``.

    Parameters:
    - output_size: number of values predicted per sample.
    - resnet_feature_size: width of the backbone's replacement output layer.
    - ancillary_feature_size: number of ancillary features per sample.
    """

    def __init__(self, output_size, resnet_feature_size, ancillary_feature_size):
        super().__init__()
        self.output_size = output_size

        # Backbone built with torchvision's default ResNet-50 weights. No
        # parameters are frozen here, so the entire backbone is trainable.
        backbone = torch_models.resnet50(weights=torch_models.ResNet50_Weights.DEFAULT)
        # Swap the final fully connected layer for one of the requested feature width.
        backbone.fc = nn.Linear(backbone.fc.in_features, resnet_feature_size)
        self.resnet = backbone

        # Fully connected head over the concatenated image + ancillary features.
        fused_size = resnet_feature_size + ancillary_feature_size
        self.fc1 = nn.Linear(fused_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, self.output_size)

        self.dropout1 = nn.Dropout(p=0.5)
        self.dropout2 = nn.Dropout(p=0.5)
        self.dropout3 = nn.Dropout(p=0.5)

        self.activation1 = nn.ReLU()
        self.activation2 = nn.ReLU()
        self.activation3 = nn.ReLU()

    def forward(self, image, ancillary):
        """Return predictions for a batch of images and their ancillary vectors.

        ``ancillary`` is concatenated to the backbone output along dim 1.
        """
        x = torch.cat((self.resnet(image), ancillary), dim=1)
        hidden_stages = (
            (self.fc1, self.activation1, self.dropout1),
            (self.fc2, self.activation2, self.dropout2),
            (self.fc3, self.activation3, self.dropout3),
        )
        for dense, activation, dropout in hidden_stages:
            x = dropout(activation(dense(x)))
        return self.fc4(x)



In [None]:
# Load the training table from the competition's train.csv.
TRAIN_CSV_PATH = "/kaggle/input/cs-480-2024-spring/data/train.csv"
train_df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
def remove_outliers_zscore(df, id_column, threshold=3, n_target_columns=6):
    """
    Remove outlier rows from a dataset using the Z-score method.

    A row is kept only when the absolute z-score of every checked column is
    below ``threshold``. The id column and the trailing target columns are
    excluded from the check.

    Parameters:
    - df: pandas DataFrame containing the dataset (numeric columns).
    - id_column: name of the identifier column; never used for outlier detection.
    - threshold: Z-score threshold for identifying outliers.
    - n_target_columns: number of trailing columns (the targets) that are
      excluded from outlier detection. Defaults to 6.

    Returns:
    - df_clean: DataFrame with outliers removed.
    """
    std = df.std()
    # A zero-variance column would produce 0/0 = NaN z-scores, and since
    # NaN < threshold is False that would discard every row. Dividing by 1
    # instead gives such columns a z-score of 0 for all non-missing values.
    safe_std = std.where(std != 0, 1.0)
    z_scores = np.abs((df - df.mean()) / safe_std)

    z_scores[id_column] = 0
    # Guard against n_target_columns == 0: iloc[:, -0:] would select every column.
    if n_target_columns > 0:
        z_scores.iloc[:, -n_target_columns:] = 0

    # Rows whose z-score is NaN (e.g. missing values) fail the comparison and are dropped.
    df_clean = df[(z_scores < threshold).all(axis=1)]
    return df_clean

In [None]:
# Drop outlier rows, then split what remains into ancillary data and targets.
# NOTE: train_df is rebound to the filtered frame, so re-running this cell
# filters the already-filtered data a second time.
N_TARGETS = 6  # the last six columns are treated as targets
train_df = remove_outliers_zscore(train_df, id_column="id")
train_ancillary_data = train_df.iloc[:, :-N_TARGETS]
train_targets = train_df.iloc[:, -N_TARGETS:]

In [None]:
# Min-max scale each target column using its own training min and max.
# The per-column min and max are kept so predictions can be unscaled later.
min_target_val, max_target_val = train_targets.min(), train_targets.max()

normalized_train_targets = train_targets.sub(min_target_val).div(
    max_target_val - min_target_val
)

In [None]:
# Min-max scale the ancillary training features. The id column is set aside
# first so it is not scaled, then re-attached as the last column.
id_column = "id"
id_data = train_ancillary_data[id_column]
# NOTE: train_ancillary_data is rebound without the id column, so this cell
# cannot be re-run on its own without re-running the previous split.
train_ancillary_data = train_ancillary_data.drop(columns=[id_column])
min_val, max_val = train_ancillary_data.min(), train_ancillary_data.max()

normalized_train_ancillary_data = (
    train_ancillary_data
    .sub(min_val)
    .div(max_val - min_val)
    .assign(**{id_column: id_data})
)

In [None]:
# Build the path of every training image from its id: <train_images_dir>/<id>.jpeg
train_images_dir = "/kaggle/input/cs-480-2024-spring/data/train_images"
test_images_dir = "/kaggle/input/cs-480-2024-spring/data/test_images"

# Left empty here; this cell only fills in the training paths.
test_image_paths = []
train_image_paths = [
    os.path.join(train_images_dir, f"{int(image_id)}.jpeg")
    for image_id in normalized_train_ancillary_data["id"]
]

In [None]:
# Initialize training dataset and dataloader
CHANNEL_MEAN = [0.485, 0.456, 0.406]
CHANNEL_STD = [0.229, 0.224, 0.225]

# Resize, random flips and rotation, then tensor conversion and per-channel normalization.
augmentation_steps = [
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(30),
]
tensor_steps = [
    transforms.ToTensor(),
    transforms.Normalize(mean=CHANNEL_MEAN, std=CHANNEL_STD),
]
transform = transforms.Compose(augmentation_steps + tensor_steps)

train_dataset = PlantDataset(
    train_image_paths,
    normalized_train_ancillary_data,
    normalized_train_targets,
    transform=transform,
)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [None]:
# Initialize the model
from sklearn.metrics import r2_score

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("device = ", device)

output_size = 6
model = PlantModel(
    output_size,
    resnet_feature_size=2048,
    ancillary_feature_size=163,
)
model = model.to(device)

In [None]:
# Train the model
import matplotlib.pyplot as plt

# Mean squared error loss, optimized with Adam over all model parameters.
LEARNING_RATE = 0.001
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# R2 Score Function
def calculate_r2_score(y_true, y_pred):
    """Return ``r2_score`` for two tensors.

    Both tensors are detached from the autograd graph, moved to the CPU and
    converted to numpy arrays before scoring.
    """
    true_np, pred_np = (t.detach().cpu().numpy() for t in (y_true, y_pred))
    return r2_score(true_np, pred_np)

num_epochs = 1
r2_epoch_scores = []
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    r2_scores = []  # one R2 value per batch of this epoch

    for idx, (images, image_ids, ancillary_data, targets) in enumerate(train_dataloader):
        images = images.to(device)
        ancillary_data = ancillary_data.to(device)
        targets = targets.to(device)

        # Standard optimization step: reset grads, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(images, ancillary_data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        r2_scores.append(calculate_r2_score(targets, outputs))

        # Every 100 batches, report the running mean loss and mean per-batch R2.
        if idx % 100 == 0:
            print(idx, running_loss / (idx + 1), np.mean(r2_scores))

    # Epoch metrics are averages over batches (the R2 is a mean of per-batch scores).
    epoch_loss = running_loss / len(train_dataloader)
    epoch_r2 = np.mean(r2_scores)
    r2_epoch_scores.append(epoch_r2)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}, R2 Score: {epoch_r2:.4f}')

plt.plot(r2_epoch_scores)
print("Training complete")

In [None]:
# Load the test table and scale it with the min/max computed on the training features.
test_ancillary_data = pd.read_csv("/kaggle/input/cs-480-2024-spring/data/test.csv")
id_column = "id"
test_id_data = test_ancillary_data[id_column]
test_ancillary_data = test_ancillary_data.drop(columns=[id_column])

# The id column is left unscaled and re-attached as the last column.
normalized_test_ancillary_data = (
    test_ancillary_data
    .sub(min_val)
    .div(max_val - min_val)
    .assign(**{id_column: test_id_data})
)

# Build the path of every test image from its id: <test_images_dir>/<id>.jpeg
test_image_paths = []
for i in range(len(normalized_test_ancillary_data)):
    image_id = int(normalized_test_ancillary_data.iloc[i].id)
    filename = f"{image_id}.jpeg"
    path = os.path.join(test_images_dir, filename)
    test_image_paths.append(path)

In [None]:
# Initialize test dataset and dataloader
# Only deterministic preprocessing is applied here: resize, tensor conversion
# and per-channel normalization (no random flips or rotations).
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
# None is passed for the targets, so each item carries an empty target tensor.
test_dataset = PlantDatasetUnloaded(
    test_image_paths, normalized_test_ancillary_data,
    None, transform=test_transform
)
# Wrap the dataset built above. The previous code referenced an undefined
# `ensemble_test_dataset`, which raises NameError on a fresh kernel.
test_dataloader = DataLoader(test_dataset, batch_size=32)

In [None]:
# Make predictions and save them in csv
predictions = []
ids = []

# Width of each target's training range, used to undo the min-max scaling.
target_span = max_target_val - min_target_val

model.eval()
with torch.no_grad():
    for batch in test_dataloader:
        images, id_batch, ancillary_data, _ = batch
        images = images.to(device)
        ancillary_data = ancillary_data.to(device)

        pred_batch = model(images, ancillary_data).cpu().numpy()
        # Map each scaled prediction row back to the original target units.
        predictions.extend(pred * target_span + min_target_val for pred in pred_batch)
        ids.extend(id_batch.numpy())

# Save predictions to CSV: one row per test id, one column per target.
col_labels = train_df.columns[-6:].values
predictions_df = pd.DataFrame(predictions, columns=col_labels)
predictions_df.insert(0, 'id', ids)
predictions_df.to_csv('predictions.csv', index=False)