<a href="https://colab.research.google.com/github/lzichi/Thin-Materials-ML/blob/main/RUN_2d_Fit_ResNet_(2022Summer).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; every data and result path used
# later in this notebook lives below that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os, argparse, time, random
from functools import partial
from shutil import copyfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets, transforms
import torch.optim as optim
import torchvision.models as models
from PIL import Image

from tqdm.notebook import tqdm

In [None]:
import os

def makedirs(*dirnames):
    """Create every directory in ``dirnames``, including missing parents.

    Directories that already exist are left untouched, so the call is
    idempotent. Calling it without arguments does nothing.

    Args:
        *dirnames: Paths of the directories to create.

    Raises:
        FileExistsError: If a path exists but is not a directory.
    """
    for dirname in dirnames:
        # exist_ok=True avoids the check-then-create race of testing
        # os.path.exists() before calling os.makedirs().
        os.makedirs(dirname, exist_ok=True)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Seed Python's, NumPy's and PyTorch's (CPU and every CUDA device) random
# number generators with the same value.
for _seed_rng in (random.seed,
                  np.random.seed,
                  torch.manual_seed,
                  torch.cuda.manual_seed_all):
    _seed_rng(41)


In [None]:
def deactivate_batchnorm(model):
    """Neutralise and freeze a ``BatchNorm2d`` layer; ignore any other module.

    The signature matches what ``nn.Module.apply`` expects, so the function
    can be applied to a whole network: every ``BatchNorm2d`` it visits has its
    running statistics and affine parameters reset (mean 0, variance 1,
    weight 1, bias 0), is switched to evaluation mode and has its parameters
    excluded from gradient computation.

    Note that a later ``.train()`` call on the layer or on a parent module
    puts the layer back into training mode.

    Args:
        model: Any ``nn.Module``. Modules that are not ``nn.BatchNorm2d``
            are left untouched.
    """
    if not isinstance(model, nn.BatchNorm2d):
        return

    model.reset_parameters()
    model.eval()
    # Layers built with affine=False have no weight/bias to overwrite.
    if model.affine:
        with torch.no_grad():
            model.weight.fill_(1.0)
            model.bias.zero_()
    # Assigning `model.requires_grad = False` would only create a plain
    # attribute on the module; requires_grad_() is what actually freezes
    # the layer's parameters.
    model.requires_grad_(False)

In [None]:
def train_test_split(data, test_portion=0.75, train_subset_portion=1., seed=1,
                     expected_originals=332):
    """Split ``data`` into train and test frames, grouped by original image.

    Rows are grouped by an "original name" derived from the ``names`` column,
    so every row sharing an original name lands on the same side of the
    split. A fraction ``1 - test_portion`` of the original names is drawn for
    training; all remaining names form the test set. When
    ``train_subset_portion < 1`` only that fraction of the training names is
    kept and the rest of the training rows are discarded.

    Args:
        data: DataFrame with a ``names`` column of underscore-separated
            names.
        test_portion: Fraction of original names assigned to the test set.
        train_subset_portion: Fraction of the training names to keep.
        seed: Seed of the private random generator used for both draws.
        expected_originals: Number of distinct original names the data must
            contain; pass ``None`` to skip this sanity check.

    Returns:
        ``(train_data, test_data)``. Both have the helper column removed and
        a fresh 0..n-1 index; the previous index is kept as an ``index``
        column.

    Raises:
        AssertionError: If ``names`` is missing or the number of original
            names differs from ``expected_originals``.
    """
    assert 'names' in data.columns, '`names` column is not found in df.'

    # A private RandomState yields the same draws as seeding NumPy's global
    # generator with `seed`, without clobbering the caller's global RNG state.
    rng = np.random.RandomState(seed)

    def original_name(name):
        # Drop one leading token when the name contains 'raw' and two when
        # it contains 'aug'; what remains identifies the original image.
        skip = int('aug' in name or 'raw' in name) + int('aug' in name)
        return '_'.join(name.split('_')[skip:])

    full_data = data.copy()
    full_data['original_name'] = full_data['names'].apply(original_name)
    original_imgs = np.unique(full_data['original_name'])
    if expected_originals is not None:
        assert original_imgs.shape[0] == expected_originals, (
            f'expected {expected_originals} original images, '
            f'found {original_imgs.shape[0]}.')

    train_img = rng.choice(original_imgs,
                           int((1 - test_portion) * len(original_imgs)),
                           False)
    train_idx = full_data['original_name'].isin(train_img)
    train_data = full_data[train_idx]
    test_data = full_data[~train_idx]

    if train_subset_portion < 1:
        # Use a subset of the training images to train the model.
        train_imgs = np.unique(train_data['original_name'])
        train_subimg = rng.choice(train_imgs,
                                  int(train_subset_portion * len(train_imgs)),
                                  False)
        train_data = train_data[train_data['original_name'].isin(train_subimg)]

    # `columns=` replaces the positional axis argument that pandas 2 removed.
    train_data = train_data.drop(columns='original_name').reset_index()
    test_data = test_data.drop(columns='original_name').reset_index()

    return train_data, test_data

In [None]:
class FlakeDataset(Dataset):
    """Image dataset built from a DataFrame with ``paths`` and ``labels``.

    For every row a material name is parsed from the file name: the directory
    and extension are dropped, the part after the last ``-`` is kept, and
    anything from ``_crop`` onwards is cut off. A row counts as augmented
    when its path contains ``aug_`` and as raw otherwise.

    Args:
        df: DataFrame providing the ``paths`` and ``labels`` columns.
        raw_only: If true, augmented rows are skipped.
        transform: Callable applied to each loaded PIL image. Defaults to
            ``transforms.ToTensor()``.
        material: Optional iterable of material names. A row is kept only if
            its parsed material name is a substring of at least one entry.
            ``None`` (or an empty list) disables the filter.
    """

    def __init__(self, df, raw_only, transform=None, material=None):
        paths, labels, materials, dat_type = [], [], [], []
        # Pair paths and labels positionally so the frame does not need a
        # 0..n-1 index (label-based `df.labels[idx]` breaks on any other).
        for path, label in zip(df['paths'], df['labels']):
            file = path.split('/')[-1].split('.')[0].split('-')[-1]
            if '_crop' in file:
                file = file.split('_crop')[0]

            if material and not any(file in mat for mat in material):
                continue

            is_augmented = 'aug_' in path
            if is_augmented and raw_only:
                continue

            dat_type.append('augment' if is_augmented else 'raw')
            paths.append(path)
            labels.append(label)
            materials.append(file)

        self.path = paths
        self.labels = torch.tensor(labels).float()
        self.materials = np.array(materials)
        self.dat_type = np.array(dat_type)

        if transform is None:
            transform = transforms.Compose([transforms.ToTensor()])
        self.transform = transform

    def __len__(self):
        """Return the number of samples that passed the filters."""
        return len(self.path)

    def __getitem__(self, i):
        """Load sample ``i``.

        Returns:
            ``(image, label, material, dat_type)`` where ``image`` is the
            transformed RGB image cast to float (assumes the transform yields
            a tensor), ``label`` is a float tensor of shape ``(1,)``,
            ``material`` is the parsed material name and ``dat_type`` is
            ``'augment'`` or ``'raw'``.
        """
        if torch.is_tensor(i):
            i = i.tolist()

        img = Image.open(self.path[i]).convert("RGB")
        img = self.transform(img)
        # Index with a list so the label keeps a trailing dimension of 1.
        label = self.labels[[i]]
        material = self.materials[i]
        dat_type = self.dat_type[i]
        return img.float(), label, material, dat_type
    

In [None]:
# Hyper-parameters for dataset loading
# quantized k: 5, 10, 15, 20 (None selects the non-quantized augmented data)
k = 20
# subset portion: 0.1, 0.25, 0.5
train_subset_portion = 0.5

# Pick the pickle, the image folder and the results folder for this setting.
if k is not None:
    # Quantized augmented pickle
    _pickle_name = f'quantized{k}_pad_augment_data.pkl'
    _image_dir = f'/content/drive/Shared drives/2d/data/quantized{k}_pad_augment_data/'
    root_path = f'/content/drive/Shared drives/2d/results/to_compare/quantized{k}/'
else:
    # Augmented pickle
    _pickle_name = 'pad_augment_data_final.pkl'
    _image_dir = '/content/drive/Shared drives/2d/data/pad_augment_data_final/'
    root_path = '/content/drive/Shared drives/2d/results/to_compare/'

# read_pickle unpickles arbitrary objects: only load files from a trusted source.
data = pd.read_pickle(os.path.join('/content/drive/Shared drives/2d/data', _pickle_name))
# Keep only each file name and re-root it under the image folder on the mounted drive.
data['paths'] = data['paths'].apply(lambda p: _image_dir + p.split('/')[-1])

# Unified train-test split, and root_path preparation.
train, test = train_test_split(data, train_subset_portion=train_subset_portion, test_portion=.25)
makedirs(root_path)

trainsize = len(train)



In [None]:
# Print material status only. Unused in BNN training. 
# Resize each image to 224x224 and convert it to a tensor.
transform = transforms.Compose([transforms.Resize(size=(224, 224)),
                                transforms.ToTensor()])

# No material filter and raw_only=False: the dataset is built over every row
# of the full (unsplit) `data` frame, raw and augmented alike.
specific_material = None
dataset = FlakeDataset(data, False, transform, material=specific_material)
# Count how many samples were parsed for each material name; as the cell's
# last expression, the counts are displayed as its output.
mat = pd.Series(dataset.materials)
mat.value_counts()

MoSe2_on_si_PDMS    5456
MoSe2_on_Si         3472
MoSe2_on_PDMS       1364
dtype: int64

In [None]:
# Number of rows in the training split (after the train-subset sampling).
len(train)

3844

In [None]:
# Hyper-parameters for ResNet training
bsz = 4  # batch size used by both the train and the test DataLoader
epochs = 50  # number of passes over the training set
lr = 0.01  # learning rate passed to optim.SGD
momentum = 0.9  # momentum passed to optim.SGD

In [None]:
# Resize every image to 224x224 and convert it to a tensor.
transform = transforms.Compose([
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor()])

materials = ['MoSe2_on_Si', 'MoSe2_on_si_PDMS', 'MoSe2_on_PDMS']

trainset = FlakeDataset(train, raw_only=False,
                        transform=transform,
                        material=materials)

testset = FlakeDataset(test, raw_only=False,
                       transform=transform,
                       material=materials)

train_loader = DataLoader(trainset, batch_size=bsz, shuffle=True, pin_memory=True)
# NOTE(review): shuffling is not needed for evaluation; it is kept so the
# sequence of random draws, and hence the training order, matches earlier runs.
test_loader = DataLoader(testset, batch_size=bsz, shuffle=True, pin_memory=True)

# ResNet-18 without pretrained weights (the default), with the final layer
# replaced by a single-logit head for binary classification.
net = models.resnet18()
fc_features = net.fc.in_features
net.fc = nn.Linear(fc_features, 1)
net = net.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)


def _evaluate(model, loader, loss_fn, material_names):
    """Evaluate ``model`` on ``loader`` without tracking gradients.

    The model is switched to evaluation mode and left in that mode.

    Args:
        model: Network producing one logit per sample.
        loader: DataLoader yielding ``(inputs, labels, materials, dat_types)``.
        loss_fn: Criterion called as ``loss_fn(outputs, labels)``.
        material_names: Material names to report a separate accuracy for.

    Returns:
        ``(mean_loss, accuracy, per_material_accuracy)`` where the last item
        maps each material name to its accuracy (0 when it has no samples).
    """
    model.eval()
    loss_sum, correct, total = 0., 0, 0
    mat_correct = {name: 0 for name in material_names}
    mat_total = {name: 0 for name in material_names}
    with torch.no_grad():
        for inputs, labels, batch_materials, _ in loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)

            # A positive logit means the positive class.
            pred = (outputs > 0)
            loss_sum += loss_fn(outputs, labels).item() * labels.shape[0]
            correct += (pred == labels).sum().item()
            total += labels.shape[0]

            batch_materials = np.array(batch_materials)
            for name in material_names:
                mask = torch.as_tensor(batch_materials == name, device=labels.device)
                mat_correct[name] += (pred[mask] == labels[mask]).sum().item()
                mat_total[name] += int(mask.sum().item())

    per_material = {name: mat_correct[name] / max(mat_total[name], 1)
                    for name in material_names}
    return loss_sum / max(total, 1), correct / max(total, 1), per_material


train_accus, test_accus = [], []
for e in range(1, epochs + 1):
    train_loss = 0.
    train_correct = 0
    train_total = 0
    with tqdm(train_loader, desc=f"{e} / {epochs} epochs:") as t:
        net.train()
        last_step = len(t) - 1
        for step, (inputs, labels, material, dat_type) in enumerate(t):
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            pred = (outputs > 0)
            train_loss += loss.item() * labels.shape[0]
            train_correct += (pred == labels).sum().item()
            train_total += labels.shape[0]

            postfix = {"train_loss": train_loss / train_total,
                       "train_acc": train_correct / train_total}
            if step == last_step:
                # Validation runs inside the final training step so its
                # metrics are attached to this epoch's progress bar while
                # the bar is still being iterated.
                val_loss, val_acc, val_acc_by_material = _evaluate(
                    net, test_loader, criterion, materials)
                postfix["val_loss"] = val_loss
                postfix["val_acc"] = val_acc
                for name, acc in val_acc_by_material.items():
                    postfix[f"val_acc: {name}"] = acc
                train_accus.append(train_correct / train_total)
                test_accus.append(val_acc)
            t.set_postfix(postfix)


model_path = os.path.join(root_path, f'resnet18_sub{train_subset_portion:4.2f}.torch')
torch.save(net.state_dict(), model_path)

print(f'Finish all...trained model saved to {model_path}')

1 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

2 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

3 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

4 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

5 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

6 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

7 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

8 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

9 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

10 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

11 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

12 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

13 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

14 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

15 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

16 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

17 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

18 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

19 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

20 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

21 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

22 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

23 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

24 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

25 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

26 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

27 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

28 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

29 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

30 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

31 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

32 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

33 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

34 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

35 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

36 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

37 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

38 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

39 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

40 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

41 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

42 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

43 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

44 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

45 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

46 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

47 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

48 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

49 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

50 / 50 epochs::   0%|          | 0/961 [00:00<?, ?it/s]

Finish all...trained model saved to        /content/drive/Shared drives/2d/results/to_compare/quantized20/resnet18_sub0.50.torch


In [None]:
# NOTE: 
# 
# The code below is optional; it is not needed for preparing the statistics.

In [None]:
from sklearn.metrics import plot_confusion_matrix
from sklearn.base import BaseEstimator, ClassifierMixin, is_classifier

class plotClassifier(BaseEstimator, ClassifierMixin):
    """Pass-through "classifier": ``predict`` returns its input unchanged.

    NOTE(review): presumably exists so already-computed predictions can be
    handed to scikit-learn helpers that expect an estimator (such as the
    imported ``plot_confusion_matrix``) -- confirm against its usage.
    """

    def __init__(self):
        super().__init__()

    def transform(self, x):
        """Return ``x`` as is."""
        return x

    def predict(self, x):
        """Return ``x`` as the prediction (delegates to ``transform``)."""
        return self.transform(x)

# Shared instance of the pass-through classifier.
plotter = plotClassifier()

In [None]:
# Collect the label, prediction, material and data type of every test sample.
labels = []
pred_labels = []
materials = []
dat_types = []

# Inference only: switch to evaluation mode once and skip autograd tracking,
# which the forward passes would otherwise do for nothing.
net.eval()
with torch.no_grad():
    for inputs, label, material, dat_type in test_loader:
        outputs = net(inputs.to(device))

        # A positive logit means the positive class.
        pred = (outputs > 0)

        labels.append(label)
        # Move predictions to the host right away instead of accumulating
        # them on the compute device.
        pred_labels.append(pred.cpu())
        materials.extend(material)
        dat_types.extend(dat_type)

pred_labels = torch.cat(pred_labels).numpy()
labels = torch.cat(labels).numpy()

# Flatten the (N, 1) arrays to one value per row; reshape keeps a 1-D array
# even when there is only a single sample.
result = pd.DataFrame({'label': labels.reshape(-1),
                       'pred': pred_labels.reshape(-1),
                       'material': materials,
                       'data_type': dat_types})

In [None]:
# Accuracy per material.
print('Materials')
for mat in ('MoSe2_on_si_PDMS', 'MoSe2_on_Si', 'MoSe2_on_PDMS'):
    rows = result.loc[result['material'] == mat]
    acc = accuracy_score(rows['label'], rows['pred'])
    print(f'    {mat}: {acc}')

# Accuracy per data type (augmented vs. raw images).
print('Data type')
for dat in ('augment', 'raw'):
    rows = result.loc[result['data_type'] == dat]
    acc = accuracy_score(rows['label'], rows['pred'])
    print(f'    {dat}: {acc}')

# Accuracy for every (material, data type) combination.
print('Data type and materials')
for mat in ('MoSe2_on_si_PDMS', 'MoSe2_on_Si', 'MoSe2_on_PDMS'):
    is_material = result['material'] == mat
    for dat in ('augment', 'raw'):
        rows = result.loc[is_material & (result['data_type'] == dat)]
        acc = accuracy_score(rows['label'], rows['pred'])
        print(f'    {mat}: {dat}: {acc}')

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Per-epoch accuracy curves recorded during training.
train_accus = np.array(train_accus)
test_accus = np.array(test_accus)

# Material names used to build the figure's file name. They are spelled out
# here because `materials` is rebound to a per-sample list by the
# prediction-collection cell, and `train_material` was never defined.
train_material = ['MoSe2_on_Si', 'MoSe2_on_si_PDMS', 'MoSe2_on_PDMS']

# Size the x-axis from the recorded history so the plot also works when fewer
# than `epochs` epochs were completed.
plt.plot(np.arange(1, len(train_accus) + 1), train_accus, label='train accuracy')
plt.plot(np.arange(1, len(test_accus) + 1), test_accus, label='test accuracy')
plt.legend()

plt.savefig(os.path.join(root_path, '&'.join(train_material) + '.png'))