# Kunisch Features from ResNet architectures 
## Seminario de Tesis I, Primavera 2022 
### MDS Program. University of Chile.
#### Supervisors: Prof. Benjamín Bustos, Prof. Iván Sipirán
#### Author: Iván Sipirán, modified by Matías Vergara


## Imports

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import numpy as np
import matplotlib.pyplot as plt
from torchvision import datasets, models, transforms
import time
import os
import copy
import pandas as pd
import math
import random
import shutil

from torch.utils.data import Dataset
from PIL import Image

from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import numpy as np, scipy.io
import argparse
import json


ModuleNotFoundError: No module named 'torch'

## Mounting Google Drive

In [6]:
# Use the mounted Google Drive folder as project root when running on Colab;
# otherwise work relative to the current directory.
try:
    from google.colab import drive
except ImportError:
    # google.colab cannot be imported here, so fall back to the local folder.
    folder_path = './'
else:
    # Only the import is guarded: a failing mount should surface as an error
    # instead of silently switching to the local folder.
    drive.mount('/content/drive')
    folder_path = 'drive/MyDrive/TesisMV/'

In [7]:
# List the entries found in folder_path (the cell output is shown below).
os.listdir(folder_path)

['.ipynb_checkpoints',
 'Feature extraction through ResNet.ipynb',
 'Full pipeline.ipynb',
 'Labels normalization.ipynb',
 'ResNet retraining.ipynb',
 'Split and augmentation.ipynb',
 'Testing multilabel algorithms.ipynb']

## Dataset loader

In [None]:
class PatternDataset(Dataset):
    """Dataset that serves every file of a directory as an RGB image.

    Each item is a ``(file_name, image)`` pair: ``file_name`` is the entry
    name inside ``root_dir`` and ``image`` is the picture converted to RGB
    and, when a transform was given, passed through it.
    """

    def __init__(self, root_dir, transform=None, build_classification=False, name_cla='output.cla'):
        """Index the files stored directly under ``root_dir``.

        Args:
            root_dir: Directory holding the image files.
            transform: Optional callable applied to each loaded PIL image.
            build_classification: Accepted for interface compatibility; not
                used by this class.
            name_cla: Accepted for interface compatibility; not used by this
                class.
        """
        self.root_dir = root_dir
        self.transform = transform

        # Keep regular files only: sub-directories (for instance a
        # ``.ipynb_checkpoints`` folder) cannot be opened as images.
        # Sorting makes the item order deterministic.
        self.namefiles = sorted(
            entry
            for entry in os.listdir(self.root_dir)
            if os.path.isfile(os.path.join(self.root_dir, entry))
        )
        print(f'Files: {len(self.namefiles)}')

    def __len__(self):
        """Return the number of indexed files."""
        return len(self.namefiles)

    def __getitem__(self, index):
        """Return ``(file_name, image)`` for the file at position ``index``."""
        if torch.is_tensor(index):
            index = index.tolist()

        img_name = os.path.join(self.root_dir, self.namefiles[index])
        # convert() returns a new, fully loaded image, so the underlying file
        # can be closed as soon as the conversion is done.
        with Image.open(img_name) as raw_image:
            image = raw_image.convert("RGB")

        if self.transform:
            image = self.transform(image)

        return self.namefiles[index], image

## Preparation

In [None]:
random.seed(30)
pathOriginals = folder_path + "patterns/originals"
pathSintetics = folder_path + "patterns/sintetics"

# Label tables for each split; their index values are the pattern ids.
train_df = pd.read_json(folder_path + "labels/augmented_train_df.json", orient='index')
val_df = pd.read_json(folder_path + "labels/val_df.json", orient='index')
test_df = pd.read_json(folder_path + "labels/test_df.json", orient='index')
train_pts = train_df.index.values
val_pts = val_df.index.values
test_pts = test_df.index.values


def _copy_patterns(pattern_ids, dest_dir, use_synthetics=False):
    """Copy ``<id>/<id>.png`` for every id in ``pattern_ids`` into ``dest_dir``.

    Args:
        pattern_ids: Iterable of pattern ids (strings).
        dest_dir: Destination directory; created when missing.
        use_synthetics: When True, ids containing ``"_"`` are read from
            ``pathSintetics`` instead of ``pathOriginals``.
    """
    # shutil.copy only copies *into* the destination when it is an existing
    # directory; otherwise the destination is taken as the target file name.
    os.makedirs(dest_dir, exist_ok=True)
    for elem in pattern_ids:
        # Ids with a "_" suffix are looked up among the synthetic patterns.
        is_synthetic = use_synthetics and len(elem.split("_")) > 1
        source_root = pathSintetics if is_synthetic else pathOriginals
        shutil.copy(os.path.join(source_root, elem, elem + '.png'), dest_dir)


# Copy the train files (originals and synthetics).
_copy_patterns(train_pts, folder_path + "temp/train", use_synthetics=True)

# Copy the val files (always taken from the originals).
_copy_patterns(val_pts, folder_path + "temp/val")

# Copy the test files (always taken from the originals).
_copy_patterns(test_pts, folder_path + "temp/test")


## Extraction

In [None]:
def imshow(inp, title = None):
    """Display a normalized image tensor with matplotlib.

    Args:
        inp: Tensor exposing ``.numpy()``. Its axes are reordered with
            ``(1, 2, 0)`` before plotting, so it is presumably laid out as
            (channels, height, width) -- confirm against the caller.
        title: Optional title drawn above the image; omitted when ``None``.
    """
    inp = inp.numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    # Invert the per-channel normalization, then clamp to the displayable range.
    inp = std * inp + mean
    inp = np.clip(inp, 0, 1)

    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.show()

def get_vector(model, layer, dim_embedding, x):
    """Run ``model`` on ``x`` and return the squeezed output of ``layer``.

    A temporary forward hook on ``layer`` copies its output into a fresh
    tensor of shape ``dim_embedding``; the value returned by ``model`` itself
    is discarded.

    Args:
        model: Module called as ``model(x)``.
        layer: Sub-module of ``model`` whose output is captured.
        dim_embedding: Shape of the captured output once squeezed.
        x: Input passed to ``model``.

    Returns:
        A tensor of shape ``dim_embedding`` holding the layer output. It stays
        all zeros if ``layer`` is never executed during the forward pass.
    """
    my_embedding = torch.zeros(dim_embedding)

    def copy_data(m, i, o):
        # Detach so the stored embedding never carries autograd history.
        my_embedding.copy_(o.detach().squeeze())

    h = layer.register_forward_hook(copy_data)
    try:
        # Only the activations are needed, so skip building the autograd graph.
        with torch.no_grad():
            model(x)
    finally:
        # Always unregister the hook, even when the forward pass raises.
        h.remove()

    return my_embedding

# Input/output locations, all relative to folder_path.
data_path = folder_path + "temp"
model_path = folder_path + "/models/resnet18_8158.pth"
output_train = folder_path + "/features/augmented_train_df.json"
output_val = folder_path + "/features/val_df.json"
output_test = folder_path + "/features/test_df.json"

device = ('cuda' if torch.cuda.is_available() else 'cpu')

# Same mean/std as the ones imshow uses to undo the normalization.
my_transform = transforms.Compose([
    transforms.Resize(224),
    # transforms.CenterCrop(224),  # disabled
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

dataTrain = PatternDataset(root_dir=os.path.join(data_path, 'train'), transform=my_transform)
dataVal = PatternDataset(root_dir=os.path.join(data_path, 'val'), transform=my_transform)
dataTest = PatternDataset(root_dir=os.path.join(data_path, 'test'), transform=my_transform)

# Default DataLoader settings: the loops below read name[0], i.e. they expect
# one image per batch.
loaderTrain = DataLoader(dataTrain)
loaderVal = DataLoader(dataVal)
loaderTest = DataLoader(dataTest)

# ResNet-18 with its final layer replaced by a 6-output linear layer, so the
# state dict stored at model_path can be loaded into it.
model = models.resnet18(pretrained = True)
dim = model.fc.in_features
model.fc = nn.Linear(dim, 6)

model = model.to(device)

try:
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(model_path, map_location=device))
except RuntimeError as e:
    print('Ignoring "' + str(e) + '"')

# Features are read from the output of the 'avgpool' module.
layer = model._modules.get('avgpool')

model.eval()


def _extract_features(loader):
    """Return ``{file name without extension: feature list}`` for ``loader``."""
    features = {}
    for name, img in loader:
        feat = get_vector(model, layer, dim, img.to(device))
        # splitext also copes with names holding no dot or several dots.
        code, _ = os.path.splitext(name[0])
        features[code] = feat.numpy().tolist()
    return features


features_train = _extract_features(loaderTrain)
features_val = _extract_features(loaderVal)
features_test = _extract_features(loaderTest)

print(len(features_train.keys()))
print(len(features_val.keys()))
print(len(features_test.keys()))

Files: 3669
Files: 78
Files: 156


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

3669


In [None]:
# Tabulate the training features (one row per dictionary key) and write them
# to output_train as index-oriented JSON.
features_train_df = pd.DataFrame.from_dict(data=features_train, orient='index')
features_train_df.to_json(path_or_buf=output_train, orient='index')

# The validation and test exports are currently disabled:
# features_val_df = pd.DataFrame.from_dict(features_val, orient='index')
# features_test_df = pd.DataFrame.from_dict(features_test, orient='index')
# features_val_df.to_json(output_val, orient='index')
# features_test_df.to_json(output_test, orient='index')

In [None]:
# Also write the training features to "augmented_train_df.json", a path
# relative to the current working directory.
features_train_df.to_json("augmented_train_df.json", orient='index')