<a href="https://colab.research.google.com/github/nmamatsashvili/Similarity-Seach-Notebooks/blob/master/Similarity_Search_WideResNet_batch_moded.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load required modules and data

In [None]:
import torch
# Fetch the pretrained Wide ResNet-101-2 (WRN-101-2) definition and weights
# from the torchvision v0.10.0 hub repository.
model = torch.hub.load('pytorch/vision:v0.10.0', 'wide_resnet101_2', pretrained=True)
# Put the network in evaluation mode before it is used for inference
# (as the last expression of the cell this also displays the model).
model.eval()

In [None]:
# Mount Google Drive at /content/drive; the data and result directories
# configured in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install -U sentence-transformers

# Directory indexing and helper function definitions

In [None]:
import os
import re
from PIL import Image

# [index, file name] pairs for every entry of dirMain; filled at the end of this cell.
dirIndexed = []
# Directory holding the input images.
dirMain = "/content/drive/MyDrive/Colab Notebooks/data/ResponsesNewMerge"
# Directory where the vector files and the result JSON are written.
dirResults = "/content/drive/MyDrive/Colab Notebooks/data/ResponsesNewMergeResult"
# Inclusive range of dirIndexed indexes processed by one run of the vector cell.
startIndx = 4801
endIndx = 5600
# Model label embedded in every output file name.
modelName = "WideResNet"
# Per-range vector file, e.g. NumpyVecs_WideResNet_4801-5600.npy.
npVecFileName = f"NumpyVecs_{modelName}_{startIndx}-{endIndx}.npy"
# Base name (without ".npy") of the file holding all ranges concatenated.
npVecCombined = f"NumpyVecsCombined_{modelName}"


def sorted_alphanumeric(data):
    """Return the items of ``data`` sorted in natural (human) order.

    Runs of ASCII digits are compared by numeric value and all other text is
    compared case-insensitively, so ``"img2.jpg"`` sorts before ``"img10.jpg"``.

    Args:
        data: iterable of strings (e.g. file names).

    Returns:
        A new sorted list; ``data`` itself is not modified.
    """
    def natural_key(name):
        # re.split with a capturing group yields text, digits, text, digits, ...
        # so the odd positions are exactly the [0-9]+ runs. Deciding by position
        # instead of str.isdigit() keeps non-ASCII digit characters such as "²"
        # as text: isdigit() accepts them but int() cannot parse them.
        pieces = re.split('([0-9]+)', name)
        return [int(piece) if pos % 2 else piece.lower()
                for pos, piece in enumerate(pieces)]

    return sorted(data, key=natural_key)


def GetImgIndxByName(fileName):
  """Return the index stored in ``dirIndexed`` for ``fileName``.

  The first entry whose name equals ``fileName`` wins. When no entry matches,
  0 is returned, which cannot be told apart from a match at index 0.
  """
  matchingIndexes = (entry[0] for entry in dirIndexed if entry[1] == fileName)
  return next(matchingIndexes, 0)

def GetImgNameByIndx(index):
  """Return the file name stored in ``dirIndexed`` under ``index``.

  The first entry whose index equals ``index`` wins; an empty string is
  returned when there is no such entry.
  """
  matchingNames = (entry[1] for entry in dirIndexed if entry[0] == index)
  return next(matchingNames, "")

def getImageListInRange(_startIndx, _endIndx):
  """Load the .jpg images whose directory index lies in [_startIndx, _endIndx].

  Walks ``dirIndexed`` in order, skips entries whose name does not end in
  ".jpg", and opens the remaining files from ``dirMain`` converted to RGB.
  A progress line is printed roughly five times over the requested range.

  Args:
    _startIndx: first directory index to include (inclusive).
    _endIndx: last directory index to include (inclusive).

  Returns:
    List of RGB PIL images in ``dirIndexed`` order.
  """
  # Loop-invariant progress interval. Clamped to at least 1: for a range
  # narrower than 5 the unclamped value is 0 and `count % 0` raises
  # ZeroDivisionError.
  progressStep = max(1, int((_endIndx - _startIndx) / 5))

  imgs = []
  for arr in dirIndexed:
    if not (_startIndx <= arr[0] <= _endIndx):
      continue
    if not os.fsdecode(arr[1]).endswith(".jpg"):
      continue
    # convert() returns a loaded copy, so the source file can be closed
    # immediately instead of staying open until garbage collection.
    with Image.open(f"{dirMain}/{arr[1]}") as srcImg:
      imgs.append(srcImg.convert('RGB'))
    if len(imgs) % progressStep == 0:
      print(f"{len(imgs)} image appended in list")

  return imgs


# Natural-sort the directory listing and record each entry as [position, name].
dir = sorted_alphanumeric(os.listdir(dirMain))
dirIndexed.extend([position, entryName] for position, entryName in enumerate(dir))
# Module-level counter ends at the number of indexed entries.
i = len(dir)

#CALL FROM HERE

#rs = getImageListInRange(0, 8)
#print(rs[0])
#print(rs[151])
#print(len(rs))



#print(GetImgIndxByName("5244045_2.jpg"))
#print(GetImgNameByIndx(151))

#print(dirIndexed)
#print(dirIndexed[0])
#print(dirIndexed[1])
#print(dirIndexed[2])

#print(len(dir))
#print(dir)

# Calculate vectors

In [None]:
from torchvision import transforms
import numpy
import gc
import torch

# Image -> tensor pipeline applied to every PIL image before it is fed to the model.
preprocess = transforms.Compose([
      # Resize so the shorter side is 256 px, then keep the central 224x224 patch.
      transforms.Resize(256),
      transforms.CenterCrop(224),
      # PIL image -> float tensor.
      transforms.ToTensor(),
      # Per-channel normalisation; these are the mean/std values torchvision
      # documents for its pretrained classification models.
      transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
  ])

# Run the model on the images in [startIndx, endIndx] and cache the outputs as
# a .npy file in dirResults; the work is skipped when that file already exists.
# NOTE(review): only .jpg entries produce a row, while the similarity step maps
# row k to directory index k -- confirm the image directory holds nothing else.
if not os.path.exists(f"{dirResults}/{npVecFileName}"):
  imgList = getImageListInRange(startIndx, endIndx)

  inputs = [preprocess(img) for img in imgList]
  print("Input length " + str(len(inputs)))

  if not inputs:
    # torch.stack raises a RuntimeError on an empty list; report the actual
    # cause instead and leave no vector file behind.
    print(f"No .jpg images found for indexes {startIndx}-{endIndx}; nothing saved")
  else:
    # All images of the range go through the model as a single batch.
    inputStack = torch.stack(inputs, dim=0)

    if torch.cuda.is_available():
      input_batch = inputStack.to('cuda')
      model.to('cuda')
    else:
      input_batch = inputStack

    # No autograd bookkeeping is needed for a pure forward pass.
    with torch.inference_mode():
      vecs = model(input_batch)

    embeddings = numpy.array(vecs.cpu())
    numpy.save(f'{dirResults}/{npVecFileName}', embeddings)
    print(f"Vector saved in {dirResults}/{npVecFileName}")
else:
  print(f"Vector already calculated in {dirResults}/{npVecFileName}")
#  embeddings = numpy.load(f'{dirMain}/{npVecFileName}')


# Drop every reference to the large intermediates, then run the garbage
# collector and release PyTorch's cached CUDA memory.
print("Cleaning up GPU memory and RAM...")
imgList = inputs = vecs = input_batch = inputStack = embeddings = None
gc.collect()
torch.cuda.empty_cache()
print("Memory is clean")



159 image appended in list
318 image appended in list
477 image appended in list
636 image appended in list
input length 727
Cleaning up GPU memory and RAM...
Memory is clean


# Concatenate vectors

In [None]:
import numpy
import os

# Merge the per-range vector files of this model into one array.
# The prefix includes modelName so vector files written by other models into
# the same results directory are not mixed in; the combined file itself
# ("NumpyVecsCombined_...") does not match the prefix either.
vecFilePrefix = f"NumpyVecs_{modelName}_"

# Natural order puts e.g. "..._801-1600" before "..._1601-2400", so rows are
# appended in ascending index-range order.
dirResultSorted = sorted_alphanumeric(os.listdir(dirResults))
numpyLst = [
    numpy.load(f"{dirResults}/{item}")
    for item in dirResultSorted
    if item.startswith(vecFilePrefix) and item.endswith(".npy")
]

if not numpyLst:
  # numpy.concatenate would fail with a generic message; name the real problem.
  raise ValueError(f"No '{vecFilePrefix}*.npy' files found in {dirResults}")

NpArr = numpy.concatenate(numpyLst, axis=0)
# numpy.save appends ".npy" to the name, which is what the next cell loads.
numpy.save(f'{dirResults}/{npVecCombined}', NpArr)
print("Numpy array concatenation completed")

Numpy array concatenation completed


# Calculations

In [None]:
import numpy
import time
import json
from sentence_transformers import util

# Score every pair of images that belong to different groups and dump the
# result list as JSON. Row k of the combined array is taken to be the image
# with directory index k; the group is the integer before the first "_" of
# the file name.
embeddings = numpy.load(f'{dirResults}/{npVecCombined}.npy')
print("start computing of cosine distances...")
print(f"Embedding length {len(embeddings)}")
start = time.perf_counter()

dataFile = f"{dirResults}/{modelName}ResultJson_{startIndx}-{endIndx}.txt"

# Resolve each row's file name and numeric prefix once, instead of repeating
# the linear GetImgNameByIndx scan and the int() parse for every pair.
fileNames = [GetImgNameByIndx(rowIdx) for rowIdx in range(len(embeddings))]
groupIds = [int(name.split('_')[0]) for name in fileNames]

jList = []
cnt = 0
for i in range(len(embeddings)):
    # Start after i: a row is never compared with itself.
    for j in range(i + 1, len(embeddings)):
        if groupIds[i] == groupIds[j]:
            continue
        # One cos_sim call per pair is kept on purpose so that "descr_score"
        # keeps the exact string form of the returned tensor.
        CosScore = util.cos_sim(embeddings[i], embeddings[j])
        jList.append({
            'i': i, 'j': j, 'id1': fileNames[i], 'id2': fileNames[j], "descr_score": str(CosScore), "counter": cnt
        })
        cnt += 1

# "w" creates or truncates the file in one step; the with-block closes the
# handle even if serialisation fails.
with open(dataFile, "w") as openFile:
    json.dump(jList, openFile, indent=2)
print("End computation")
end = time.perf_counter()
print("Total time: " + str(round(end - start, 2)) + " seconds")

start computing of cosine distances...
Embedding length 5528
