In [2]:
import torch
import torchvision
from torch import nn 
from torchvision import transforms
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import Image 
import numpy as np
from scipy.io import loadmat
from scipy.io import savemat 
import sys
sys.path.append('../')
import dataset_utils

In [3]:
def get_imgs_bold_id(image_dataset,df):
  img2dna = dict()
  not_found_images = []
  for i, row in df.iterrows():
        url = row['image_urls'].split('|')[0]
        genus_name = row['genus_name'].replace(' ','_')
        image_name_csv ='image_dataset/' + genus_name + '/' + url[url.rfind('/')+1:]
        trovato = False
        for img in image_dataset.imgs:
            if img[0] == image_name_csv:
                img2dna[img[0]]= row['processid']
                trovato = True
                break
        if not trovato:
            not_found_images.append(image_name_csv)
  return img2dna

In [13]:
import pandas as pd
import requests
import threading
import time

df = pd.read_csv('unknown_species_new_samples.csv')
df['nucleotide'] = None  
missing_bolds = []
batch_size = 500
start_time = time.time()

def fetch_data(batch_ids, index):
    url = f"http://v3.boldsystems.org/index.php/API_Public/sequence?ids={'|'.join(batch_ids)}&format=tsv"
    try:
        response = requests.get(url)
        response.raise_for_status()
        body = response.text

        lines = body.split('\r\n')
        for i in range(0, len(lines) - 1, 2): 
            info_line = lines[i]
            sequence_line = lines[i + 1]

            if info_line and sequence_line:
                bold_id = info_line.lstrip('>').split('|')[0]
                nucleotide = sequence_line.strip() 

                df.loc[df['processid'] == bold_id, 'nucleotide'] = nucleotide
    except Exception as e:
        print(f"Error fetching data for batch {index}: {e}")
        missing_bolds.extend(batch_ids)

    if index % 100 == 0:
        elapsed_time = time.time() - start_time
        print(f"Processed {index} batches, elapsed time: {elapsed_time:.2f} seconds")
    time.sleep(1)

threads = []
for i in range(0, len(df), batch_size):
    batch_ids = df['processid'][i:i + batch_size].tolist()
    thread = threading.Thread(target=fetch_data, args=(batch_ids, i // batch_size))
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()

df.to_csv('updated_species_samples.csv', index=False)

with open("missing_bolds.txt", "w") as f:
    for bold_id in missing_bolds:
        f.write(bold_id + "\n")

total_time = time.time() - start_time
print(f"Processing completed in {total_time:.2f} seconds")
    





Processed 0 batches, elapsed time: 11.32 seconds
Processing completed in 125.69 seconds


In [27]:
df = df.dropna(subset = ['nucleotide'])

In [54]:
tform = transforms.Compose([transforms.Resize((64,64)),transforms.PILToTensor(),transforms.ConvertImageDtype(torch.float),transforms.Normalize(0.5,0.5)])
image_dataset = torchvision.datasets.ImageFolder("image_dataset/",transform=tform)

batch_size = 1000 

import random
import pickle

with open('genusname2genuslabel.pickle', 'rb') as handle:
    genusname2genuslabel = pickle.load(handle)

for i, (imgpath, specieslabel) in enumerate(image_dataset.imgs):
    imgpath = imgpath.replace("image_dataset/","")
    #print(imgpath)
    image_dataset.imgs[i] = (image_dataset.imgs[i][0],genusname2genuslabel[imgpath[:imgpath.index("/")]])

def new_unseen_get_imgs_bold_id(image_dataset,df):
  img2dna = dict()
  not_found_images = []
  for i, row in df.iterrows():
      url = row['image_urls'].split('|')[0]
      genus_name = row['genus_name']
      image_name_csv ='image_dataset/'+genus_name+'/'+url[url.rfind('/')+1:]
      
      trovato = False
      for img in image_dataset.imgs:
          if img[0] == image_name_csv:
              img2dna[img[0]]= row['processid']
              trovato = True
              break
      if not trovato:
          not_found_images.append(image_name_csv)
  return img2dna
img2dna = new_unseen_get_imgs_bold_id(image_dataset,df)

nucleotides = df[['nucleotide','species_name','genus_name','processid','image_urls']]

colonna_dna = df.loc[:,"nucleotide"]
#nucleotides.loc[:,'nucleotide'] = colonna_dna.apply(dataset_utils.one_hot_encoding)
nucleotides['string_nucleotides'] = nucleotides['nucleotide']
nucleotides.loc[:,'nucleotide'] = colonna_dna.apply(dataset_utils.one_hot_encoding)
random.seed(42)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nucleotides['string_nucleotides'] = nucleotides['nucleotide']


In [55]:
img2dna_indices = dict()
for k,v in img2dna.items():
    #print(k)
    #print(v)
    dna_index = np.where(nucleotides['processid'].values == v)
    if dna_index[0].size > 0:
        #print(dna_index)
        dna_index = dna_index[0][0]
        
        for i,(name,_) in enumerate(image_dataset.imgs):
            if name == k:
                image_index = i
                break
        img2dna_indices[image_index] = dna_index
    else:
        raise Exception

In [59]:
len(img2dna_indices)

40050

In [58]:

all_not_expanded_one_hots = nucleotides['nucleotide'].to_numpy()
all_not_expanded_string_dnas= nucleotides['string_nucleotides'].to_numpy()
all_not_expanded_one_hots.shape

(40354,)

In [66]:
all_dnas = []
all_string_dnas = []
all_dna_labels = []
already_seen_dna_indices = set()
is_first_occurrence = []
not_missing = 0
for i in range(len(image_dataset.imgs)):
    if i not in img2dna_indices:
        print(f"skipping: image:{i} does not have a DNA")
        continue
    else:
        not_missing+=1
    all_dnas.append(torch.tensor(all_not_expanded_one_hots[img2dna_indices[i]]))
    all_string_dnas.append(all_not_expanded_string_dnas[img2dna_indices[i]])
    all_dna_labels.append(torch.tensor(image_dataset.imgs[i][1]))
    if img2dna_indices[i] not in already_seen_dna_indices:
        is_first_occurrence.append(True)
        already_seen_dna_indices.add(img2dna_indices[i])
    else:
        is_first_occurrence.append(False)
all_dnas = torch.stack(all_dnas)
all_dna_labels = torch.stack(all_dna_labels)
all_string_dnas = np.array(all_string_dnas)

skipping: image:6 does not have a DNA
skipping: image:43 does not have a DNA
skipping: image:667 does not have a DNA
skipping: image:698 does not have a DNA
skipping: image:1530 does not have a DNA
skipping: image:1531 does not have a DNA
skipping: image:1585 does not have a DNA
skipping: image:1586 does not have a DNA
skipping: image:1587 does not have a DNA
skipping: image:1588 does not have a DNA
skipping: image:1589 does not have a DNA
skipping: image:1590 does not have a DNA
skipping: image:1591 does not have a DNA
skipping: image:1592 does not have a DNA
skipping: image:1593 does not have a DNA
skipping: image:1594 does not have a DNA
skipping: image:1595 does not have a DNA
skipping: image:1596 does not have a DNA
skipping: image:1598 does not have a DNA
skipping: image:1599 does not have a DNA
skipping: image:1600 does not have a DNA
skipping: image:1601 does not have a DNA
skipping: image:1602 does not have a DNA
skipping: image:1603 does not have a DNA
skipping: image:1693 do

In [68]:
all_dnas.shape
not_missing

40050

In [71]:
len(image_dataset.imgs)

41258

In [80]:
from torch.utils.data import Dataset, DataLoader
batch_size = 1000
class WholeDataset(Dataset):
    def __init__(self, data, transform=None):
        self.data = data
        self.targets = data.targets#torch.tensor(targets)
        #self.transform = transform
        
    def __getitem__(self, index):
        x = self.data[index][0]
        y = self.targets[index]
        
        return x, y
    
    def __len__(self):
        return len(self.data)
        
whole_dataset = WholeDataset(image_dataset)
n_classes = np.unique(whole_dataset.targets).shape[0]
whole_loader = torch.utils.data.DataLoader(whole_dataset, batch_size=1,shuffle=False, num_workers=0)


batch_images_list = []
batch_image_labels_list = []
with torch.no_grad():
    for i,(batch, targets) in enumerate(whole_loader):
        if i not in img2dna_indices:
            continue # we skip if an image doesnt have a DNA
        batch_images_list.append(batch.numpy())
        batch_image_labels_list.append(targets.numpy()) 
all_images= np.concatenate(batch_images_list)
all_labels= np.concatenate(batch_image_labels_list)

In [81]:
boldids = dataset_utils.image_filenames_from_df(df)

In [89]:
all_dataset = dict()
all_dataset['all_images'] = all_images 
all_dataset['all_dnas'] = all_dnas.numpy()
all_dataset['all_string_dnas'] = all_string_dnas
all_dataset['all_genus_labels'] = (all_labels+1)
all_dataset['all_boldids']= np.array(boldids)
savemat('unseen_insect_dataset.mat',all_dataset)