In [1]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup, SoupStrainer
import numpy as np
import re
import matplotlib.pyplot as plt

from glob import glob

import pandas as pd
from tqdm.auto import tqdm
from bs4 import BeautifulSoup

from time import sleep

from skimage import io

from concurrent.futures import ThreadPoolExecutor

import json
import pickle

import os

from datetime import datetime

In [2]:
def get_soup_simple(url, strain_name=None, strain_attrs=None):
    response = requests.get(url)#, headers=headers, proxies=proxies)
    if strain_name is None and strain_attrs is None:
        soup = BeautifulSoup(response.text, 'lxml')
    elif strain_attrs is None:
        strainer = SoupStrainer(strain_name)
        soup = BeautifulSoup(response.text, 'lxml', parse_only=strainer)
    else:
        strainer = SoupStrainer(strain_name, strain_attrs)
        soup = BeautifulSoup(response.text, 'lxml', parse_only=strainer)
    return soup

In [3]:
base_url = 'https://araneae.nmbe.ch/'
families_url = base_url + 'list/families'

main_page = get_soup_simple(families_url, 'tbody')
families = [x.text for x in main_page.tbody('b')]
links_to_species_list = [base_url+x['href'] for x in main_page.tbody('a') if x.text.lower() == 'species']

## Pull all species names

In [4]:
info = []

for family, link_to_species_list in tqdm(zip(families, links_to_species_list), total=len(families)):
    sleep(0.1)
    f_data = get_soup_simple(link_to_species_list, 'tbody')
    
    links_to_species = [base_url + x['href'] for x in f_data('a') if 'data' in x['href']]
    data_strings = [x.text for x in f_data('em')]
    species_list = []
    for data_string in data_strings:
        if any(x.isupper() for x in data_string):
            genus = data_string
            continue
        else:
            species = data_string
        species_list.append([genus, species])
    species_list = np.array(species_list)

    for genus, species, species_link in zip(*species_list.T, links_to_species):
        info.append([family, genus, genus + ' ' + species, species_link])
info = np.array(info)

df = pd.DataFrame(info, columns=['family', 'genus', 'species', 'link'])
df.head()

HBox(children=(FloatProgress(value=0.0, max=64.0), HTML(value='')))




Unnamed: 0,family,genus,species,link
0,Agelenidae,Agelena,Agelena agelenoides,https://araneae.nmbe.ch//data/2235
1,Agelenidae,Agelena,Agelena atlantea,https://araneae.nmbe.ch//data/5412
2,Agelenidae,Agelena,Agelena canariensis,https://araneae.nmbe.ch//data/5413
3,Agelenidae,Agelena,Agelena labyrinthica,https://araneae.nmbe.ch//data/637
4,Agelenidae,Agelena,Agelena longipes,https://araneae.nmbe.ch//data/2236


In [6]:
df.to_json('data/spiderbase_1.json')

In [7]:
df = pd.read_json('data/spiderbase_1.json')

In [9]:
def group_images(img_content, species):
    groups = {'habitus': [], 'epigyne': [], 'pedipalp': [], 'vulve': [], 'other': []}
    for img in img_content:
        name = img['name'].lower()
        if 'epigyne' in name:
            groups['epigyne'].append(img)
        elif 'pedipalp' in name:
            groups['pedipalp'].append(img)
        elif 'vulve' in name:
            groups['vulve'].append(img)
        elif 'habitus' in name:
            groups['habitus'].append(img)
        else:
            groups['other'].append(img)
    return groups
    
def get_image_metainfo_quick(args):
    url, species_name = args
    img_content = []
    sleep(0.1)
    try:
        img_data = get_soup_simple(url, 'div', {'class': 'thumbnail'})('div', 'thumbnail')
        for dt in img_data:
            sleep(0.1)
            name = dt.p.text.lstrip()
            credits = dt.a['title']
            link = base_url+dt.img['src']
            image = io.imread(link)
            img_content.append({'name': name, 'image': image , 'credits': credits})

        groups = group_images(img_content, species_name)
        with open(f'data/image_data_1/{species_name.replace(" ","-").lower()}.pkl','wb') as out_file:
            pickle.dump(groups, out_file)
        return True, args, ''
    except Exception as e:
        return False, args, e

## Remove data which is deprecated

In [10]:
existing_data = glob('data/image_data_1/*.pkl')
updated_data = [f'data/image_data_1/{species_name.replace(" ","-").lower()}.pkl' for species_name in df.species]

for data in existing_data:
    if data not in updated_data:
        print(f'Deleting {data}')
        os.system(f'rm -rf {data}')

## Download any new data

In [11]:
arg_list= df[['link', 'species']].values.astype(str)

results = []
for arg in tqdm(arg_list):
    species_name = arg[1]
    fname = f'data/image_data_1/{species_name.replace(" ","-").lower()}.pkl'
    if not os.path.isfile(fname):
        results.append(get_image_metainfo_quick(arg))

HBox(children=(FloatProgress(value=0.0, max=5231.0), HTML(value='')))




## Re-download updated images

In [12]:
latest_update = '20-03-2020'
latest_update = datetime.strptime(latest_update, '%d-%m-%Y').date()

In [13]:
dates = []
species = []

all_updates = False
i = 0

while not all_updates:
    sleep(0.1)
    data = get_soup_simple(f'https://araneae.nmbe.ch/logs?page={i}', 'table', {'class': 'table'})('tr')[1:]
    print(f'Number of parsed pages: {i+1}')
    dates.extend([datetime.strptime(x.td.text.replace('\n','').replace(' ',''), '%d-%m-%Y').date() for x in data])
    species.extend([x.a.text for x in data])
    if dates[-1] < latest_update:
        all_updates = True
    else:
        i += 1
        
dates = np.array(dates)
species = np.array(species)
species = np.unique(species[dates > latest_update])

Number of parsed pages: 1


In [14]:
arg_list_update = df[df.species.isin(species)][['link', 'species']].values.astype(str)

results = []
for arg in tqdm(arg_list_update):
    results.append(get_image_metainfo_quick(arg))

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


