In [None]:
import requests
from scrapy.selector import Selector
import os 
import numpy as np

In [None]:
class BirdScraper:
    """Scrape bird call recordings from bird-sounds.net and look birds up on Wikipedia.

    The alphabetical index page of the site is downloaded when the object is
    created; every other method works from that cached page. Audio files are
    stored under ``./data/birds audio/<bird name>/``.
    """

    def __init__(self) -> None:
        """Set up URLs and request headers and download the alphabetical index.

        Note that instantiating the class performs an HTTP GET request.
        """
        basic_url = 'https://www.bird-sounds.net'
        self.__basic_url = basic_url
        alphabetical_url = basic_url + '/alphabetical'
        # Browser-like headers sent with every request to bird-sounds.net.
        self.__header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36 OPR/102.0.0.0',
            'Upgrade-Insecure-Requests': '1',
            'Accept-Language': 'es-ES,es;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7'
        }
        self.__basic_response = Selector(
            requests.get(alphabetical_url, headers=self.__header))
        self.__data_path = './data/'
        # Lower-cased link texts accepted as a conservation status.
        self.__poss_conservation_status = ['least concern', 'threatened', 'extinct']


    def __get_birds_hrefs(self) -> list:
        """Return the relative href of every bird listed on the index page."""
        all_urls = self.__basic_response\
            .xpath('//div[@class="bird-outer"]/a/@href').getall()
        return all_urls


    @staticmethod
    def __generate_bird_name(bird_href:str) -> str:
        """Turn a relative href such as ``'aberts-towhee/'`` into ``'aberts towhee'``."""
        return bird_href.replace('-', ' ').replace('/', '')


    def get_all_bird_names(self) -> list:
        """Return the names of all birds listed on the index page."""
        return [
            BirdScraper.__generate_bird_name(bird_href)
            for bird_href in self.__get_birds_hrefs()]

    # ------------------------------------------------------------------------ #
    def __scrap_bird_audio(self, bird_name, url, audio_extension) -> None:
        """Download one recording to ``<data>/birds audio/<bird_name>/call.<ext>``.

        Args:
            bird_name: Folder name for the bird.
            url: Absolute URL of the audio file.
            audio_extension: File extension, without the leading dot.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        audio = requests.get(url, headers=self.__header)
        # Fail loudly rather than saving an error page as if it were audio.
        audio.raise_for_status()
        folder_path = self.__data_path + 'birds audio/' + bird_name
        # TODO: replace 'call' with a name that reflects exactly what the audio is.
        file = 'call.' + audio_extension
        # makedirs also creates the missing parents ('./data/birds audio'),
        # which a plain os.mkdir would fail on during the first run.
        os.makedirs(folder_path, exist_ok=True)
        with open(folder_path + '/' + file, 'wb') as f:
            f.write(audio.content)


    def href_bird_tracks(self):
        """Visit every bird page and download its audio recording.

        Raises:
            ValueError: If a bird page has more than one audio source, or none.
        """
        source_xpath = '//audio[@id="birdaudio"]/source'
        # One session for the whole crawl, closed on exit.
        with requests.Session() as session:
            for sub_href in self.__get_birds_hrefs():
                href = self.__basic_url + '/' + sub_href
                response = Selector(session.get(href, headers=self.__header))
                audio_srcs = response.xpath(source_xpath + '/@src').getall()
                # Message text kept as-is; it reads "more than two tabs in <href>".
                if len(audio_srcs) > 1:
                    raise ValueError(f'Más de dos pestañas en {href}')
                audio_mime = response.xpath(source_xpath + '/@type').get()
                if not audio_srcs or audio_mime is None:
                    raise ValueError(f'No audio source found in {href}')
                # The last part of the type attribute is used as the extension.
                audio_extension = audio_mime.split('/')[-1]
                audio_url = self.__basic_url + '/' + audio_srcs[0]
                bird_name = BirdScraper.__generate_bird_name(sub_href)
                self.__scrap_bird_audio(bird_name, audio_url, audio_extension)


    @staticmethod
    def __get_url_wikipedia(bird_name:str) -> str:
        """Build the English Wikipedia URL for a space-separated bird name.

        The first word is capitalised, the rest are lower-cased, and a trailing
        ``s`` on the second-to-last word is rewritten as a URL-encoded
        possessive (``'aberts towhee'`` -> ``'Abert%27s_towhee'``).
        """
        basic_url = 'http://en.wikipedia.org/wiki/'
        words = bird_name.split(' ')
        possessive_index = len(words) - 2
        new_words = []
        for i, word in enumerate(words):
            word = word.lower()
            if i == 0:
                word = word.capitalize()
            if i == possessive_index and word.endswith('s'):
                word = word[:-1] + r'%27s'
            new_words.append(word)
        url = basic_url + '_'.join(new_words)
        print(url)  # progress output
        return url


    def __scrap_wiki_data(self, response):
        """Extract conservation status and infobox link texts from a Wikipedia page.

        Args:
            response: Selector for the downloaded Wikipedia page.

        Returns:
            dict with ``'conservation_status'`` (the first matching link text,
            or ``None`` when the page has no recognised status) and
            ``'classification'`` (all link texts found in the infobox rows).
        """
        biota_table = response.xpath('//table[@class="infobox biota"]')

        status_candidates = biota_table.xpath(
            './/td[@colspan="2"]//a/text()').getall()
        # next(..., None) instead of [0]: a page without an infobox or status
        # must not abort the whole run.
        conservation_status = next(
            (el for el in status_candidates
             if el.lower() in self.__poss_conservation_status),
            None)
        # FIXME (original author's note): the classification extraction is wrong.
        classification = biota_table.xpath(
            './/tr//a/text()').getall()
        print(classification)
        print(conservation_status)
        return {
            'conservation_status': conservation_status,
            'classification': classification,
        }


    def get_wiki_bird_info(self):
        """Look up every downloaded bird on Wikipedia.

        Bird names are taken from the folder names under
        ``<data>/birds audio/``.

        Returns:
            dict mapping each bird name to the dict produced for its page.

        Raises:
            FileNotFoundError: If the audio directory does not exist.
        """
        audio_root = self.__data_path + 'birds audio/'
        # Only a missing directory is reported as FileNotFoundError; other
        # failures (e.g. network errors) keep their own exception type.
        if not os.path.isdir(audio_root):
            raise FileNotFoundError(f'Audio directory not found: {audio_root}')
        bird_info = {}
        for bird_name in os.listdir(audio_root):
            url = BirdScraper.__get_url_wikipedia(bird_name)
            response = Selector(requests.get(url))
            bird_info[bird_name] = self.__scrap_wiki_data(response)
        return bird_info


In [85]:
# Creating the scraper downloads the alphabetical index page of bird-sounds.net.
objt = BirdScraper()
# For every folder under ./data/birds audio/, print the Wikipedia URL, the
# infobox link texts and the conservation status that were found.
objt.get_wiki_bird_info()

CONTROL 1
http://en.wikipedia.org/wiki/Abert%27s_towhee
['Conservation status', 'Least Concern', 'IUCN 3.1', '[1]', 'Scientific classification', 'Eukaryota', 'Animalia', 'Chordata', 'Aves', 'Passeriformes', 'Passerellidae', 'Binomial name', 'Baird', 'Synonyms']
Least Concern
http://en.wikipedia.org/wiki/Acadian_flycatcher
['ⓘ', 'Conservation status', 'Least Concern', 'IUCN 3.1', '[1]', 'Scientific classification', 'Eukaryota', 'Animalia', 'Chordata', 'Aves', 'Passeriformes', 'Tyrannidae', 'Binomial name', 'Vieillot', 'Synonyms', '[2]']
Least Concern
http://en.wikipedia.org/wiki/Acorn_woodpecker
['Conservation status', 'Least Concern', 'IUCN 3.1', '[1]', 'Scientific classification', 'Eukaryota', 'Animalia', 'Chordata', 'Aves', 'Piciformes', 'Picidae', 'Binomial name', 'Swainson']
Least Concern
http://en.wikipedia.org/wiki/Alder_flycatcher
['Conservation status', 'Least Concern', 'IUCN 3.1', '[1]', 'Scientific classification', 'Eukaryota', 'Animalia', 'Chordata', 'Aves', 'Passeriformes',

In [None]:
# NOTE(review): `response` is read here, but no cell visible in this part of the
# notebook assigns it at module level, so this cell appears to depend on state
# left over from another cell. Presumably it held the HTTP response for the
# alphabetical index page — TODO confirm and create it explicitly here.
# Wrap the response in a scrapy Selector, parsing it as HTML.
response = Selector(response, type='html')

In [None]:
# List the href of every link that is a direct child of a <div class="bird-outer">
# (the same XPath that BirdScraper uses to enumerate the bird pages).
response.xpath('//div[@class="bird-outer"]/a/@href').getall()

['aberts-towhee/',
 'acadian-flycatcher/',
 'acorn-woodpecker/',
 'alder-flycatcher/',
 'allens-hummingbird/',
 'american-avocet/',
 'american-bittern/',
 'american-black-duck/',
 'american-coot/',
 'american-crow/',
 'american-dipper/',
 'american-golden-plover/',
 'american-goldfinch/',
 'american-kestrel/',
 'american-oystercatcher/',
 'american-pipit/',
 'american-redstart/',
 'american-robin/',
 'american-three-toed-woodpecker/',
 'american-tree-sparrow/',
 'american-white-ibis/',
 'american-wigeon/',
 'american-woodcock/',
 'ancient-murrelet/',
 'anhinga/',
 'annas-hummingbird/',
 'antillean-nighthawk/',
 'arctic-redpoll/',
 'arctic-tern/',
 'arizona-woodpecker/',
 'ash-throated-flycatcher/',
 'atlantic-puffin/',
 'bachmans-sparrow/',
 'bairds-sandpiper/',
 'bairds-sparrow/',
 'bald-eagle/',
 'baltimore-oriole/',
 'band-tailed-pigeon/',
 'bank-swallow/',
 'bare-throated-tiger-heron/',
 'barn-owl/',
 'barn-swallow/',
 'barnacle-goose/',
 'barred-owl/',
 'barrows-goldeneye/',
 'bay