In [1]:
import wikipedia
import random
from IPython.display import clear_output
import pickle
wikipedia.set_lang("en")
import numpy as np
import spacy

# Analiza gatunków ptaków świata
Gromadzenie danych: wykorzystujemy Wikipedia API

In [2]:
categories = ["List_of_birds_of_Africa", "List_of_birds_of_Europe", 
              "Birds_of_the_United_States", "List_of_birds_of_Asia",
              "List_of_birds_of_South_America"]

In [None]:
all_birds = list()
for category in categories:
    all_birds += wikipedia.page(category).links
    
random.shuffle(all_birds)

In [None]:
considered_birds = all_birds[0:1000]

In [None]:
bird_pages = list()
for i in range(1000):
    try:
        bird_pages.append(wikipedia.page(considered_birds[i]))
    except:
        pass
    clear_output(wait=True)
    print(str(i) + "of" + "1000")

# Serializacja pobranych stron
Pobrane strony przechowujemy jako listę wikipedia.page.
Typ ten zapewnia nam dostęp do:
- linków wychodzących: page.links
- linków do zdjęć: page.images
- zawartości strony: page.content
- podsumowania: page.summary

In [None]:
pages_file = "pages.bin"
with open(pages_file, 'wb') as file:
    pickle.dump(bird_pages, file)

# Początek analizy

In [4]:
pages_file = "pages.bin"
pages = None
with open(pages_file, 'rb') as file:
    pages = pickle.load(file)

In [5]:
print(pages[1:10])

[<WikipediaPage 'Blackish rail'>, <WikipediaPage 'Creamy-bellied thrush'>, <WikipediaPage 'Rusty-faced parrot'>, <WikipediaPage 'Bahia spinetail'>, <WikipediaPage 'Sickle-winged nightjar'>, <WikipediaPage 'Andean teal'>, <WikipediaPage 'Guaiabero'>, <WikipediaPage 'Buff-banded rail'>, <WikipediaPage 'Pallas's fish eagle'>]


In [6]:
sample_page = pages[np.random.randint(0, len(pages))]
sample_page.title

"Todd's sirystes"

In [7]:
text_data = sample_page.content
print(text_data)

Todd's sirystes (Sirystes subcanescens) is a species of bird in the family Tyrannidae. It was formerly considered conspecific with the sibilant sirystes.


== Distribution and habitat ==
It is found from Guyana throughout northeastern Amazonian Brazil. Its natural habitat is subtropical or tropical moist lowland forests.


== References ==

Donegan, T.M. 2013b. Vocal variation and species limits in the genus Sirystes (Tyrannidae). Conservacion Colombiana 19: 11-30.
South American Classification Committee (May 1, 2014). "Proposal (#610) to South American Classification Committee – Split Sirystes into two (A) or four (B) species". Retrieved August 9, 2014.


Do ekstrakcji informacji geograficznych wykorzystujemy bibliotekę spacy.
Należy pobrać model językowy komendą: sudo python3 -m spacy download en_core_web_sm

In [8]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(text_data)

In [11]:
for ent in doc.ents[:100]:
    if ent.label_ == 'LOC':
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

Wyjaśnienie skrótów encji nazwanych znajduje się na stronie:
https://spacy.io/api/annotation#named-entities

### Poszukiwanie państwa w którym żyje najwięcej ptaków

In [29]:
import progressbar
from geotext import GeoText
class Container:
    def __init__(self, page):
        self.page = page

In [34]:
c = [Container(p) for p in pages[:200]]

In [38]:
c[0].page.content

"The spotted elachura or spotted wren-babbler (Elachura formosa) is a species of passerine bird found in the forests of the eastern Himalayas and Southeast Asia. In the past it was included in the babbler genus Spelaeornis as S. formosus, but molecular phylogenetic studies in 2014 provided evidence that it was distinct from the babblers and part of a basal lineage (one that diverged early) with no other close living relatives within the passerine bird clade Passerida. This led to the creation of a new family, Elachuridae, to accommodate just one species (a monotypic taxon).\n\n\n== Description ==\nThe spotted elachura measures 10 cm including its short tail. It is brown above and white below. It is dark brown all over, with rufous wings and tail. It also has white speckles all over its body, shifting to black barring on its wings and tail.\n\n\n== Habitat and distribution ==\nIt is found in Bangladesh, Bhutan, China, India, Laos, Myanmar, Nepal, and Vietnam. Its natural habitat is subt

In [40]:
GeoText(c[0].page.content).countries

['Bangladesh',
 'Bhutan',
 'China',
 'India',
 'Laos',
 'Myanmar',
 'Nepal',
 'Vietnam']

In [35]:
bar = progressbar.ProgressBar(max_value=len(c)).start()
for i in range(len(c)):
    c[i].countries = GeoText(c[i].page.content).countries
    bar.update(i+1)
bar.finish()

100% (200 of 200) |######################| Elapsed Time: 0:00:08 Time:  0:00:08


In [36]:
from collections import Counter

In [50]:
c0 = Counter(c[1].countries)
c0

Counter({'Argentina': 1,
         'Bolivia': 1,
         'Brazil': 3,
         'Colombia': 2,
         'Paraguay': 1,
         'Peru': 1,
         'Venezuela': 1})

In [51]:
c1 = Counter(c[5].countries)
c1

Counter({'Argentina': 1, 'Brazil': 1, 'Paraguay': 1, 'Uruguay': 1})

In [52]:
c0+c1

Counter({'Argentina': 2,
         'Bolivia': 1,
         'Brazil': 4,
         'Colombia': 2,
         'Paraguay': 2,
         'Peru': 1,
         'Uruguay': 1,
         'Venezuela': 1})

In [54]:
country_counter = Counter()
for i in range(len(c)):
    country_counter += Counter(c[i].countries)
country_counter.most_common()

[('French Guiana', 168),
 ('Brazil', 154),
 ('Argentina', 94),
 ('Peru', 73),
 ('Colombia', 69),
 ('Venezuela', 63),
 ('Bolivia', 58),
 ('India', 54),
 ('Brunei', 50),
 ('Ecuador', 42),
 ('Chile', 41),
 ('France', 39),
 ('Paraguay', 36),
 ('China', 35),
 ('Mexico', 33),
 ('Uruguay', 30),
 ('South Africa', 24),
 ('Canada', 24),
 ('Suriname', 23),
 ('Thailand', 22),
 ('Panama', 22),
 ('Australia', 22),
 ('Angola', 21),
 ('Spain', 20),
 ('United States', 20),
 ('Tanzania', 20),
 ('Guyana', 20),
 ('Kenya', 19),
 ('Indonesia', 18),
 ('Somalia', 16),
 ('Russia', 16),
 ('Botswana', 16),
 ('Pakistan', 15),
 ('Portugal', 15),
 ('Uganda', 15),
 ('Malaysia', 14),
 ('Ethiopia', 14),
 ('Ireland', 14),
 ('Namibia', 14),
 ('Japan', 13),
 ('Germany', 13),
 ('Mozambique', 13),
 ('Zimbabwe', 13),
 ('Turkey', 13),
 ('Zambia', 13),
 ('Iceland', 13),
 ('New Zealand', 12),
 ('Sudan', 11),
 ('Malawi', 11),
 ('Iran', 11),
 ('Nepal', 10),
 ('Vietnam', 10),
 ('Bhutan', 10),
 ('Greenland', 10),
 ('Myanmar', 10),