In [7]:
import wikipedia
import random
from IPython.display import clear_output
import pickle
wikipedia.set_lang("en")
import numpy as np
import spacy

# Analiza gatunków ptaków świata
Gromadzenie danych: wykorzystujemy Wikipedia API

In [8]:
categories = ["List_of_birds_of_Africa", "List_of_birds_of_Europe", 
              "Birds_of_the_United_States", "List_of_birds_of_Asia",
              "List_of_birds_of_South_America"]

In [9]:
all_birds = list()
for category in categories:
    all_birds += wikipedia.page(category).links
    
random.shuffle(all_birds)

In [10]:
considered_birds = all_birds[0:1000]

In [11]:
bird_pages = list()
for i in range(1000):
    try:
        bird_pages.append(wikipedia.page(considered_birds[i]))
    except:
        pass
    clear_output(wait=True)
    print(str(i) + "of" + "1000")

999of1000


# Serializacja pobranych stron
Pobrane strony przechowujemy jako listę wikipedia.page.
Typ ten zapewnia nam dostęp do:
- linków wychodzących: page.links
- linków do zdjęć: page.images
- zawartości strony: page.content
- podsumowania: page.summary

In [12]:
pages_file = "pages.bin"
with open(pages_file, 'wb') as file:
    pickle.dump(bird_pages, file)

# Początek analizy

In [13]:
pages_file = "pages.bin"
pages = None
with open(pages_file, 'rb') as file:
    pages = pickle.load(file)

In [14]:
print(pages[1:10])

[<WikipediaPage 'Grey-naped antpitta'>, <WikipediaPage 'Shelley's greenbul'>, <WikipediaPage 'Sumatran ground cuckoo'>, <WikipediaPage 'Sharp-billed treehunter'>, <WikipediaPage 'Yellow-throated longclaw'>, <WikipediaPage 'Rufous-rumped seedeater'>, <WikipediaPage 'Tree martin'>, <WikipediaPage 'Acorn woodpecker'>, <WikipediaPage 'Little curlew'>]


In [15]:
sample_page = pages[np.random.randint(0, len(pages))]
sample_page.title

'White-capped albatross'

In [16]:
text_data = sample_page.content
print(text_data)

The white-capped albatross (Thalassarche cauta steadi) is a mollymawk that breeds on the islands off of New Zealand. Not all experts agree that this form should be recognized as a separate species from the shy albatross, Thalassarche cauta. It is a medium-sized black, slate gray, and white albatross and is the largest of the mollymawks.


== Taxonomy ==
Mollymawks are a type of albatross that belong to Diomedeidae family and come from the Procellariiformes order, along with shearwaters, fulmars, storm petrels, and diving petrels. They share certain identifying features. First, they have nasal passages that attach to the upper bill called naricorns. Although the nostrils on the albatross are on the sides of the bill. The bills of Procellariiformes are also unique in that they are split into between seven and nine horny plates. Finally, they produce a stomach oil made up of wax esters and triglycerides that is stored in the proventriculus. This is used against predators as well as an ene

Do ekstrakcji informacji geograficznych wykorzystujemy bibliotekę spacy.
Należy pobrać model językowy komendą: sudo python3 -m spacy download en_core_web_sm

In [17]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(text_data)

In [18]:
for ent in doc.ents[:100]:
    if ent.label_ == 'LOC':
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

Bollons Island 2827 2841 LOC
Islands 2853 2860 LOC
Atlantic 2952 2960 LOC
South 2988 2993 LOC
Africa 3125 3131 LOC
the south Atlantic 3177 3195 LOC
Indian Ocean 3217 3229 LOC
Bass Strait 3871 3882 LOC
Auckland Island 3947 3962 LOC


Wyjaśnienie skrótów encji nazwanych znajduje się na stronie:
https://spacy.io/api/annotation#named-entities

### Poszukiwanie państwa w którym żyje najwięcej ptaków

In [19]:
import progressbar
from geotext import GeoText
class Container:
    def __init__(self, page):
        self.page = page

In [20]:
c = [Container(p) for p in pages[:200]]

In [21]:
c[0].page.content

'The white-capped dipper (Cinclus leucocephalus) is an aquatic songbird found in South America. It is a small black bird with white spots. It is found in Bolivia, Colombia, Ecuador, Peru and Venezuela.\n\n\n== References ==\n\nArteaga K. Nino.(2008). Son astutos y dificilmente caen en las redes de neblina, generalmente van en parejas en busca de alimento encontrado an orillas de los ríos. De estologia curiosa, prefieren ver al observador.\n\n\n== External links ==\n"Cinclus". Integrated Taxonomic Information System. \nBird Forum page on White-capped Dipper (including photo)'

In [22]:
GeoText(c[0].page.content).countries

['Bolivia', 'Colombia', 'Ecuador', 'Peru', 'Venezuela']

In [23]:
bar = progressbar.ProgressBar(max_value=len(c)).start()
for i in range(len(c)):
    c[i].countries = GeoText(c[i].page.content).countries
    bar.update(i+1)
bar.finish()

100% (200 of 200) |######################| Elapsed Time: 0:01:18 Time:  0:01:18


In [24]:
from collections import Counter

In [25]:
c0 = Counter(c[1].countries)
c0

Counter({'Venezuela': 1})

In [26]:
c1 = Counter(c[5].countries)
c1

Counter({'Angola': 1,
         'Benin': 1,
         'Burkina Faso': 1,
         'Burundi': 1,
         'Cameroon': 1,
         'Central African Republic': 1,
         'Chad': 1,
         'Gabon': 1,
         'Gambia': 1,
         'Ghana': 1,
         'Guinea': 1,
         'Guinea-Bissau': 1,
         'Ivory Coast': 1,
         'Kenya': 1,
         'Lesotho': 1,
         'Liberia': 1,
         'Malawi': 1,
         'Mali': 1,
         'Mozambique': 1,
         'Niger': 1,
         'Nigeria': 1,
         'Rwanda': 1,
         'Senegal': 1,
         'Sierra Leone': 1,
         'Somalia': 1,
         'South Africa': 1,
         'South Sudan': 1,
         'Swaziland': 1,
         'Tanzania': 1,
         'Togo': 1,
         'Uganda': 1,
         'Zambia': 1,
         'Zimbabwe': 1})

In [27]:
c0+c1

Counter({'Angola': 1,
         'Benin': 1,
         'Burkina Faso': 1,
         'Burundi': 1,
         'Cameroon': 1,
         'Central African Republic': 1,
         'Chad': 1,
         'Gabon': 1,
         'Gambia': 1,
         'Ghana': 1,
         'Guinea': 1,
         'Guinea-Bissau': 1,
         'Ivory Coast': 1,
         'Kenya': 1,
         'Lesotho': 1,
         'Liberia': 1,
         'Malawi': 1,
         'Mali': 1,
         'Mozambique': 1,
         'Niger': 1,
         'Nigeria': 1,
         'Rwanda': 1,
         'Senegal': 1,
         'Sierra Leone': 1,
         'Somalia': 1,
         'South Africa': 1,
         'South Sudan': 1,
         'Swaziland': 1,
         'Tanzania': 1,
         'Togo': 1,
         'Uganda': 1,
         'Venezuela': 1,
         'Zambia': 1,
         'Zimbabwe': 1})

In [28]:
country_counter = Counter()
for i in range(len(c)):
    country_counter += Counter(c[i].countries)
country_counter.most_common()

[('Georgia', 66),
 ('India', 57),
 ('Russia', 49),
 ('Brazil', 37),
 ('Australia', 34),
 ('New Zealand', 34),
 ('United States', 33),
 ('Peru', 32),
 ('Canada', 31),
 ('Colombia', 31),
 ('China', 30),
 ('Venezuela', 29),
 ('Tanzania', 29),
 ('Argentina', 26),
 ('Ecuador', 26),
 ('Japan', 24),
 ('Kenya', 24),
 ('Bolivia', 24),
 ('Mexico', 22),
 ('South Africa', 22),
 ('Angola', 20),
 ('Nigeria', 20),
 ('Philippines', 20),
 ('Chile', 20),
 ('Uganda', 18),
 ('Turkey', 17),
 ('Iran', 15),
 ('Ethiopia', 15),
 ('Spain', 14),
 ('Germany', 14),
 ('Indonesia', 14),
 ('Mozambique', 14),
 ('France', 13),
 ('Zambia', 13),
 ('Bangladesh', 13),
 ('Cameroon', 12),
 ('Sri Lanka', 12),
 ('Rwanda', 11),
 ('Malawi', 11),
 ('Namibia', 11),
 ('Thailand', 11),
 ('Nepal', 11),
 ('Israel', 10),
 ('Guinea', 10),
 ('Costa Rica', 10),
 ('Guyana', 10),
 ('French Guiana', 10),
 ('Senegal', 9),
 ('Gabon', 9),
 ('Mongolia', 9),
 ('Falkland Islands', 9),
 ('Panama', 9),
 ('Greece', 9),
 ('Sweden', 9),
 ('Pakistan', 8

### Grupowanie ptaków na podstawie siedlisk 

Podobieństwo między dwoma gatunkami ptaków liczymy z poniższego wzoru (iloczyn lokalizacji w których występują przez ich sumę).
$$ sim(x^i,x^j)=\frac{|(x_k^i,x_k^j):x_k^i = x_k^j = 1|}{|(x_k^i,x_k^j):x_k^i \lor x_k^j = 1)|}$$

In [29]:
def similarity(b1,b2):
    c1,c2 = Counter(b1.countries), Counter(b2.countries)
    locations_sum = len(c1 + c2)
    locations_intersection = len(c1 + c2)
    if locations_sum > 0:
        return locations_intersection/locations_sum
    else:
        return 0

Za podobieństwo pomiędzy grupami przyjmujemy minimalne podobieństwo między dwoma gatunkami należącymi do oddzielnych grup.
$$ sim(G^I,G^M)=min\{sim(x^i,x^j):x^i \in G^I,x^j \in G^M \}$$

In [35]:
def groupSimilarity(g1, g2):
    minSimilarity = 1
    
    for g1_ele in g1: 
        for g2_ele in g2:
            tmp = similarity(g1_ele,g2_ele)
            if tmp < minSimilarity:
                minSimilarity = tmp
            if minSimilarity == 0:
                return minSimilarity
            
    return minSimilarity

W następnym kroku sprawdzamy czy istnieją grupy podobne do siebie, jeśli tak te o największym podobieństwie i łączymy je ze sobą.
$$ G^I,G^M=argmax\{sim(G^I,G^M):G^I,G^M \in G, G^I\neq G^M \}$$

$$sim(G^I,G^M) > 0$$

In [31]:
def similarities(groups):
    maxGroupSimilarity, x, y = 0, 0, 0
    
    for g1_ele in groups: 
        for g2_ele in groups:
            if g1_ele != g2_ele:
                tmp = groupSimilarity(g1_ele, g2_ele)
                if maxGroupSimilarity < tmp: 
                    maxGroupSimilarity, x, y = tmp, g1_ele, g2_ele
                    
    if maxGroupSimilarity > 0:
        groups.remove(x)
        groups.remove(y)
        groups.append(x + y)
        groups = similarities(groups)
        
    return groups

groups = []
for group in c[0:200]:
    groups.append([group])
groups = similarities(groups)
for group in groups:
    locations = Counter()
    print("*****GRUPA*****")
    print("Zwierzęta:")
    for element in group:
        print(element.page.title)
        locations += Counter(element.countries)
    print("***Kraje:***")
    print(locations.most_common(10))

*****GRUPA*****
Zwierzęta:
Indian cormorant
Frigatebird
Northern crombec
Eastern bearded greenbul
Violet-backed starling
Orange-billed nightingale-thrush
Painted bush quail
Cackling goose
***Kraje:***
[('Canada', 14), ('India', 5), ('Christmas Island', 4), ('Cambodia', 2), ('Central African Republic', 2), ('Thailand', 2), ('Japan', 2), ('French Guiana', 2), ('Sri Lanka', 2), ('Mexico', 2)]
*****GRUPA*****
Zwierzęta:
Red-crested pochard
Red-billed leiothrix
Bolivian white-crowned tapaculo
Many-striped canastero
Cinnamon-breasted warbler
Collared lark
Red-capped forest warbler
Large blue flycatcher
***Kraje:***
[('Peru', 5), ('Colombia', 4), ('Spain', 2), ('Nepal', 2), ('Japan', 2), ('Ecuador', 2), ('India', 1), ('Bolivia', 1), ('Tanzania', 1), ('Namibia', 1)]
*****GRUPA*****
Zwierzęta:
Hudson's canastero
D'Arnaud's barbet
Short-toed rock thrush
White-sided flowerpiercer
Serra do Mar tyrannulet
Jackson's widowbird
Plain-crested elaenia
Mountain imperial pigeon
***Kraje:***
[('Brazil', 5)

In [43]:
locations = []
for group in groups[0:2]:
    loc = Counter()
    for element in group:
        loc += Counter(element.countries)
    locations.append([i[0] for i in loc.most_common(10)])
    print(locations[-1])

['Canada', 'India', 'Christmas Island', 'Cambodia', 'Central African Republic', 'Thailand', 'Japan', 'French Guiana', 'Sri Lanka', 'Mexico']
['Peru', 'Colombia', 'Spain', 'Nepal', 'Japan', 'Ecuador', 'India', 'Bolivia', 'Tanzania', 'Namibia']


In [32]:
import urllib
import simplejson
import folium

googleGeocodeUrl = 'http://maps.googleapis.com/maps/api/geocode/json?'

def get_coordinates(query, from_sensor=False):
    query = query.encode('utf-8')
    params = {
        'address': query,
        'sensor': "true" if from_sensor else "false"
    }
    url = googleGeocodeUrl + urllib.parse.urlencode(params)
    json_response = urllib.request.urlopen(url)
    response = simplejson.loads(json_response.read())
    if response['results']:
        location = response['results'][0]['geometry']['location']
        latitude, longitude = location['lat'], location['lng']
    else:
        latitude, longitude = None, None
    return latitude, longitude

In [49]:
group_coords = []
for loc in locations:
    coords = []
    for element in loc:
        la,lo = get_coordinates(element, from_sensor=False)
        if la != None:
            coords.append((la,lo))
    group_coords.append(coords)
    print(coords)

[(56.130366, -106.346771), (20.593684, 78.96288), (-10.447525, 105.690449), (12.565679, 104.990963), (6.611110999999999, 20.939444), (15.870032, 100.992541), (36.204824, 138.252924), (3.933889, -53.125782), (7.873053999999999, 80.77179699999999), (23.634501, -102.552784)]
[(40.46366700000001, -3.74922), (28.394857, 84.12400799999999), (36.204824, 138.252924), (-1.831239, -78.18340599999999), (-16.290154, -63.58865299999999), (-6.369028, 34.888822), (-22.95764, 18.49041)]


In [50]:
START = [0, 0]
coords =  group_coords[0]

map_animals = folium.Map(location=START, zoom_start=0, tiles='cartodbpositron', width=640, height=480)

[folium.CircleMarker(coords[i], radius=1,
                color='#0080bb', fill_color='#0080bb').add_to(map_animals) for i in range(len(coords))]

map_animals

In [51]:
START = [0, 0]
coords =   group_coords[1]

map_animals = folium.Map(location=START, zoom_start=0, tiles='cartodbpositron', width=640, height=480)

[folium.CircleMarker(coords[i], radius=1,
                color='#0080bb', fill_color='#0080bb').add_to(map_animals) for i in range(len(coords))]

map_animals