In [4]:
import wikipedia
import random
from IPython.display import clear_output
import pickle
wikipedia.set_lang("en")
import numpy as np
import spacy

# Analiza gatunków ptaków świata
Gromadzenie danych: wykorzystujemy Wikipedia API

In [8]:
# Titles of the Wikipedia list articles whose outgoing links are harvested
# as candidate bird pages.
categories = [
    "List_of_birds_of_Africa",
    "List_of_birds_of_Europe",
    "Birds_of_the_United_States",
    "List_of_birds_of_Asia",
    "List_of_birds_of_South_America",
]

In [9]:
# Collect every outgoing link of each list article, then shuffle the combined
# list in place so that a later slice is a random sample.
all_birds = []
for list_title in categories:
    all_birds.extend(wikipedia.page(list_title).links)

random.shuffle(all_birds)

In [10]:
# Keep at most the first 1000 titles of the shuffled list.
considered_birds = all_birds[:1000]

In [11]:
# Download the Wikipedia page for every considered title (best effort).
# Titles whose download fails are skipped, so `bird_pages` may end up shorter
# than `considered_birds`.
bird_pages = []
total_titles = len(considered_birds)
for position, title in enumerate(considered_birds, start=1):
    try:
        bird_pages.append(wikipedia.page(title))
    except Exception:
        # Deliberately best-effort: a title that cannot be fetched is skipped.
        # `Exception` (not a bare `except`) keeps KeyboardInterrupt working so
        # the long download can still be interrupted from the kernel.
        pass
    # Redraw a single progress line instead of flooding the cell output.
    clear_output(wait=True)
    print(str(position) + " of " + str(total_titles))

999of1000


# Serializacja pobranych stron
Pobrane strony przechowujemy jako listę wikipedia.page.
Typ ten zapewnia nam dostęp do:
- linków wychodzących: page.links
- linków do zdjęć: page.images
- zawartości strony: page.content
- podsumowania: page.summary

In [12]:
# Serialise the downloaded page objects to disk so the analysis below can be
# re-run without downloading them again.
pages_file = "pages.bin"
with open(pages_file, mode='wb') as pickle_sink:
    pickle.dump(bird_pages, pickle_sink)

# Początek analizy

In [5]:
# Reload the pickled list of pages from disk into `pages`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load a pages.bin that was produced by this notebook.
pages_file = "pages.bin"
pages = None
with open(pages_file, 'rb') as file:
    pages = pickle.load(file)

In [6]:
# Peek at a few loaded pages; the slice starts at index 1, so the very first
# page is not shown.
print(pages[1:10])

[<WikipediaPage 'Grey-naped antpitta'>, <WikipediaPage 'Shelley's greenbul'>, <WikipediaPage 'Sumatran ground cuckoo'>, <WikipediaPage 'Sharp-billed treehunter'>, <WikipediaPage 'Yellow-throated longclaw'>, <WikipediaPage 'Rufous-rumped seedeater'>, <WikipediaPage 'Tree martin'>, <WikipediaPage 'Acorn woodpecker'>, <WikipediaPage 'Little curlew'>]


In [7]:
# Pick one page uniformly at random (no seed is set, so the choice differs
# between runs) and show its title.
sample_page = pages[np.random.randint(0, len(pages))]
sample_page.title

'Potoo'

In [8]:
# Full plain-text content of the sampled page; reused below as NLP input.
text_data = sample_page.content
print(text_data)

Potoos (family Nyctibiidae) are a group of near passerine birds related to the nightjars and frogmouths. They are sometimes called poor-me-ones, after their haunting calls. There are seven species in one genus, Nyctibius, in tropical Central and South America.
These are nocturnal insectivores which lack the bristles around the mouth found in the true nightjars. They hunt from a perch like a shrike or flycatcher. During the day they perch upright on tree stumps, camouflaged to look like part of the stump. The single spotted egg is laid directly on the top of a stump.


== Evolution and taxonomy ==

The potoos are today an exclusively New World family, but they apparently had a much more widespread distribution in the past. Fossil remains of potoos dating from the Oligocene and Eocene have been found in France and Germany. A complete skeleton of the genus Paraprefica has been found in Messel, Germany. It had skull and leg features similar to those of modern potoos, suggesting that it may

Do ekstrakcji informacji geograficznych wykorzystujemy bibliotekę spacy.
Należy pobrać model językowy komendą: sudo python3 -m spacy download en_core_web_sm

In [9]:
# Load the 'en_core_web_sm' spaCy model and run it over the sampled page text.
nlp = spacy.load('en_core_web_sm')
doc = nlp(text_data)

In [10]:
# Among the first 100 recognised entities, print those labelled 'LOC'
# together with their character offsets in the text.
location_entities = (ent for ent in doc.ents[:100] if ent.label_ == 'LOC')
for ent in location_entities:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

Central and South America 234 259 LOC
Australasia 2519 2530 LOC
the Amazon Basin 3786 3802 LOC
Central and South American 3854 3880 LOC
Caribbean 3928 3937 LOC


Wyjaśnienie skrótów encji nazwanych znajduje się na stronie:
https://spacy.io/api/annotation#named-entities

### Poszukiwanie państwa, w którym żyje najwięcej ptaków

In [11]:
import progressbar
from geotext import GeoText
class Container:
    """Pairs a downloaded page with the countries detected in its text.

    Attributes:
        page: the page object passed to the constructor.
        countries: list of country names found in the page content; starts
            empty and is filled in later by the extraction cell.
    """

    def __init__(self, page):
        self.page = page
        # Defined up front so that reading `.countries` before the extraction
        # step yields an empty list instead of raising AttributeError.
        self.countries = []

In [12]:
# Wrap the first 200 loaded pages in Container objects (subset of `pages`).
c = [Container(p) for p in pages[:200]]

In [13]:
# Inspect the raw text of the first wrapped page.
c[0].page.content

'The white-capped dipper (Cinclus leucocephalus) is an aquatic songbird found in South America. It is a small black bird with white spots. It is found in Bolivia, Colombia, Ecuador, Peru and Venezuela.\n\n\n== References ==\n\nArteaga K. Nino.(2008). Son astutos y dificilmente caen en las redes de neblina, generalmente van en parejas en busca de alimento encontrado an orillas de los ríos. De estologia curiosa, prefieren ver al observador.\n\n\n== External links ==\n"Cinclus". Integrated Taxonomic Information System. \nBird Forum page on White-capped Dipper (including photo)'

In [14]:
# Try the GeoText country extraction on a single page.
GeoText(c[0].page.content).countries

['Bolivia', 'Colombia', 'Ecuador', 'Peru', 'Venezuela']

In [15]:
# Run GeoText over every wrapped page and store the detected countries on the
# container, advancing a progress bar after each page.
bar = progressbar.ProgressBar(max_value=len(c)).start()
for done, container in enumerate(c, start=1):
    container.countries = GeoText(container.page.content).countries
    bar.update(done)
bar.finish()

100% (200 of 200) |######################| Elapsed Time: 0:01:27 Time:  0:01:27


In [16]:
from collections import Counter

In [17]:
# Country tally for a single page (index 1).
c0 = Counter(c[1].countries)
c0

Counter({'Venezuela': 1})

In [18]:
# Country tally for another page (index 5).
c1 = Counter(c[5].countries)
c1

Counter({'Angola': 1,
         'Benin': 1,
         'Burkina Faso': 1,
         'Burundi': 1,
         'Cameroon': 1,
         'Central African Republic': 1,
         'Chad': 1,
         'Gabon': 1,
         'Gambia': 1,
         'Ghana': 1,
         'Guinea': 1,
         'Guinea-Bissau': 1,
         'Ivory Coast': 1,
         'Kenya': 1,
         'Lesotho': 1,
         'Liberia': 1,
         'Malawi': 1,
         'Mali': 1,
         'Mozambique': 1,
         'Niger': 1,
         'Nigeria': 1,
         'Rwanda': 1,
         'Senegal': 1,
         'Sierra Leone': 1,
         'Somalia': 1,
         'South Africa': 1,
         'South Sudan': 1,
         'Swaziland': 1,
         'Tanzania': 1,
         'Togo': 1,
         'Uganda': 1,
         'Zambia': 1,
         'Zimbabwe': 1})

In [19]:
# Adding two Counters sums the counts key by key.
c0+c1

Counter({'Angola': 1,
         'Benin': 1,
         'Burkina Faso': 1,
         'Burundi': 1,
         'Cameroon': 1,
         'Central African Republic': 1,
         'Chad': 1,
         'Gabon': 1,
         'Gambia': 1,
         'Ghana': 1,
         'Guinea': 1,
         'Guinea-Bissau': 1,
         'Ivory Coast': 1,
         'Kenya': 1,
         'Lesotho': 1,
         'Liberia': 1,
         'Malawi': 1,
         'Mali': 1,
         'Mozambique': 1,
         'Niger': 1,
         'Nigeria': 1,
         'Rwanda': 1,
         'Senegal': 1,
         'Sierra Leone': 1,
         'Somalia': 1,
         'South Africa': 1,
         'South Sudan': 1,
         'Swaziland': 1,
         'Tanzania': 1,
         'Togo': 1,
         'Uganda': 1,
         'Venezuela': 1,
         'Zambia': 1,
         'Zimbabwe': 1})

In [20]:
# Sum the per-page country tallies over all containers. Every entry of a
# page's `countries` list is counted, then the totals are shown in descending
# order.
country_counter = Counter()
for container in c:
    country_counter += Counter(container.countries)
country_counter.most_common()

[('Georgia', 66),
 ('India', 57),
 ('Russia', 49),
 ('Brazil', 37),
 ('New Zealand', 34),
 ('Australia', 34),
 ('United States', 33),
 ('Peru', 32),
 ('Canada', 31),
 ('Colombia', 31),
 ('China', 30),
 ('Venezuela', 29),
 ('Tanzania', 29),
 ('Ecuador', 26),
 ('Argentina', 26),
 ('Bolivia', 24),
 ('Japan', 24),
 ('Kenya', 24),
 ('South Africa', 22),
 ('Mexico', 22),
 ('Philippines', 20),
 ('Nigeria', 20),
 ('Chile', 20),
 ('Angola', 20),
 ('Uganda', 18),
 ('Turkey', 17),
 ('Iran', 15),
 ('Ethiopia', 15),
 ('Spain', 14),
 ('Indonesia', 14),
 ('Germany', 14),
 ('Mozambique', 14),
 ('Bangladesh', 13),
 ('Zambia', 13),
 ('France', 13),
 ('Sri Lanka', 12),
 ('Cameroon', 12),
 ('Thailand', 11),
 ('Rwanda', 11),
 ('Nepal', 11),
 ('Malawi', 11),
 ('Namibia', 11),
 ('Guinea', 10),
 ('Guyana', 10),
 ('French Guiana', 10),
 ('Costa Rica', 10),
 ('Israel', 10),
 ('Senegal', 9),
 ('Mongolia', 9),
 ('Gabon', 9),
 ('Falkland Islands', 9),
 ('Sweden', 9),
 ('Greece', 9),
 ('Panama', 9),
 ('Madagascar',

In [26]:
import urllib
import simplejson
import folium
import time

googleGeocodeUrl = 'http://maps.googleapis.com/maps/api/geocode/json?'

def get_coordinates(query, from_sensor=False):
    """Geocode a place name through the geocoding web service.

    Args:
        query: place name (str) to look up; sent as the ``address`` parameter.
        from_sensor: sent as the ``sensor`` parameter ("true"/"false").

    Returns:
        A ``(latitude, longitude)`` tuple taken from the first result, or
        ``(None, None)`` when the response contains no results.
    """
    # A bare ``import urllib`` does not guarantee that the ``parse`` and
    # ``request`` submodules are loaded, so import them explicitly here.
    import urllib.parse
    import urllib.request

    # NOTE(review): the service may require HTTPS and an API key; without them
    # it presumably answers with an empty result list - confirm.
    params = {
        'address': query.encode('utf-8'),
        'sensor': "true" if from_sensor else "false"
    }
    url = googleGeocodeUrl + urllib.parse.urlencode(params)
    # The context manager closes the HTTP response even if parsing fails.
    with urllib.request.urlopen(url) as json_response:
        response = simplejson.loads(json_response.read())
    # `.get` tolerates a payload without a 'results' key.
    results = response.get('results')
    if results:
        location = results[0]['geometry']['location']
        return location['lat'], location['lng']
    return None, None

In [29]:
# Geocode the 15 most frequently counted countries, keeping only the ones for
# which coordinates were returned.
locations = [country for country, _count in country_counter.most_common(15)]
most_coords = []
for country in locations:
    la, lo = get_coordinates(country, from_sensor=True)
    # Short pause between consecutive requests.
    time.sleep(0.1)
    if la is not None:
        most_coords.append((la, lo))
print(most_coords)
print(len(most_coords))

[(32.1656221, -82.9000751), (20.593684, 78.96288), (61.52401, 105.318756), (-14.235004, -51.92528), (-40.900557, 174.885971), (-25.274398, 133.775136), (37.09024, -95.712891), (-9.189967, -75.015152), (56.130366, -106.346771), (4.570868, -74.297333), (35.86166, 104.195397), (6.42375, -66.58973), (-6.369028, 34.888822), (-1.831239, -78.18340599999999), (-38.416097, -63.61667199999999)]
15


In [30]:
# Draw one small circle marker per coordinate pair on a world map centred on
# (0, 0); the map is the cell's displayed value.
START = [0, 0]
coords = most_coords

map_animals = folium.Map(location=START, zoom_start=0, tiles='cartodbpositron', width=640, height=480)

for point in coords:
    marker = folium.CircleMarker(point, radius=1, color='#0080bb', fill_color='#0080bb')
    marker.add_to(map_animals)

map_animals

### Grupowanie ptaków na podstawie siedlisk 

Podobieństwo między dwoma gatunkami ptaków liczymy z poniższego wzoru (liczba lokalizacji wspólnych dla obu gatunków podzielona przez liczbę lokalizacji, w których występuje co najmniej jeden z nich — indeks Jaccarda).
$$ sim(x^i,x^j)=\frac{|\{k : x_k^i = x_k^j = 1\}|}{|\{k : x_k^i = 1 \lor x_k^j = 1\}|}$$

In [None]:
def similarity(b1,b2):
    """Return the Jaccard similarity of the country sets of two birds.

    Args:
        b1, b2: objects exposing a ``countries`` iterable of country names.

    Returns:
        ``|A & B| / |A | B|`` where A and B are the sets of distinct countries
        of ``b1`` and ``b2``; 0 when neither bird has any country.
    """
    countries_1, countries_2 = set(b1.countries), set(b2.countries)
    locations_sum = len(countries_1 | countries_2)
    # The intersection must differ from the union; otherwise every pair with
    # at least one country would score exactly 1.
    locations_intersection = len(countries_1 & countries_2)
    if locations_sum > 0:
        return locations_intersection/locations_sum
    else:
        return 0

Za podobieństwo pomiędzy grupami przyjmujemy minimalne podobieństwo między dwoma gatunkami należącymi do oddzielnych grup.
$$ sim(G^I,G^M)=\min\{sim(x^i,x^j):x^i \in G^I,x^j \in G^M \}$$

In [None]:
def groupSimilarity(g1, g2):
    """Return the smallest pairwise similarity between members of two groups.

    Every element of ``g1`` is compared with every element of ``g2`` via
    ``similarity``. The scan stops as soon as the running minimum reaches 0.
    When no pair exists (an empty group) the result is 1.
    """
    lowest = 1

    for left in g1:
        for right in g2:
            lowest = min(lowest, similarity(left, right))
            if lowest == 0:
                # Cannot get any lower - stop early.
                return lowest

    return lowest

W następnym kroku sprawdzamy, czy istnieją grupy podobne do siebie; jeśli tak, wybieramy te o największym podobieństwie i łączymy je ze sobą.
$$ G^I,G^M=\arg\max\{sim(G^I,G^M):G^I \in G,G^M \in G, G^I\neq G^M \}$$

$$sim(G^I,G^M) > 0$$

In [None]:
def similarities(groups):
    """Repeatedly merge the two most similar groups until none are similar.

    On every pass the pair of distinct groups with the highest
    ``groupSimilarity`` is located (the first such pair wins ties). If that
    similarity is greater than 0, both groups are removed and their
    concatenation is appended to the end of the list. The process stops when
    no pair has a similarity above 0.

    Args:
        groups: list of groups (each a list of elements); modified in place.

    Returns:
        The same ``groups`` list object after all merges.
    """
    # Iterative rather than recursive: the previous version added one stack
    # frame per merge, so a long list of singleton groups could exhaust the
    # recursion limit.
    while True:
        best_similarity, best_i, best_j = 0, None, None
        group_count = len(groups)
        for i in range(group_count):
            # groupSimilarity is symmetric, so each unordered pair is scored
            # once; groups are told apart by position, not by list equality.
            for j in range(i + 1, group_count):
                current = groupSimilarity(groups[i], groups[j])
                if best_similarity < current:
                    best_similarity, best_i, best_j = current, i, j

        if best_i is None:
            return groups

        merged = groups[best_i] + groups[best_j]
        # Delete the higher index first so the lower one stays valid.
        del groups[best_j]
        del groups[best_i]
        groups.append(merged)

# Start with one singleton group per bird (first 75 containers), merge similar
# groups, then print the members and the ten most counted countries of each
# resulting group.
groups = [[bird] for bird in c[0:75]]
groups = similarities(groups)
for group in groups:
    locations = Counter()
    print("*****GRUPA*****")
    print("Zwierzęta:")
    for member in group:
        print(member.page.title)
        locations += Counter(member.countries)
    print("***Kraje:***")
    print(locations.most_common(10))

In [None]:
# For every group, collect the names of its ten most counted countries.
locations = []
for group in groups:
    tally = Counter()
    for member in group:
        tally += Counter(member.countries)
    top_countries = [name for name, _count in tally.most_common(10)]
    locations.append(top_countries)
    print(top_countries)

In [None]:
import time

# Geocode the top countries of every group; countries for which no
# coordinates are returned are left out.
group_coords = []
for country_names in locations:
    coords = []
    for country in country_names:
        la, lo = get_coordinates(country, from_sensor=True)
        # Short pause between consecutive requests.
        time.sleep(0.1)
        if la is not None:
            coords.append((la, lo))
    group_coords.append(coords)
    print(coords)

In [None]:
# Map of the geocoded countries of the group at index 6.
START = [0, 0]
coords = group_coords[6]

map_animals = folium.Map(location=START, zoom_start=0, tiles='cartodbpositron', width=640, height=480)

for point in coords:
    marker = folium.CircleMarker(point, radius=1, color='#0080bb', fill_color='#0080bb')
    marker.add_to(map_animals)

map_animals

In [None]:
# Map of the geocoded countries of the group at index 2.
START = [0, 0]
coords = group_coords[2]

map_animals = folium.Map(location=START, zoom_start=0, tiles='cartodbpositron', width=640, height=480)

for point in coords:
    marker = folium.CircleMarker(point, radius=1, color='#0080bb', fill_color='#0080bb')
    marker.add_to(map_animals)

map_animals