# Улицы

## выкачивание данных

In [133]:
import overpy
op = overpy.Overpass()

In [134]:
# http://gis.stackexchange.com/questions/178424/overpass-turbo-area-code-lookup
AREA = "3600060189"  # Russia

In [None]:
r = op.query('''
area({})->.searchArea;
(
    way
    ["name"]["highway"]
    (area.searchArea);
);
out;
'''.format(AREA))

In [None]:
import cPickle
with open('ways_ru.pcl', 'wb') as pcl:
    cPickle.dump(r, pcl)

## загрузка данных

In [15]:
import cPickle
with open('ways_ru.pcl', 'rb') as pcl:
    r = cPickle.load(pcl)

In [16]:
r.ways[1].tags

{'highway': 'tertiary',
 'name': u'\u041a\u043e\u043c\u0441\u043e\u043c\u043e\u043b\u044c\u0441\u043a\u0430\u044f \u0443\u043b\u0438\u0446\u0430',
 'name:en': "Komsomol'skaja street",
 'name:ru': u'\u041a\u043e\u043c\u0441\u043e\u043c\u043e\u043b\u044c\u0441\u043a\u0430\u044f \u0443\u043b\u0438\u0446\u0430',
 'postal_code': '183038',
 'surface': 'asphalt'}

In [17]:
import pandas as pd

df = pd.DataFrame(
    data=[w.tags for w in r.ways],
    index=[w.id for w in r.ways],
    columns=['name', 'highway', 'name:en', 'name:de']
)
#, 'highway', 'name:en', 'name:de']
df.head()

Unnamed: 0,name,highway,name:en,name:de
4078548,Шепетовская улица,residential,,
4391345,Комсомольская улица,tertiary,Komsomol'skaja street,
4391346,проспект Ленина,primary,Lenina avenue,
4391347,улица Полярные Зори,secondary,Poljarnye Zori street,
4391349,улица Книповича,secondary,Knipovicha street,


In [18]:
len(df)

668675

In [19]:
streets_of = df[
    df.name.str.startswith(u'улица')
    | df.name.str.startswith(u'проспект')
].copy()

In [20]:
import pymorphy2
from functools32 import lru_cache

morph_analyzer = pymorphy2.MorphAnalyzer()

@lru_cache()
def street_to_normalized_surname(s):
    inflected = s.split()[1:]
    morph = [morph_analyzer.parse(word) for word in inflected]
    normalized = []
    for m in morph:
        if not any('NOUN' in form.tag.grammemes for form in m):
            return None
        if not any(form.is_known for form in m):  # the word has not occurred on OpenCorpora, probably a rare surname
            normalized.append(m[0].normalized.word)
            continue
        for form in m:
            if form.is_known and form.tag.grammemes.intersection({'Name', 'Surn'}) and form.tag.case == 'gent':
                normalized.append(form.normal_form)
                break
        else:
            return None
    return ' '.join(normalized) if len(normalized) else None

streets_of['norm_surname'] = streets_of.name.map(street_to_normalized_surname)

In [21]:
streets_of['of'] = streets_of.name.str.rpartition(' ')[2]

In [None]:
len(streets_of)

In [None]:
streets_of.head()

In [None]:
streets_of.norm_surname.value_counts().head(15)

In [None]:
streets_of.norm_surname.value_counts().tail(15)

In [None]:
streets_of.norm_surname.value_counts().iloc[2000:2015]

# Места

## выкачивание данных

In [None]:
import overpy
op = overpy.Overpass()

In [None]:
# http://gis.stackexchange.com/questions/178424/overpass-turbo-area-code-lookup
AREA = "3600060189"  # Russia

In [None]:
r = op.query('''
area({})->.searchArea;
(
    node
    ["name"]["place"]
    (area.searchArea);
);
out;
'''.format(AREA))

In [None]:
import cPickle
with open('places_ru.pcl', 'wb') as pcl:
    cPickle.dump(r, pcl)

## загрузка данных

In [22]:
import cPickle
with open('places_ru.pcl', 'rb') as pcl:
    r = cPickle.load(pcl)

In [23]:
r.nodes[1].tags

{'addr:country': 'RU',
 'addr:district': u'\u041a\u0430\u043d\u0434\u0430\u043b\u0430\u043a\u0448\u0441\u043a\u0438\u0439 \u0440\u0430\u0439\u043e\u043d',
 'addr:postcode': '184041',
 'addr:region': u'\u041c\u0443\u0440\u043c\u0430\u043d\u0441\u043a\u0430\u044f \u043e\u0431\u043b\u0430\u0441\u0442\u044c',
 'int_name': 'Kandalaksha',
 'name': u'\u041a\u0430\u043d\u0434\u0430\u043b\u0430\u043a\u0448\u0430',
 'name:de': 'Kandalakscha',
 'name:en': 'Kandalaksha',
 'name:fi': 'Kantalahti',
 'name:fr': 'Kandalakcha',
 'name:lt': u'Kandalak\u0161a',
 'name:ru': u'\u041a\u0430\u043d\u0434\u0430\u043b\u0430\u043a\u0448\u0430',
 'old_name:sv': 'Kandalaksa',
 'place': 'town',
 'population': '36600',
 'wikipedia': u'ru:\u041a\u0430\u043d\u0434\u0430\u043b\u0430\u043a\u0448\u0430'}

In [114]:
import pandas as pd

def parse_int(line):
    try:
        return int(line.replace(' ', '').replace('~', ''))
    except:
        return 0

places = pd.DataFrame(
    data=[w.tags for w in r.nodes],
    index=[w.id for w in r.nodes],
    columns=['name', 'place', 'population']
)
places.population = places.population.map(parse_int)
places.sort_values("population", ascending=False, inplace=True)
places.head(10)

Unnamed: 0,name,place,population
36966065,Asia,continent,4000000000
424314830,Россия,country,143142000
915341178,Учкент,village,38003921
1686293227,Москва,city,12330126
27490597,Санкт-Петербург,city,5225690
2050243275,Санкт-Петербург,state,5000000
648753998,Воронежская область,state,2328959
27503928,Новосибирск,city,1584138
27503892,Екатеринбург,city,1500394
27505889,Нижний Новгород,city,1267760


In [115]:
places.place.value_counts()

hamlet               109493
village               37090
locality              17125
suburb                 3544
town                   2435
allotments             1661
neighbourhood          1634
isolated_dwelling      1572
city                    195
island                  188
quarter                 164
state                    87
islet                    51
subdistrict              47
archipelago              27
square                   19
farm                     18
islands                   4
allotments_set            3
mountain_range            3
region                    2
historic                  2
peninsula                 2
1-й квартал               1
county                    1
sea                       1
continent                 1
post_box                  1
country                   1
Name: place, dtype: int64

# Фамилии

In [None]:
import bs4
import requests

In [None]:
raw_surnames = []
for i in range(1, 36):
    page = requests.get('http://allfamilii.narod.ru/{}.htm'.format(i))
    parser = bs4.BeautifulSoup(page.text)
    text = parser.select_one('fieldset p').text
    raw_surnames.extend(text.split(', '))

In [None]:
surnames = pd.DataFrame(
    pd.Series(raw_surnames)
        .str.strip()
        .str.strip(',')
        .str.lower()
        .str.replace(u'ё', u'е')
        .sort_values().unique()[2:],
    columns=['surname'],
)

In [90]:
import marisa_trie
trie = marisa_trie.Trie(iter(surnames.surname))

import Stemmer
stemmer = Stemmer.Stemmer('russian')

@lru_cache()
def similar_surnames(word):
    if word is None:
        return None
    stem = stemmer.stemWord(word)
    keys = trie.keys(unicode(stem))
    return ' '.join(keys) if len(keys) else None

In [116]:
places['matched'] = places.name.str.lower().map(similar_surnames)

In [117]:
places['stem'] = places.name.map(stemmer.stemWord)

In [233]:
places[places.matched.notnull()][places.population > 10000].iloc[1:15]

  if __name__ == '__main__':


Unnamed: 0,name,place,population,matched,stem,exact
27504067,Казань,city,1205651,казанцев,Казан,False
27503945,Самара,city,1171820,самарджич,Самар,False
113548786,Киров,city,493336,кириенко кириллин кирай кирос,Кир,False
148901125,Иваново,city,409285,иванов иванова ивановский,Иванов,False
246459003,Калуга,city,342936,калугин,Калуг,False
191771467,Шахты,city,237233,шахтахтинский,Шахт,False
191849717,Королёв,city,220947,королёв,Королёв,True
191657470,Абакан,city,176212,абаканович,Абака,False
207721468,Адлер,suburb,115000,адлер,Адлер,True
191884653,Муром,city,110746,мурата мурзин,Мур,False


In [119]:
places['exact'] = places.name.str.lower().map(unicode).map(trie.__contains__)

In [235]:
places[places.exact][['name', 'place', 'population']].iloc[:15]

Unnamed: 0,name,place,population
191849717,Королёв,city,220947
207721468,Адлер,suburb,115000
27024680,Пушкин,city,100753
82739207,Чайковский,town,83202
93387393,Александров,town,60205
296362274,Яблоновский,town,32039
1921466406,Гагарин,town,29916
310354163,Богданович,town,29421
256361852,Рошаль,town,21167
286214595,Абаза,town,16009


In [98]:
len(places)

175372

## Фамилии/имена из Wikidata

In [2]:
import SPARQLWrapper

In [121]:
def load_wikidata(prop):
    sparql = SPARQLWrapper.SPARQLWrapper("http://query.wikidata.org/sparql")
    
    sparql.setQuery("""
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT ?surname ?surnameLabel ?count
WHERE
{
  {
    SELECT ?surname (COUNT(?person) AS ?count) WHERE {
      ?person wdt:P31 wd:Q5.
      ?person wdt:%s ?surname.
    }
    GROUP BY ?surname
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "ru". }
}
ORDER BY DESC(?count)
LIMIT 100
    """ % (prop,))

    sparql.setReturnFormat(SPARQLWrapper.JSON)
    resp = sparql.query().convert()
    
def unpack_values(wd_resp, what):  
    import re
    return pd.DataFrame([
        {
            "uri": entry["surname"]["value"],
            what: entry["surnameLabel"]["value"],
            "count": entry["count"]["value"],
        }
        for entry in surnames_resp["results"]["bindings"]
        if re.match(ur"^[А-Яа-яЁё-]+$", entry["surnameLabel"]["value"])
    ])

In [70]:
def load_surnames():
    return unpack_values(load_wikidata("P734"), "surname")

def load_names():
    return unpack_values(load_wikidata("P735"), "name")

In [122]:
surnames = load_surnames()
surnames.surname = surnames.surname.str.lower().str.replace("ё", "е")

In [132]:
surnames.sample(15)

Unnamed: 0,count,surname,uri
342,47,филипп,http://www.wikidata.org/entity/Q18070769
913,3,бокк,http://www.wikidata.org/entity/Q889117
258,83,дженкинс,http://www.wikidata.org/entity/Q170455
1699,1,хагеманн,http://www.wikidata.org/entity/Q1568341
2347,1,абельская,http://www.wikidata.org/entity/Q25119316
675,7,кэррадайн,http://www.wikidata.org/entity/Q21482744
724,6,аксель,http://www.wikidata.org/entity/Q18404247
1598,1,дегенкольб,http://www.wikidata.org/entity/Q1182849
1449,1,мандельштам,http://www.wikidata.org/entity/Q682519
531,13,янагита,http://www.wikidata.org/entity/Q8048230


In [77]:
load_names()

EndPointInternalError: EndPointInternalError: endpoint returned code 500 and response. 

Response:
SPARQL-QUERY: queryStr=
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT ?surname ?surnameLabel ?count
WHERE
{
  {
    SELECT ?surname (COUNT(?person) AS ?count) WHERE {
      ?person wdt:P31 wd:Q5.
      ?person wdt:P735 ?surname.
    }
    GROUP BY ?surname
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "ru". }
}
ORDER BY DESC(?count)
LIMIT 100
    
java.util.concurrent.ExecutionException: java.util.concurrent.ExecutionException: org.openrdf.query.QueryInterruptedException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: com.bigdata.bop.engine.QueryTimeoutException: Query deadline is expired.
	at java.util.concurrent.FutureTask.report(FutureTask.java:122)
	at java.util.concurrent.FutureTask.get(FutureTask.java:192)
	at com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:281)
	at com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:653)
	at com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:288)
	at com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)
	at com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:271)
	at javax.servlet.http.HttpServlet.service(HttpServlet.java:687)
	at javax.servlet.http.HttpServlet.service(HttpServlet.java:790)
	at org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:808)
	at org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:587)
	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:143)
	at org.eclipse.jetty.security.SecurityHandler.handle(SecurityHandler.java:577)
	at org.eclipse.jetty.server.session.SessionHandler.doHandle(SessionHandler.java:223)
	at org.eclipse.jetty.server.handler.ContextHandler.doHandle(ContextHandler.java:1127)
	at org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:515)
	at org.eclipse.jetty.server.session.SessionHandler.doScope(SessionHandler.java:185)
	at org.eclipse.jetty.server.handler.ContextHandler.doScope(ContextHandler.java:1061)
	at org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)
	at org.eclipse.jetty.server.handler.ContextHandlerCollection.handle(ContextHandlerCollection.java:215)
	at org.eclipse.jetty.server.handler.HandlerCollection.handle(HandlerCollection.java:110)
	at org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:97)
	at org.eclipse.jetty.server.Server.handle(Server.java:497)
	at org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:310)
	at org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:257)
	at org.eclipse.jetty.io.AbstractConnection$2.run(AbstractConnection.java:540)
	at org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
	at org.eclipse.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.util.concurrent.ExecutionException: org.openrdf.query.QueryInterruptedException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: com.bigdata.bop.engine.QueryTimeoutException: Query deadline is expired.
	at java.util.concurrent.FutureTask.report(FutureTask.java:122)
	at java.util.concurrent.FutureTask.get(FutureTask.java:192)
	at com.bigdata.rdf.sail.webapp.QueryServlet$SparqlQueryTask.call(QueryServlet.java:864)
	at com.bigdata.rdf.sail.webapp.QueryServlet$SparqlQueryTask.call(QueryServlet.java:670)
	at com.bigdata.rdf.task.ApiTaskForIndexManager.call(ApiTaskForIndexManager.java:68)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more
Caused by: org.openrdf.query.QueryInterruptedException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: com.bigdata.bop.engine.QueryTimeoutException: Query deadline is expired.
	at com.bigdata.rdf.sail.Bigdata2Sesame2BindingSetIterator.hasNext(Bigdata2Sesame2BindingSetIterator.java:184)
	at info.aduna.iteration.IterationWrapper.hasNext(IterationWrapper.java:68)
	at org.openrdf.query.QueryResults.report(QueryResults.java:155)
	at org.openrdf.repository.sail.SailTupleQuery.evaluate(SailTupleQuery.java:76)
	at com.bigdata.rdf.sail.webapp.BigdataRDFContext$TupleQueryTask.doQuery(BigdataRDFContext.java:1713)
	at com.bigdata.rdf.sail.webapp.BigdataRDFContext$AbstractQueryTask.innerCall(BigdataRDFContext.java:1569)
	at com.bigdata.rdf.sail.webapp.BigdataRDFContext$AbstractQueryTask.call(BigdataRDFContext.java:1534)
	at com.bigdata.rdf.sail.webapp.BigdataRDFContext$AbstractQueryTask.call(BigdataRDFContext.java:747)
	... 4 more
Caused by: java.lang.RuntimeException: java.util.concurrent.ExecutionException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: com.bigdata.bop.engine.QueryTimeoutException: Query deadline is expired.
	at com.bigdata.relation.accesspath.BlockingBuffer$BlockingIterator.checkFuture(BlockingBuffer.java:1484)
	at com.bigdata.relation.accesspath.BlockingBuffer$BlockingIterator._hasNext(BlockingBuffer.java:1710)
	at com.bigdata.relation.accesspath.BlockingBuffer$BlockingIterator.hasNext(BlockingBuffer.java:1563)
	at com.bigdata.striterator.AbstractChunkedResolverator._hasNext(AbstractChunkedResolverator.java:365)
	at com.bigdata.striterator.AbstractChunkedResolverator.hasNext(AbstractChunkedResolverator.java:341)
	at com.bigdata.rdf.sail.Bigdata2Sesame2BindingSetIterator.hasNext(Bigdata2Sesame2BindingSetIterator.java:134)
	... 11 more
Caused by: java.util.concurrent.ExecutionException: java.lang.RuntimeException: java.util.concurrent.ExecutionException: com.bigdata.bop.engine.QueryTimeoutException: Query deadline is expired.
	at java.util.concurrent.FutureTask.report(FutureTask.java:122)
	at java.util.concurrent.FutureTask.get(FutureTask.java:192)
	at com.bigdata.relation.accesspath.BlockingBuffer$BlockingIterator.checkFuture(BlockingBuffer.java:1454)
	... 16 more
Caused by: java.lang.RuntimeException: java.util.concurrent.ExecutionException: com.bigdata.bop.engine.QueryTimeoutException: Query deadline is expired.
	at com.bigdata.rdf.sail.RunningQueryCloseableIterator.checkFuture(RunningQueryCloseableIterator.java:59)
	at com.bigdata.rdf.sail.RunningQueryCloseableIterator.close(RunningQueryCloseableIterator.java:73)
	at com.bigdata.rdf.sail.RunningQueryCloseableIterator.hasNext(RunningQueryCloseableIterator.java:82)
	at com.bigdata.striterator.ChunkedWrappedIterator.hasNext(ChunkedWrappedIterator.java:197)
	at com.bigdata.striterator.AbstractChunkedResolverator$ChunkConsumerTask.call(AbstractChunkedResolverator.java:222)
	at com.bigdata.striterator.AbstractChunkedResolverator$ChunkConsumerTask.call(AbstractChunkedResolverator.java:197)
	... 4 more
Caused by: java.util.concurrent.ExecutionException: com.bigdata.bop.engine.QueryTimeoutException: Query deadline is expired.
	at com.bigdata.util.concurrent.Haltable.get(Haltable.java:273)
	at com.bigdata.bop.engine.AbstractRunningQuery.get(AbstractRunningQuery.java:1516)
	at com.bigdata.bop.engine.AbstractRunningQuery.get(AbstractRunningQuery.java:104)
	at com.bigdata.rdf.sail.RunningQueryCloseableIterator.checkFuture(RunningQueryCloseableIterator.java:46)
	... 9 more
Caused by: com.bigdata.bop.engine.QueryTimeoutException: Query deadline is expired.
	at com.bigdata.bop.engine.RunState.checkDeadline(RunState.java:832)
	at com.bigdata.bop.engine.AbstractRunningQuery.checkDeadline(AbstractRunningQuery.java:376)
	at com.bigdata.bop.engine.QueryDeadline.checkDeadline(QueryDeadline.java:99)
	at com.bigdata.bop.engine.QueryEngine.checkHeadOfDeadlineQueue(QueryEngine.java:846)
	at com.bigdata.bop.engine.QueryEngine.checkDeadlines(QueryEngine.java:817)
	at com.bigdata.bop.engine.QueryEngine.access$100(QueryEngine.java:212)
	at com.bigdata.bop.engine.QueryEngine$QueryEngineTask.run(QueryEngine.java:1088)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at com.bigdata.concurrent.FutureTaskMon.run(FutureTaskMon.java:63)
	... 3 more


## Горы и реки

In [133]:
import overpy
op = overpy.Overpass()

In [134]:
# http://gis.stackexchange.com/questions/178424/overpass-turbo-area-code-lookup
AREA = "3600060189"  # Russia

In [225]:
r = op.query('''
area({})->.searchArea;
(
    node
    ["name"]["natural"~"peak|volcano|ridge"]
    (area.searchArea);
);
out;
'''.format(AREA))

In [226]:
import cPickle
with open('naturals_ru.pcl', 'wb') as pcl:
    cPickle.dump(r, pcl)

## загрузка данных

In [15]:
import cPickle
with open('naturals_ru.pcl', 'rb') as pcl:
    r = cPickle.load(pcl)

In [227]:
r.nodes[0].tags

{'ele': '1545',
 'name': u'\u0420\u043e\u043c\u0430\u043d-\u041a\u043e\u0448',
 'name:de': 'Roman Kosch',
 'name:en': 'Roman Kosh',
 'name:eo': u'Romanko\u015do',
 'name:pl': 'Roman-Kosz',
 'natural': 'peak',
 'summit:cross': 'yes',
 'wikipedia': u'ru:\u0420\u043e\u043c\u0430\u043d-\u041a\u043e\u0448'}

In [162]:
def cut_prefix(prefix, line=None):
    def cutter(line):
        if line.startswith(prefix):
            return line[len(prefix):]
        return line
    return cutter if line is None else cutter(line)

def cut_suffix(prefix, line=None):
    def cutter(line):
        if line.endswith(prefix):
            return line[:len(prefix)]
        return line
    return cutter if line is None else cutter(line)

In [229]:
naturals = pd.DataFrame(
    data=[w.tags for w in r.ways + r.nodes],
    index=[w.id for w in r.ways + r.nodes],
    columns=['name', 'natural']
)

In [220]:
words = []
for line in naturals.name:
    for w in re.split('ur[А-Яа-яЁё-]+', line):
        if w:
            words.append(w)

words[:5]

[u'\u0420\u043e\u043c\u0430\u043d-\u041a\u043e\u0448',
 u'\u043c\u044b\u0441 \u041b\u0438\u043d\u0435\u0439\u043d\u044b\u0439',
 u'\u0438\u0441\u0442\u043e\u043a \u0440\u0435\u043a\u0438 \u0414\u043d\u0435\u043f\u0440',
 u'\u0418\u0441\u0442\u043e\u043a \u0412\u043e\u043b\u0433\u0438',
 u'\u0421\u044b\u0442\u044b\u0439 \u043c\u044b\u0441']

In [221]:
pd.Series(words).value_counts()

Родник                   293
террикон                 109
Лысая                     69
Тополь                    65
Грот                      43
Святой источник           42
Каменная                  39
Острая                    38
Ель                       34
родник                    33
Сезонный (??)             33
Маяк                      32
ск                        31
Белая                     29
5                         26
Источник                  26
берёза осина              25
грот                      23
Дуб                       22
Берёзовая                 22
Колодец                   22
Медвежья                  21
Высокая                   20
Плоская                   19
Крутая                    18
Террикон                  18
Липовая                   18
Круглая                   17
Баджальский хребет        17
Голая                     16
                        ... 
мыс Анциферова             1
3052                       1
бухта Художника            1
Зенгур        

In [230]:
naturals.name = (
    naturals.name
    .str.strip()
    .str.lower()
    .map(cut_prefix(u"мыс ")).map(cut_suffix(u" мыс"))
    .map(cut_prefix(u"гора ")).map(cut_suffix(u" гора"))
    .map(cut_prefix(u"г."))
    .str.strip()
)
naturals = naturals[naturals.name.str.match(u'^[а-яё-]+$')]
naturals.head(50)

Unnamed: 0,name,natural
26864474,роман-кош,peak
265814522,айкуайвенчорр,peak
281891594,опала,volcano
281893797,алаид,volcano
281895277,броутона,peak
281895627,менделеева,volcano
281896663,тятя,volcano
281907079,кетой,volcano
281910771,сарычев,volcano
281913843,райкоке,volcano


In [None]:
sparql = SPARQLWrapper.SPARQLWrapper("http://query.wikidata.org/sparql")

sparql.setQuery("""
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT ?person ?personLabel
WHERE
{
  ?person wdt:P31 wd:Q5.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "ru". }
}
LIMIT 100
""")

sparql.setReturnFormat(SPARQLWrapper.JSON)
resp = sparql.query().convert()