# Улицы

## выкачивание данных

In [3]:
import overpy
# Overpass API client from overpy, constructed with the library defaults.
op = overpy.Overpass()

In [123]:
# Overpass area id of the region to search; lookup instructions:
# http://gis.stackexchange.com/questions/178424/overpass-turbo-area-code-lookup
# NOTE(review): presumably 3600000000 + the OSM relation id (60189) — confirm.
AREA = "3600060189"  # Russia

In [41]:
# Download every way inside AREA that carries both a "name" and a "highway"
# tag, i.e. the named roads and streets.
# NOTE(review): the query sets no [timeout:] / [maxsize:], so the server
# defaults apply — a country-wide request may need them raised.
r = op.query('''
area({})->.searchArea;
(
    way
    ["name"]["highway"]
    (area.searchArea);
);
out;
'''.format(AREA))

In [47]:
import cPickle

# Cache the Overpass response on disk so the slow country-wide download does
# not have to be repeated.  HIGHEST_PROTOCOL selects the binary pickle format;
# the Python 2 default (protocol 0, ASCII) is considerably slower and larger
# for a result with hundreds of thousands of ways.  cPickle.load detects the
# protocol automatically, so the loading cell needs no change.
with open('ways_ru.pcl', 'wb') as pcl:
    cPickle.dump(r, pcl, protocol=cPickle.HIGHEST_PROTOCOL)

## загрузка данных

In [490]:
import cPickle
# Restore the cached Overpass result from 'ways_ru.pcl' instead of querying again.
# NOTE: unpickling can execute code embedded in the file — only load a pickle
# that this notebook produced itself.
with open('ways_ru.pcl', 'rb') as pcl:
    r = cPickle.load(pcl)

In [322]:
# Peek at the tags of one sample way to see which keys are available.
r.ways[1].tags

{'highway': 'tertiary',
 'name': u'\u041a\u043e\u043c\u0441\u043e\u043c\u043e\u043b\u044c\u0441\u043a\u0430\u044f \u0443\u043b\u0438\u0446\u0430',
 'name:en': "Komsomol'skaja street",
 'name:ru': u'\u041a\u043e\u043c\u0441\u043e\u043c\u043e\u043b\u044c\u0441\u043a\u0430\u044f \u0443\u043b\u0438\u0446\u0430',
 'postal_code': '183038',
 'surface': 'asphalt'}

In [491]:
import pandas as pd

# One row per downloaded way, indexed by the way id.  Only the four tags
# listed in `columns` are kept; a tag missing on a way shows up as NaN.
df = pd.DataFrame(
    [way.tags for way in r.ways],
    index=[way.id for way in r.ways],
    columns=['name', 'highway', 'name:en', 'name:de'],
)
df.head()

Unnamed: 0,name,highway,name:en,name:de
4078548,Шепетовская улица,residential,,
4391345,Комсомольская улица,tertiary,Komsomol'skaja street,
4391346,проспект Ленина,primary,Lenina avenue,
4391347,улица Полярные Зори,secondary,Poljarnye Zori street,
4391349,улица Книповича,secondary,Knipovicha street,


In [135]:
# Total number of named highway ways loaded.
len(df)

659126

In [482]:
# Keep only the ways whose name begins with the street type "улица" or
# "проспект".  The .copy() detaches the selection from `df` so that new
# columns can be added to it afterwards.
streets_of = df.loc[
    df['name'].str.startswith(u'улица') | df['name'].str.startswith(u'проспект')
].copy()

In [483]:
import pymorphy2
from functools32 import lru_cache

# Shared pymorphy2 analyzer, created once and used by
# street_to_normalized_surname for every word it inspects.
morph_analyzer = pymorphy2.MorphAnalyzer()

# maxsize=None: the default bound of 128 entries is far smaller than the number
# of distinct street names this function is mapped over, so a bounded cache
# would evict most entries before they could be reused.
@lru_cache(maxsize=None)
def street_to_normalized_surname(s):
    """Guess whom a street is named after and return that name in normal form.

    The first whitespace-separated token of ``s`` (the street type, e.g.
    "улица" or "проспект") is dropped.  Every remaining token is analysed with
    the module-level ``morph_analyzer``:

    * if none of its parses is a noun, the whole street is rejected;
    * if none of its parses is known (the word has not occurred on
      OpenCorpora, so it is probably a rare surname), the normal form of the
      first parse is used;
    * otherwise the first known parse tagged ``Name`` or ``Surn`` in the
      genitive case supplies the normal form; if there is no such parse the
      whole street is rejected.

    :param s: full street name, street type first.
    :return: the normal forms joined by single spaces, or ``None`` when the
        street is rejected or nothing follows the street type.
    """
    normalized = []
    # Words are parsed one at a time so that an early rejection does not pay
    # for analysing the rest of the name.
    for word in s.split()[1:]:
        parses = morph_analyzer.parse(word)
        if not any('NOUN' in form.tag.grammemes for form in parses):
            return None
        if not any(form.is_known for form in parses):
            # Unknown word: fall back to the analyzer's first guess.
            normalized.append(parses[0].normalized.word)
            continue
        for form in parses:
            if (form.is_known
                    and form.tag.grammemes.intersection({'Name', 'Surn'})
                    and form.tag.case == 'gent'):
                normalized.append(form.normal_form)
                break
        else:
            # Known word, but never a first name/surname in the genitive.
            return None
    return ' '.join(normalized) if normalized else None

# Normalized person name for each street (None where the heuristic rejects the name).
streets_of['norm_surname'] = streets_of.name.map(street_to_normalized_surname)

In [484]:
# Last space-separated word of the street name, kept in its inflected form.
streets_of['of'] = streets_of.name.str.rsplit(' ', n=1).str[-1]

In [488]:
# Number of ways whose name starts with "улица" or "проспект".
len(streets_of)

210260

In [485]:
# Spot-check the derived columns on the first few streets.
streets_of.head()

Unnamed: 0,name,norm_surname,of
4391346,проспект Ленина,ленин,Ленина
4391347,улица Полярные Зори,,Зори
4391349,улица Книповича,книп,Книповича
4391353,улица Буркова,бурков,Буркова
4391355,улица Радищева,радищев,Радищева


In [378]:
# The 15 most frequent normalized names.
streets_of.norm_surname.value_counts().head(15)

ленин         9929
гагарин       4067
киров         3755
пушкин        3025
калинин       2922
карл маркс    2541
горький       2215
чапаев        2084
свердлов      1483
маяковский    1474
лермонтов     1466
мичурин       1430
фрунзе        1406
гоголь        1319
некрасов      1207
Name: norm_surname, dtype: int64

In [495]:
# The 15 entries at the bottom of the frequency ranking.
streets_of.norm_surname.value_counts().tail(15)

фаломеев          1
борохов           1
адамоков          1
новоберкасский    1
петруева          1
кочкарка          1
мотов             1
первомайк         1
гаджикаев         1
п.в.кучумов       1
надежда онайко    1
тахиров           1
абдулбасиров      1
бабякина          1
дурмагашево       1
Name: norm_surname, dtype: int64

In [380]:
# A sample from the middle of the frequency ranking (positions 2000-2014);
# the window itself is arbitrary.
streets_of.norm_surname.value_counts().iloc[2000:2015]

надежда суслов    6
батыршина         6
кайгородов        6
грибанов          6
николай чаплин    6
гарнаева          6
мезенцев          6
самаренкин        6
уметбаев          6
корнейчук         6
дронов            6
ачкасова          6
кирьяновый        6
горин             6
мартемьянов       6
Name: norm_surname, dtype: int64

# Места

## выкачивание данных

In [25]:
import overpy
# Overpass API client from overpy, constructed with the library defaults.
op = overpy.Overpass()

In [26]:
# Overpass area id of the region to search; lookup instructions:
# http://gis.stackexchange.com/questions/178424/overpass-turbo-area-code-lookup
# NOTE(review): presumably 3600000000 + the OSM relation id (60189) — confirm.
AREA = "3600060189"  # Russia

In [27]:
# Download every node inside AREA that carries both a "name" and a "place"
# tag, i.e. the named settlements and other named places.
# NOTE(review): the query sets no [timeout:] / [maxsize:], so the server
# defaults apply — a country-wide request may need them raised.
r = op.query('''
area({})->.searchArea;
(
    node
    ["name"]["place"]
    (area.searchArea);
);
out;
'''.format(AREA))

In [28]:
import cPickle

# Cache the Overpass response on disk so the download does not have to be
# repeated.  HIGHEST_PROTOCOL selects the binary pickle format; the Python 2
# default (protocol 0, ASCII) is considerably slower and larger for a result
# with this many nodes.  cPickle.load detects the protocol automatically, so
# the loading cell needs no change.
with open('places_ru.pcl', 'wb') as pcl:
    cPickle.dump(r, pcl, protocol=cPickle.HIGHEST_PROTOCOL)

## загрузка данных

In [431]:
import cPickle
# Restore the cached Overpass result from 'places_ru.pcl' instead of querying again.
# NOTE: unpickling can execute code embedded in the file — only load a pickle
# that this notebook produced itself.
with open('places_ru.pcl', 'rb') as pcl:
    r = cPickle.load(pcl)

In [432]:
import pandas as pd

# One row per downloaded node, indexed by the node id, keeping only the
# "name" and "place" tags.
places = pd.DataFrame(
    [node.tags for node in r.nodes],
    index=[node.id for node in r.nodes],
    columns=['name', 'place'],
)
places.head()

Unnamed: 0,name,place
26163795,Невель,town
26878551,Кандалакша,town
26898767,Апатиты,town
26898768,Калевала,town
26898769,Кировск,town


In [433]:
# Distribution of the "place" tag values among the downloaded nodes.
places.place.value_counts()

hamlet               109493
village               37090
locality              17125
suburb                 3544
town                   2435
allotments             1661
neighbourhood          1634
isolated_dwelling      1572
city                    195
island                  188
quarter                 164
state                    87
islet                    51
subdistrict              47
archipelago              27
square                   19
farm                     18
islands                   4
allotments_set            3
mountain_range            3
region                    2
historic                  2
peninsula                 2
1-й квартал               1
county                    1
sea                       1
continent                 1
post_box                  1
country                   1
Name: place, dtype: int64

# Фамилии

In [46]:
import bs4
import requests

In [435]:
# Scrape the surname lists from pages 1.htm .. 35.htm of allfamilii.narod.ru.
raw_surnames = []
for i in range(1, 36):
    page = requests.get('http://allfamilii.narod.ru/{}.htm'.format(i))
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    page.raise_for_status()
    # NOTE(review): no parser is named, so bs4 uses whichever one is installed;
    # pin one explicitly once it is known which parser produced these results.
    parser = bs4.BeautifulSoup(page.text)
    # The surnames are the text of the first <p> inside a <fieldset>,
    # separated by ", ".
    text = parser.select_one('fieldset p').text
    raw_surnames.extend(text.split(', '))

In [486]:
# Clean the scraped strings (trim whitespace and stray commas, lower-case,
# fold "ё" to "е"), then sort and de-duplicate them.
# NOTE(review): the [2:] slice drops the first two sorted values — presumably
# junk entries; confirm against the scraped data.
surnames = pd.DataFrame({
    'surname': (
        pd.Series(raw_surnames)
        .str.strip()
        .str.strip(',')
        .str.lower()
        .str.replace(u'ё', u'е')
        .sort_values()
        .unique()[2:]
    ),
})

In [478]:
import marisa_trie
# Trie over the cleaned surnames; similar_surnames queries it by prefix.
trie = marisa_trie.Trie(iter(surnames.surname))

import Stemmer
# Russian stemmer used to reduce a word to the prefix looked up in the trie.
stemmer = Stemmer.Stemmer('russian')

# maxsize=None: the function is mapped over every place name, far more
# distinct inputs than the default bound of 128 entries can hold.
@lru_cache(maxsize=None)
def similar_surnames(word):
    """Return the known surnames that start with the stem of ``word``.

    ``word`` is stemmed with the module-level ``stemmer`` and the stem is used
    as a prefix query against the module-level ``trie`` of surnames.

    :param word: lower-cased word to look up; ``None`` or an empty string
        yields ``None``.
    :return: the matching surnames joined by single spaces, or ``None`` when
        there is no match.
    """
    if not word:
        return None
    stem = stemmer.stemWord(word)
    if not stem:
        # An empty prefix would match every surname in the trie.
        return None
    keys = trie.keys(unicode(stem))
    return ' '.join(keys) if keys else None

In [496]:
# Lower-case the place names and fold "ё" to "е", the same normalisation that
# was applied to the surname list: the trie contains no "ё", so without the
# folding a place whose stem contains "ё" could never match.
places['matched'] = places.name.str.lower().str.replace(u'ё', u'е').map(similar_surnames)

In [497]:
# Show only the places for which at least one surname shares the stem.
places[places.matched.notnull()]

Unnamed: 0,name,place,matched
26898826,Зеленоборский,town,зеленоборская
26898992,Пунча,hamlet,пунчер
26923442,Анда,village,андриан андрианова андрианович андриановна анд...
26925161,Мезень,town,мезенцева
26951330,Вологда,city,вологдин
27000366,Тверь,city,тверикин тверитин тверской тверье
27023177,Луга,town,лугов луговский луговцев луговинов лугавцов лу...
27024680,Пушкин,city,пушкин
27090043,Курск,city,курскова
27464485,Петровское,village,петровская петровский
