In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

## Collecting data from Wikipedia

### Cities data

In [2]:
# Russian Wikipedia article "Городские населённые пункты Тверской области"
# (urban settlements of Tver Oblast); the title is percent-encoded in the URL.
url = 'https://ru.wikipedia.org/wiki/%D0%93%D0%BE%D1%80%D0%BE%D0%B4%D1%81%D0%BA%D0%B8%D0%B5_%D0%BD%D0%B0%D1%81%D0%B5%D0%BB%D1%91%D0%BD%D0%BD%D1%8B%D0%B5_%D0%BF%D1%83%D0%BD%D0%BA%D1%82%D1%8B_%D0%A2%D0%B2%D0%B5%D1%80%D1%81%D0%BA%D0%BE%D0%B9_%D0%BE%D0%B1%D0%BB%D0%B0%D1%81%D1%82%D0%B8'

In [5]:
# Download the article and collect every <table> element on the page.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error
# page and failing later with an unrelated IndexError on `tables[...]`.
r = requests.get(url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')
tables = soup.find_all('table')

In [13]:
# Flatten the first table on the page into a list of stripped cell texts.
city_cells = tables[0].find_all('td')
cities_list = [td.text.strip() for td in city_cells]

In [53]:
# The flattened cell list is read with a stride of 9 (assumes the table has
# 9 <td> cells per row -- TODO confirm if the article layout changes):
# offset 1 is taken as the settlement name, offset 4 as the population.
cities = cities_list[1::9]
population = cities_list[4::9]
cities_df = pd.DataFrame({'name': cities, 'population': population})

# Clean the population text without relying on fixed character positions:
# first drop bracketed footnote markers such as "[2]" or "[10]", then drop
# every remaining non-digit (leading symbol, non-breaking-space separators).
cities_df['population'] = (
    cities_df['population']
    .str.replace(r'\[[^\]]*\]', '', regex=True)
    .str.replace(r'\D', '', regex=True)
    .astype(int)
)

# we add region name at the end of the settlement name to avoid wrong results from geocoding,
# since there might be several settlements with the same name
cities_df['geocode_name'] = cities_df['name'] + ', Тверская область'

In [44]:
# Preview the first five parsed cities.
cities_df.head(5)

Unnamed: 0,name,population,geocode_name
0,Андреаполь,6801,"Андреаполь, Тверская область"
1,Бежецк,20418,"Бежецк, Тверская область"
2,Белый,3090,"Белый, Тверская область"
3,Бологое,20498,"Бологое, Тверская область"
4,Весьегонск,6016,"Весьегонск, Тверская область"


### Other urban settlements data

In [46]:
# Flatten the second table on the page into a list of stripped cell texts.
pgt_cells = tables[1].find_all('td')
pgt_list = [td.text.strip() for td in pgt_cells]

In [108]:
# The flattened cell list is read with a stride of 9 (assumes the table has
# 9 <td> cells per row -- TODO confirm if the article layout changes):
# offset 1 is taken as the settlement name, offset 2 as the municipality
# name and offset 4 as the population.
pgt = pgt_list[1::9]
mun_name = pgt_list[2::9]
population = pgt_list[4::9]
pgt_df = pd.DataFrame({'name': pgt, 'mun_name': mun_name, 'population': population})

# Clean the population text without relying on fixed character positions:
# first drop bracketed footnote markers such as "[2]" or "[10]", then drop
# every remaining non-digit (leading symbol, non-breaking-space separators).
pgt_df['population'] = (
    pgt_df['population']
    .str.replace(r'\[[^\]]*\]', '', regex=True)
    .str.replace(r'\D', '', regex=True)
    .astype(int)
)

# The geocoding query includes the municipality and the region so that
# settlements sharing a name can be told apart.
pgt_df['geocode_name'] = pgt_df['name'] + ', ' + pgt_df['mun_name'] + ', Тверская область'
pgt_df = pgt_df.drop(columns='mun_name')

In [109]:
# Preview the first five parsed urban-type settlements.
pgt_df.head(5)

Unnamed: 0,name,population,geocode_name
0,Белый Городок,1867,"Белый Городок, Кимрский мун.район, Тверская об..."
1,Васильевский Мох,2150,"Васильевский Мох, Калининский мун.район, Тверс..."
2,Великооктябрьский,1830,"Великооктябрьский, Фировский мун.район, Тверск..."
3,Жарковский,2940,"Жарковский, Жарковский мун.район, Тверская обл..."
4,Изоплит,1588,"Изоплит, Конаковский мун.район, Тверская область"


## Geocoding

In [110]:
from geopy.geocoders import Nominatim

# Nominatim's usage policy asks for a User-Agent that identifies the calling
# application, so a descriptive value is used instead of a generic placeholder.
# geopy's default timeout is 1 second, which is easy to exceed on the public
# server; a longer timeout avoids spurious GeocoderTimedOut errors.
geocoder = Nominatim(user_agent='tver_urban_settlements_notebook', timeout=10)

In [111]:
# Tag each frame with its settlement type before stacking them.
cities_df['type'] = 'city'
pgt_df['type'] = 'pgt'

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement. ignore_index=True gives the combined frame a unique 0..n-1
# index instead of repeating the labels of both inputs.
settlements = pd.concat([cities_df, pgt_df], ignore_index=True)

In [115]:
from geopy.extra.rate_limiter import RateLimiter

# The public Nominatim server allows at most one request per second. The
# limiter enforces the delay, retries transient failures and, with its default
# settings, returns None instead of raising when the retries are exhausted.
rate_limited_geocode = RateLimiter(geocoder.geocode, min_delay_seconds=1)

address, lats, lons = [], [], []

# Query with the disambiguated `geocode_name` (name + region) that was built
# for this purpose; fall back to the bare name only when that finds nothing.
for query, bare_name in zip(settlements['geocode_name'], settlements['name']):
    location = rate_limited_geocode(query) or rate_limited_geocode(bare_name)
    if location is None:
        # Keep the row aligned with `settlements`; the missing coordinates
        # show up as NaN and can be corrected manually afterwards.
        address.append(None)
        lats.append(None)
        lons.append(None)
        continue
    address.append(location.address)
    lats.append(location.latitude)
    lons.append(location.longitude)

settlements_coords = pd.DataFrame({'name': settlements['name'], 'address': address, 'lat': lats, 'lon': lons})

In [117]:
# Preview the first five geocoding results.
settlements_coords.head(5)

Unnamed: 0,name,address,lat,lon
0,Андреаполь,"Андреаполь, Андреапольский муниципальный округ...",56.646801,32.26532
1,Бежецк,"Бежецк, Бежецкий район, Тверская область, Цент...",57.781357,36.692535
2,Белый,"Белый, городское поселение Белый, Бельский рай...",55.833073,32.939487
3,Бологое,"Бологое, городское поселение Бологое, Бологовс...",57.885204,34.049603
4,Весьегонск,"Весьегонск, Весьегонский муниципальный округ, ...",58.663726,37.26265


In [119]:
# Attach address and coordinates to each settlement by name, then discard the
# helper column that was only needed as the geocoding query.
settlements = (
    settlements
    .merge(settlements_coords, on='name')
    .drop(columns=['geocode_name'])
)

While visually checking the validity of the received coordinates, one wrong point was spotted. It is corrected manually here.

In [123]:
# Overwrite the coordinates of the row(s) named 'Пено' with hand-picked values.
is_peno = settlements['name'] == 'Пено'
settlements.loc[is_peno, ['lat', 'lon']] = [56.923, 32.7448]

## Preparing settlements for QGIS

In [66]:
import geopandas as gpd

In [125]:
# Build point geometries from lon/lat (x = longitude, y = latitude) and declare
# them as WGS 84. The CRS is passed as an authority string because the
# {"init": "EPSG:4326"} dict form is deprecated in pyproj 2+ and emits a
# FutureWarning.
settlements_gdf = gpd.GeoDataFrame(
    settlements.drop(columns=['lat', 'lon']),
    geometry=gpd.points_from_xy(settlements['lon'], settlements['lat']),
    crs='EPSG:4326',
)

# Reproject to EPSG:32637 (WGS 84 / UTM zone 37N) for use in QGIS.
settlements_gdf = settlements_gdf.to_crs(epsg=32637)

In [128]:
# Count distinct point geometries as a quick sanity check.
settlements_gdf['geometry'].nunique()

51

In [127]:
# Write the projected settlements to a shapefile with UTF-8 encoded attributes.
shapefile_path = 'urban_settlements.shp'
settlements_gdf.to_file(shapefile_path, encoding='utf-8')