# Инициализация

In [276]:
import pandas as pd
import os
import re
from tqdm import tqdm_notebook

In [2]:
filenames = [filename for filename in os.listdir('testset') if '.txt' in filename and filename != 'list.txt']

In [3]:
df_russian_names = pd.read_csv('./data/russian_names.csv', delimiter=';')
russian_names = df_russian_names['Name'].values

In [4]:
df_russian_surnames = pd.read_csv('./data/russian_surnames.csv', delimiter=';')
russian_surnames = df_russian_surnames['Surname'].values

In [5]:
df_foreign_names = pd.read_csv('./data/foreign_names.csv', delimiter=';')
foreign_names = df_foreign_names['name']

In [96]:
df_cities = pd.read_csv('./data/city.csv', delimiter=';')
cities = df_cities['name'].values

In [97]:
df_regions = pd.read_csv('./data/region.csv', delimiter=';')
regions = df_regions['name'].values

In [98]:
df_coutries = pd.read_csv('./data/country.csv', delimiter=';')
countries = df_coutries['name'].values

# Извлечение организаций

In [121]:
from bs4 import BeautifulSoup
import urllib
import urllib3

In [114]:
pages = ['0'] + [chr(c) for c in range(ord('А'), ord('Я'))] + [chr(c) for c in range(ord('A'), ord('Z'))]

In [201]:
organizations = set()
base_link = 'https://ru.wikipedia.org/w/index.php?title=%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:%D0%9A%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D0%B8_%D0%BF%D0%BE_%D0%B0%D0%BB%D1%84%D0%B0%D0%B2%D0%B8%D1%82%D1%83'

for page in tqdm_notebook(pages):
    link = base_link + '&from=' + urllib.parse.quote(page)
    f = urllib.request.urlopen(link)
    html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    divs = soup.select('div div ul li a')
    for div in divs:
        organizations.add(div.text)

100%|██████████████████████████████████████████████████████████████████████████████████| 57/57 [00:40<00:00,  1.41it/s]


In [202]:
base_link = 'https://ru.wikipedia.org/w/index.php?title=%D0%9A%D0%B0%D1%82%D0%B5%D0%B3%D0%BE%D1%80%D0%B8%D1%8F:%D0%9E%D1%80%D0%B3%D0%B0%D0%BD%D0%B8%D0%B7%D0%B0%D1%86%D0%B8%D0%B8_%D0%BF%D0%BE_%D0%B0%D0%BB%D1%84%D0%B0%D0%B2%D0%B8%D1%82%D1%83'

for page in tqdm_notebook(pages):
    link = base_link + '&from=' + urllib.parse.quote(page)
    f = urllib.request.urlopen(link)
    html = f.read()
    soup = BeautifulSoup(html, 'html.parser')
    divs = soup.select('div div ul li a')
    for div in divs:
        organizations.add(div.text)

100%|██████████████████████████████████████████████████████████████████████████████████| 57/57 [00:38<00:00,  1.48it/s]


In [203]:
len(organizations)

9479

In [212]:
with open('./data/organizations.csv', 'w', encoding='utf8') as f:
    f.writelines(organizations)

# Формирование регулярных выражений

## Персоны

In [251]:
name_res = ['%s[а-я]?(?=[^а-я])' % name[:-1] for name in russian_names if len(name[:-2]) > 3]
surname_res = ['%s[а-я]{1,2}(?=[^а-я])' % surname[:-2] for surname in russian_surnames if len(surname[:-2]) > 3]

In [284]:
def get_per(text):
    res = set()
    # имена
    for name_re in tqdm_notebook(name_res):
        matches = re.finditer(name_re, text)
        for m in matches:
            span = m.span()
            res.add('PER %d %d\n' % (span[0], span[1] - span[0]))
    # фамилии
    for surname_re in tqdm_notebook(surname_res):
        matches = re.finditer(surname_re, text)
        for m in matches:
            span = m.span()
            res.add('PER %d %d\n' % (span[0], span[1] - span[0]))
    
    return res

## Организации

In [285]:
org_res = organizations

In [286]:
def get_org(text):
    res = set()
    for org_re in tqdm_notebook(org_res):
        matches = re.finditer(re.escape(org_re), text) # Для избежания ошибки "error: unbalanced parenthesis at position"
        for m in matches:
            span = m.span()
            res.add('ORG %d %d\n' % (span[0], span[1] - span[0]))
    return res

## Локации

In [287]:
loc_res = ['%s[а-я](?=[^а-я])' % loc for loc in set(cities.tolist() + regions.tolist() + countries.tolist())]

In [288]:
def get_loc(text):
    res = set()
    for loc_re in tqdm_notebook(loc_res):
        matches = re.finditer(loc_re, text)
        for m in matches:
            span = m.span()
            res.add('LOC %d %d\n' % (span[0], span[1] - span[0]))
    return res

# Выполнение

In [None]:
for filename in tqdm_notebook(filenames):
    with open('./testset/' + filename, encoding='utf8') as f:
        text = f.read()
    results = list(get_per(text)) + list(get_org(text)) + list(get_loc(text))
    with open('./results/' + filename.split('.')[0] + '.task1', 'w') as f:
        f.writelines(results)

# Проверяем результаты

In [2]:
!python scripts/t1_eval.py -s ./testset -t ./results -o ./output

book_3539
Type    P        R        F1       TP1      TP2      In Std.  In Test.
per        0.2097   0.5430   0.3026   658.17   658.17     1212     3138
loc        0.3854   0.1247   0.1884    67.83    67.83      544      176
org        0.7374   0.1559   0.2574   246.30   246.30     1580      334
locorg     1.0000   0.0000   0.0000     0.00     0.00      626        0
overall    0.2665   0.2454   0.2555   972.30   972.30     3962     3648
