In [None]:
import attr

@attr.s(repr=False)
class BiblioRecord(object):
    """One bibliographic record extracted from the catalogue text.

    Every field defaults to ``None``; the parser assigns the values after
    creating an empty instance.

    ``repr=False`` stops attrs from generating its own ``__repr__``, which
    would otherwise replace the hand-written one defined below.
    """
    number = attr.ib(default=None)
    title = attr.ib(default=None)
    local_title = attr.ib(default=None)
    title_data = attr.ib(default=None)
    organization = attr.ib(default=None)
    first_year = attr.ib(default=None)
    place = attr.ib(default=None)
    additional_data = attr.ib(default=None)
    heading = attr.ib(default=None)
    price = attr.ib(default=None)
    period = attr.ib(default=None)
    edition = attr.ib(default=None)
    source = attr.ib(default=None)
    editiontype = attr.ib(default=None)

    def __repr__(self):
        """Return a short representation that shows only the title."""
        # A freshly created record still has title=None; calling .encode()
        # on it would raise AttributeError.
        if self.title is None:
            return 'BiblioRecord(None)'
        return 'BiblioRecord("{}")'.format(self.title.encode('utf-8'))

                

In [6]:
import re

# Periodicity marker: an alternation of three named groups.
#   monthly     - "еже"/"двух" prefix (any case, optional hyphen) + "мес",
#                 so it also matches the two-month form
#   trimestrial - "кварт" with an optional "еже" prefix, or "Кварт"
#   irregular   - the literal phrase "8 раз в год"
PERIOD_STR = (
    ur'(?P<monthly>(Еже|еже|еже-|Еже-|Двух|Двух-|двух|двух-)мес)|(?P<trimestrial>((Еже|еже|)кварт)|Кварт)|'
    ur'(?P<irregular>(8 раз в год))'
)
PERIOD_REG = re.compile(PERIOD_STR)

# Print-run entry: a year or "year — year" range (19xx), an optional price
# that must end with a comma, and an edition figure (single number or a
# range) followed by "экз.".
ED = (
    ur'(?P<year>(19[0-9][0-9] — 19[0-9][0-9])|(19[0-9][0-9]))[\.-]?[ \t\n]*'
    ur'(?P<price>([0-9]* *[рк]\.? *[0-9]* *[рк]*\.*,)?) ?'
    ur'(?P<edition>([0-9]*\.?[0-9]*\.?[0-9]*[ \t\n]?[—-]?[ \n\t]?[0-9]*\.?[0-9]*\.?[0-9]+[\t ]?экз\.))'
)
EDIT = re.compile(ED)

# Passed to parse() as block_id: the number the first numbered entry of the
# input file is expected to start with.
START_BLOCK_ID = 1455

# Not referenced anywhere in the visible code; presumably the first year
# covered by the volume - TODO confirm or remove.
BEG_YEAR_OF_BOOK = 1986

# Bibliographic source string stored on every record.
SOURCE = u"Летопись печатных и периодических изданий СССР. Журналы. 1986 - 1990"

# Edition type stored on every record (the word means "journal").
EDITIONTYPE = u'журнал'

# Four-digit year between 1700 and 1999.
YEAR_STR = ur'1[7-9][0-9][0-9]'
RE_YEAR = re.compile(YEAR_STR)

# Price: digits, an optional space, "р" or "к" followed by a period, then an
# optional second amount/unit and an optional trailing comma.
PRICE_STR = ur'(?P<price>([0-9]+ ?[рк]\. ?[0-9]* *[рк]*\.*,?))'
RE_PRICE = re.compile(PRICE_STR)

# Placeholder ("no data") used when a field could not be extracted.
NODATA = u'Нет данных'
# Placeholder ("gap") used when no organization could be extracted.
PARCIGLACUNA = u'Пропуск'


def clean_str(text):
    """Delete every newline from *text*, then trim surrounding whitespace.

    Newlines are removed outright (not replaced by a space), so the two
    halves of a line-wrapped word are glued back together.
    """
    without_newlines = ''.join(text.split('\n'))
    return without_newlines.strip()
    

def parse_blocks(data, block_id):
    """Split *data* into consecutive numbered blocks.

    A new block begins at every line whose stripped text starts with the
    next expected number (``block_id``, ``block_id + 1``, ...).  The lines
    collected before such a line are emitted as a block carrying the
    previous number, so the first returned block is whatever precedes the
    first numbered line and is numbered ``block_id - 1``.

    Returns a list of ``{'text': ..., 'number': ...}`` dicts.
    """
    result = []
    current_lines = []
    expected = block_id

    for raw_line in data.split('\n'):
        opens_next_block = raw_line.strip().startswith(str(expected))
        if opens_next_block:
            result.append(
                dict(text='\n'.join(current_lines), number=expected - 1)
            )
            expected += 1
            current_lines = [raw_line]
        else:
            current_lines.append(raw_line)

    # Whatever is left over forms the final block.
    result.append(dict(text='\n'.join(current_lines), number=expected - 1))
    return result

def parse_period(block):
    """Return the periodicity marker found in the block text.

    The block text is flattened with clean_str() and searched with
    PERIOD_REG.  The text captured by whichever named group matched
    ('monthly', 'trimestrial' or 'irregular') is returned; NODATA is
    returned when the pattern does not occur.
    """
    m = PERIOD_REG.search(clean_str(block['text']))
    if m is None:
        return NODATA
    # Only one alternative of the pattern can match, so pick the first group
    # that captured something.  This avoids comparing None with strings,
    # which only works on Python 2 and raises TypeError on Python 3.
    for name in ('monthly', 'trimestrial', 'irregular'):
        matched = m.group(name)
        if matched is not None:
            return matched
    return NODATA

def parse_edition(block):
    """Collect the print-run figures of a block, keyed by year.

    Every EDIT match in the block text contributes one entry mapping its
    'year' group to its 'edition' group; a later match with the same year
    overwrites an earlier one.
    """
    editions = {}
    for match in EDIT.finditer(block['text']):
        editions[match.group('year')] = match.group('edition')
    return editions

def parse_firstyear(block, dashpos, second_dashpos):
    """Return the first year found between the two given dash positions.

    Only the slice of the block text strictly after ``dashpos`` and before
    ``second_dashpos`` is searched with RE_YEAR.  NODATA is returned when
    that slice contains no year.
    """
    segment = block['text'][dashpos + 1:second_dashpos]
    found = RE_YEAR.search(segment)
    return found.group(0) if found else NODATA
    
def parse_price(block):
    """Return the first price found in the block text, or NODATA.

    The text is flattened with clean_str() before RE_PRICE is applied.
    """
    flattened = clean_str(block['text'])
    hit = RE_PRICE.search(flattened)
    if not hit:
        return NODATA
    return hit.group(u'price')


def parse_data_block(block, heading, source, editiontype):
    """Build a BiblioRecord from one numbered block of catalogue text.

    Parameters:
        block       -- dict with 'text' (the entry text) and 'number'.
        heading     -- section heading the entry belongs to.
        source      -- source description stored on the record.
        editiontype -- edition type stored on the record.

    The fields are cut out of the text by the positions of delimiter
    characters (':', '/', the dash, ',', the ISSN marker).  ``find`` returns
    -1 for a missing delimiter, so entries that deviate from the expected
    layout may yield odd slices; only the organization fallback is guarded
    explicitly here.

    Returns the populated BiblioRecord.
    """
    text = block['text']
    brecord = BiblioRecord()
    # The entry starts with its number followed by one separator character.
    title_start = len(str(block['number'])) + 1

    colonpos = text.find(':')
    slashpos = text.find('/', colonpos)
    dashpos = text.find(u'—', max(colonpos, slashpos))
    second_dashpos = text.find(u'—', dashpos + 1)
    third_dashpos = text.find(u'—', second_dashpos + 1)
    comma = text.find(',', third_dashpos)
    issn = text.find(u'ИССН', comma)
    # Title data ends at the slash when there is one, otherwise at the dash.
    enddatapos = slashpos if slashpos != -1 else dashpos
    journ = text.find(u'журн.')

    brecord.number = block['number']
    brecord.title = clean_str(text[title_start:colonpos])
    # "Title = Local title": split off the part after '='.
    titles = brecord.title.split('=')
    if len(titles) > 1:
        brecord.local_title = clean_str(titles[1])
        brecord.title = clean_str(titles[0])
    else:
        brecord.local_title = NODATA

    brecord.title_data = clean_str(text[colonpos + 1:enddatapos])
    if not brecord.title_data:
        brecord.title_data = NODATA

    # Organization: text from the slash up to the first dash.  Without a
    # slash this slice is empty and the text following u'журн.' is used.
    organization = clean_str(text[slashpos:dashpos])
    if not organization and journ != -1:
        # Only fall back when the marker was really found; with journ == -1
        # the slice would start at index 4 and return part of the title.
        organization = clean_str(text[journ + 5:dashpos])
    if not organization:
        organization = PARCIGLACUNA
    brecord.organization = organization

    brecord.first_year = parse_firstyear(block, dashpos, second_dashpos)
    # The place starts 4 characters after the second dash and runs up to the
    # first comma that follows the third dash.
    brecord.place = clean_str(text[second_dashpos + 4:comma])
    brecord.additional_data = clean_str(text[comma + 1:issn])
    brecord.heading = heading
    brecord.price = clean_str(parse_price(block))
    brecord.period = clean_str(parse_period(block))
    brecord.edition = parse_edition(block)
    # Use the values supplied by the caller instead of the module globals.
    brecord.source = source
    brecord.editiontype = editiontype

    return brecord
        

def parse(path, block_id, source, editiontype):
    data = open(path).read().decode('utf-8')
    brecords = []
    
    blocks = parse_blocks(data, block_id)
    
    for block in blocks:
        print block['text'].strip()
        if block['text'].strip().endswith('.'):
            brecord = parse_data_block(block, heading, source, editiontype)
            brecords.append(brecord)
        else:            
            end_block = block['text'].rfind('.\n')
            if end_block != -1:
                heading = clean_str(block['text'][end_block+1:])
            else:
                heading = clean_str(block['text'].split('\n')[2])
    
    return brecords
        
    
a = parse('lppi_1986_1990_sbr.txt', START_BLOCK_ID, SOURCE, EDITIONTYPE)
for brecord in a:
    print brecord.number, brecord.title
    #print brecord.number, brecord.local_title
    #print brecord.number, brecord.title_data
    #print brecord.number, brecord.organization #Улучшила парсинг организации
    #print brecord.number, brecord.first_year # Почистила первый год
    #print brecord.number, brecord.place
    #print brecord.number, brecord.additional_data
    #print brecord.number, brecord.heading
    #print brecord.number, brecord.price
    #print brecord.number, brecord.period
    #print brecord.number, u''.join([u'{}: {} '.format(year, ed) for year, ed in brecord.edition.items()])
    #print brecord.number, brecord.source
    #print brecord.number, brecord.editiontype

﻿37. ЗДРАВООХРАНЕНИЕ. МЕДИЦИНСКИЕ НАУКИ

Общие вопросы. Организация здравоохранения.

Медицинская промышленность. Медицинская техника. Гигиена и санитария
Актуальные вопросы биологии и медицины. — Л. — См. 838.


UnboundLocalError: local variable 'heading' referenced before assignment

In [None]:
# Export the parsed records in `a` to a CSV file, one row per record.
import csv

# Record attributes written as UTF-8 text, in column order, before and after
# the per-year print-run columns.
leading_fields = (
    'title', 'local_title', 'title_data', 'organization', 'first_year',
    'place', 'additional_data', 'heading', 'price', 'period',
)
trailing_fields = ('source', 'editiontype')
# Years that get a print-run column of their own.
edition_years = ('1986', '1987', '1988', '1989', '1990')

# Header row.  Building it from the same tuples as the data rows keeps both
# at 18 columns (a missing comma used to fuse 'edition1990' and 'source'
# into a single header cell).
first_str = (
    ('number',)
    + leading_fields
    + tuple('edition' + year for year in edition_years)
    + trailing_fields
)

# Python 2's csv module expects a file opened in binary mode; the `with`
# block closes the file even if a row fails to encode.
with open('lppi_jurn_1986_1990.csv', 'wb') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(first_str)

    for brecord in a:
        row = [brecord.number]
        for field in leading_fields:
            row.append(getattr(brecord, field).encode('utf-8'))
        for year in edition_years:
            # A missing or empty print run is written as the NODATA marker.
            row.append((brecord.edition.get(year) or NODATA).encode('utf-8'))
        for field in trailing_fields:
            row.append(getattr(brecord, field).encode('utf-8'))

        writer.writerow(row)

In [None]:
import re
# Scratch cell: try the print-run pattern on a sample string and print the
# captured groups.
# Earlier variant of the edition group, kept for reference:
#    (?P<edition>([0-9]*\.?[0-9]*\.?[0-9]+[\t ]?—? ?\n?[0-9]*\.?[0-9]*\.?[0-9]+ ?\t?экз\.))'
# Same three named groups as ED (year, price, edition), except that the dot
# after [рк] in the price group is unescaped here and so matches any
# character.
pr = (
    ur'(?P<year>(19[0-9][0-9] — 19[0-9][0-9])|(19[0-9][0-9]))[\.-]?[ \t\n]*'
    ur'(?P<price>([0-9]* *[рк].? *[0-9]* *[рк]*\.*,)?) ?'
    ur'(?P<edition>([0-9]*\.?[0-9]*\.?[0-9]*[ \t\n]?[—-]?[ \n\t]?[0-9]*\.?[0-9]*\.?[0-9]+[\t ]?экз\.))'
)
# Sample: five years, each with a print-run range, wrapped over two lines.
textpr = u'''1986. 6.475 — 6.560 экз.; 1987. 5.632 — 7.293 экз.; 1988. 7.316 — 7.395 экз.; 
        1989. 5.937 — 6.669 ექზ.; 1990. 6.127 — 6.178 экз.'''


prpr = re.compile(pr)

# search() returns only the first match, i.e. the first year of the sample.
m = prpr.search(textpr)
if m:
    print m.groupdict()
    print m.groupdict()['price']


In [None]:
# Tests for the print-run (edition) parser
import unittest
import re

# Pattern under test.  It has the same three named groups as ED (year,
# price, edition) with two differences: a comma is also accepted right after
# the year, and the dot after [рк] in the price group is unescaped, so it
# matches any character.
TESTED = (
    ur'(?P<year>(19[0-9][0-9] — 19[0-9][0-9])|(19[0-9][0-9]))[\.\,-]?[ \t\n]*'
    ur'(?P<price>([0-9]* *[рк].? *[0-9]* *[рк]*\.*,)?) ?'
    ur'(?P<edition>([0-9]*\.?[0-9]*\.?[0-9]*[ \t\n]?[—-]?[ \n\t]?[0-9]*\.?[0-9]*\.?[0-9]+[\t ]?экз\.))'
)
TESTEDIT = re.compile(TESTED)

class TestParcerEdition(unittest.TestCase):
    
    def test_edition_1(self):#Количество годов - 5, тиражи измеряются в тысячах
        text = u'''1986. 6.475 — 6.560 экз.; 1987. 5.632 — 7.293 экз.; 1988. 7.316 — 7.395 экз.; 
        1989. 5.937 — 6.669 экз.; 1990. 6.127 — 6.178 экз.'''       
        edition = {
        item.group('year'): item.group('edition')
        for item in TESTEDIT.finditer(text)
        }
        values = edition.values()
        for item in values:
            self.assertEqual(len(item), 18)
        self.assertEqual(len(edition), 5)
    
    def test_edition_2(self):#Количество годов - 5, тиражи измеряются в десятках тысяч
        text = u'''1986. 20.921 — 21.465 экз.; 1987. 20.530 — 21.111 экз.; 1988. 19.282 — 19.972 экз.;
        1989. 18.804 — 19.471 экз.; 1990. 17.217 — 17.778 экз.'''       
        edition = {
        item.group('year'): item.group('edition')
        for item in TESTEDIT.finditer(text)
        }
        #print len(edition)
        values = edition.values()
        for item in values:
            #print len(item)
            self.assertEqual(len(item), 20)
        self.assertEqual(len(edition), 5)
       
    def test_edition_3(self):#Количество годов - 5, тиражи измеряются в сотнях тысяч
        text = u'''1986. 640.050 — 648.122 экз.; 1987. 634.261
—	643.188 экз.; 1988. 577.841 — 586.171 экз.; 1989. 512.518 — 518.993 экз.; 1990. 395.023 — 400.320 экз.'''       
        edition = {
        item.group('year'): item.group('edition')
        for item in TESTEDIT.finditer(text)
        }
        values = edition.values()
        for item in values:
            #print len(item)
            self.assertEqual(len(item), 22)
        self.assertEqual(len(edition), 5)
    
    def test_edition_4(self):#Количество годов - 5, тиражи измеряэтся в миллионах, тираж без диапазона
        text = u'''1986. 15.950.000 — 16.000.000 экз.; 1987.
15.600.000	— 16.700.000 экз.; 1988. 16.800.000 экз.; 1989. 15.700.000 — 16.800.000 экз.; 1990.
17.860.000	— 18.155.000 экз.'''       
        edition = {
        item.group('year'): item.group('edition')
        for item in TESTEDIT.finditer(text)
        }
        #print edition
        values = edition.values()
        for item in values:
            #print len(item)
            self.assertTrue(len(item) in [28, 15])
        self.assertEqual(len(edition), 5)
       
    def test_edition_5(self):#Количество годов - 1, присутсвуте группа цены
        text = u'''1990. 45 к., 4.780 — 6.041 экз.'''       
        edition = {
        item.group('year'): item.group('edition')
        for item in TESTEDIT.finditer(text)
        }
        values = edition.values()
        for item in values:
            #print len(item)
            self.assertEqual(len(item), 18)
        self.assertEqual(len(edition), 1)
        
        
    def test_edition_6(self):#Годы указаны диапазоном
        text = u'''1986. 5.730 — 5.800 экз.; 1987 — 1988.
6.000	экз.; 1989. 5.800 экз.; 1990. 4.500 —
6.000	экз.'''       
        edition = {
        item.group('year'): item.group('edition')
        for item in TESTEDIT.finditer(text)
        }
        values = edition.values()
        keys = edition.keys()
        for item in values:
            #print len(item)
            self.assertTrue(len(item)in [18, 10])
        for item in keys:
            self.assertTrue(len(item) in [11, 4])
        self.assertEqual(len(edition), 4)
    
    def test_edition_7(self):#Год заканчивается не точкой
        text = u'''1986- 2.795 экз.; 1987. 2.743 экз.; 1988. 2.522 экз.; 1989. 2.563 экз.; 1990. 2.213 экз.'''       
        edition = {
        item.group('year'): item.group('edition')
        for item in TESTEDIT.finditer(text)
        }
        values = edition.values()
        for item in values:
            #print len(item)
            self.assertTrue(len(item) in [10])
        self.assertEqual(len(edition), 5)
    
    def test_edition_8(self):#Вместо среднего тире - минус
        text = u'''1986. 15.374 — 15.569 экз.; 1987. 16.105 — 16.332 экз.; 
        1988. 17.406 — 17.742 экз.; 1989. 17.408 - 17.629 экз.; 1990. 16.333 - 16.592 экз.'''       
        edition = {
        item.group('year'): item.group('edition')
        for item in TESTEDIT.finditer(text)
        }
        values = edition.values()
        for item in values:
            #print len(item)
            self.assertEqual(len(item), 20)
        self.assertEqual(len(edition), 5)
        
    def test_edition_9(self):#Есть инфа о номерах, запятая после года
        text = u'''1990. 1 р. 3.000 экз.'''       
        edition = {
        item.group('year'): item.group('edition')
        for item in TESTEDIT.finditer(text)
        }
        print edition
        values = edition.values()
        for item in values:
            #print len(item)
            self.assertTrue(len(item) in [18, 10])
        self.assertEqual(len(edition), 1)
    
    

# Run the tests in this cell.  argv is given explicitly (the first item is
# only a program-name placeholder, '-v' turns on verbose output) so that
# unittest does not read the process's own command line, and exit=False
# keeps unittest from calling sys.exit() when the run finishes.
unittest.main(argv=['ignored', '-v'], exit=False)