In [1]:
import json
from bs4 import BeautifulSoup
from html.parser import HTMLParser
from collections import defaultdict

In [2]:
class PageText:
    def __init__(self, title='', description='', keywords='', textDir={}, attrDir={}, number=-1, url=''):
        self._title = title
        self._description = description
        self._keywords = keywords
        self._textDir = textDir
        self._attrDir = attrDir
        self._number = number
        self._url = url
        
    def setNumber(self, number):
        self._number = number
        
    def setUrl(self, url):
        self._url = url
        
    def toDict(self):
        return {"number":self._number,
                "url":self._url,
                "title":self._title, 
                "description":self._description, 
                "keywords": self._keywords,
                "text": self._textDir,
                "attr": self._attrDir}

In [3]:
class TextHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self._textDir = defaultdict(list)
        self._interestAttrs = ['title', 'alt']
        self._attrDir = defaultdict(lambda: dict.fromkeys(self._interestAttrs, []))
        
        self._title = ''
        self._description = ''
        self._keywords = ''        
        
        self.__curTag = None
               
    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        if tag == 'meta':            
            name = attrs_dict.get('name', '')
            if name == 'description':
                self._description = attrs_dict.get('content', '').strip()
            if name == 'keywords':
                self._keywords = attrs_dict.get('content', '').strip()   
                
        for attr in self._interestAttrs:
            value = attrs_dict.get(attr, '').strip()
            if value:
                self._attrDir[tag][attr].append(value) 
        self.__curTag = tag
        
    def handle_data(self, data):
        if data.strip():
            data = data.strip()
            if self.__curTag == 'title':
                self._title = self._title + data
            elif self.__curTag != 'script':
                self._textDir[self.__curTag].append(data)
    def pageText(self):
        return PageText(self._title, self._description, self._keywords, self._textDir, self._attrDir)

    def handle_endtag(self, tag):
        pass
    def clear(self):
        self._textDir = defaultdict(list)
        self._attrDir = defaultdict(lambda: dict.fromkeys(self._interestAttrs, []))        
        self._title = ''
        self._description = ''
        self._keywords = ''                
        self.__curTag = None

In [4]:
dataPath = '../data'
docUrlsName = '../data/urls.docs.txt'
textdataPath = '../data/textdata'
templateJson = '{:d}.json'

In [5]:
docDict = {}
with open(docUrlsName) as inputFile:
    for line in inputFile:
        number, url, path = line.strip().split('\t')
        docDict[int(number)] = (url, path)

# Парсим документы(создаем json файлы с текстом и значения некоторых атрибутов, таких как title и alt

In [57]:
def tryParse(parser, fileInput):
    try:
        parser.clear()
        parser.feed(fileInput.read())
        return True
    except Exception:
        return False

In [58]:
encods = ['utf-8', 'windows-1251']
for number, (url, path) in docDict.items():
    relativePath = '{:s}/{:s}'.format(dataPath, path)
    outputPath = '{:s}/{:s}'.format(textdataPath, templateJson.format(number))    
    for encod in encods:
        parser = TextHTMLParser()
        fileInput = open(relativePath, encoding=encod)
        ans = tryParse(parser, fileInput)
        if ans:
            page = parser.pageText()
            page.setNumber(number)
            page.setUrl(url)
            json.dump(page.toDict(), open(outputPath, 'w'))
            fileInput.close()
            break
        fileInput.close()       

In [16]:
relativePath = '{:s}/{:s}'.format(dataPath, path)
fileInput = open(relativePath)
outputPath = '{:s}/{:s}'.format(textdataPath, templateJson.format(number))
try:
    parser.feed(fileInput.read())

26803

Документы, которые не распарсились

In [61]:
import os
import re
pat = re.compile(r'\d+')
procNumbs = set( map(lambda x: int(pat.search(x).group(0)), os.listdir(textdataPath)))
allNumbs = set(docDict.keys())
errorNumbs = allNumbs.difference(procNumbs)

In [65]:
print(len(errorNumbs))

1587


In [64]:
for numb in list(errorNumbs)[:30]:
    print(numb, docDict[numb])

24621 ('http://vmagazine.ru/item/LG_42_LS_570S', 'data/31/3102637490260712758')
24639 ('http://vniigochs.ru/docs/umk/t5/lit5/10ocenka.pdf', 'data/17/-3768526114762576398')
24645 ('http://vocmp.oblzdrav.ru/wp-content/uploads/%E2%84%965-%D0%92%D0%90%D0%A8%D0%95-%D0%97%D0%94%D0%9E%D0%A0%D0%9E%D0%92%D0%AC%D0%95-%D0%BE%D1%82-9-%D0%B0%D0%B2%D0%B3%D1%83%D1%81%D1%82%D0%B0.pdf', 'data/79/5391872310152158217')
77 ('http://112.dou.spb.ru/attachments/article/84/%D0%A0%D0%B0%D0%B1.%D0%BF%D1%80%D0%BE%D0%B3%D1%80%D0%B0%D0%BC%D0%BC%D0%B0%20%D0%BF%D0%BE%D0%B4%D0%B3%D0%BE%D1%82.%20%D0%B3%D1%80.%20%E2%84%9610.pdf', 'data/73/-1105741059929608233')
24657 ('http://vokitai.ru/wp-content/uploads/2010/08/referat1.pdf', 'data/21/5536388841967189545')
84 ('http://12sanepid.ru/upload/Kilemary.doc', 'data/28/-4625889331122850304')
86 ('http://15.1class.ru/rus/files/4b-klass-okruzhayuschiy-mir-2015.pdf', 'data/28/-4531725321918440371')
88 ('http://15-school.ru/dokumenti/Metodich_dokumenti/Rabochie_programmy/Literat

In [36]:
filename = '../data/data/47/8939518307277709638'

In [45]:
tryParse(parser, open(filename, encoding='utf-8'))

False

In [41]:
parser = TextHTMLParser()
parser.clear()
parser.feed(open(filename, encoding='windows-1251').read())
parser.pageText().toDict()

json.dump(parser.pageText().toDict(), open("1.json", 'w'))
json.load( open("1.json"))

{'attr': {'a': {'alt': ['http://www.k-eng.ru/catalog/products/pozharnye-izveshhateli/ip-212-117/',
    'http://www.unitest.ru/netcat_files/128/245/h_d6a42e8777bd330a3f203ac0c80df18d',
    'http://www.unitest.ru/production/catalog/one-home-2/razyasnenie',
    'http://www.unitest.ru/netcat_files/128/245/h_d6a42e8777bd330a3f203ac0c80df18d',
    'http://www.unitest.ru/production/catalog/one-home-2/razyasnenie',
    'http://www.k-eng.ru/catalog/products/pozharnye-izveshhateli/ip212-66-partner/',
    'http://www.bolid.ru/production/noticers/noticers_177.html',
    'http://www.bolid.ru/production/devices/devices_176.html',
    'http://www.bolid.ru/pictures/prilog_r.jpg'],
   'title': ['http://www.k-eng.ru/catalog/products/pozharnye-izveshhateli/ip-212-117/',
    'http://www.unitest.ru/netcat_files/128/245/h_d6a42e8777bd330a3f203ac0c80df18d',
    'http://www.unitest.ru/production/catalog/one-home-2/razyasnenie',
    'http://www.unitest.ru/netcat_files/128/245/h_d6a42e8777bd330a3f203ac0c80df18d

In [None]:
def FileText:
    def __init__(self):
        self._desc = {}
    def setUrl(self, url):
        self._desc["url"] = url
    def setNumber(self, numb):
        self._desc["number"] = numb
    def addTitle(self, title):
        self.

In [11]:
try:
    soup = BeautifulSoup(open(filename))
    page = PageText()
    
    
    
    page.addTitle(soup.title.string)
    
    



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [28]:
parser.feed(open(filename).read())

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xf1 in position 1152: invalid continuation byte