In [12]:
import os
import re
import sys
from collections import Counter
from itertools import islice
from urllib.parse import urlparse, urlsplit, parse_qs, parse_qsl

import numpy as np
import parsel
import sklearn_crfsuite
from sklearn_crfsuite.metrics import flat_classification_report, sequence_accuracy_score
from sklearn.model_selection import cross_val_predict, GroupKFold

sys.path.insert(0, '..')
from autopager.storage import Storage
from autopager.htmlutils import (get_link_text, get_text_around_selector_list,
                                 get_link_href, get_selector_root)
from autopager.utils import (
    get_domain, normalize_whitespaces, normalize, ngrams, tokenize, ngrams_wb, replace_digits
)
from autopager.model import link_to_features, _num_tokens_feature, _elem_attr
from autopager import AUTOPAGER_LIMITS

In [13]:
# Load the labelled pages: one URL per stored record, plus the raw link
# sequences (X_raw) and their label sequences (y).
storage = Storage()
urls = [record['Page URL'] for record in storage.iter_records()]
X_raw, y = storage.get_Xy()
domains = {get_domain(page_url) for page_url in urls}
print("pages: {}  domains: {}".format(len(urls), len(domains)))

Not all links are matched {'<a>1</a>'}
Not all links are matched {'<a>6</a>'}
Not all links are matched {'<a onclick="display(\'grid\');">Mosaico</a>'}
Not all links are matched {'<a onclick="display(\'grid\');">Mosaico</a>'}
Not all links are matched {'<a class="btn prev disabled" type="backward"><i class="icon-chevron-left"> </i></a>'}
pages: 196  domains: 81


In [14]:
%%time
# XXX: these functions should be copy-pasted from autopager/model.py

def _as_list(generator, limit=None):
    """
    >>> _as_list(ngrams_wb("text", 2, 2), 0)
    []
    >>> _as_list(ngrams_wb("text", 2, 2), 2)
    ['te', 'ex']
    >>> _as_list(ngrams_wb("text", 2, 2))
    ['te', 'ex', 'xt']
    """
    return list(generator if limit is None else islice(generator, 0, limit))


def link_to_features(link):
    """
    Build the feature dict for a single link.

    ``link`` is a selector-like object: it must support ``.xpath()`` and be
    accepted by ``get_link_text``, ``get_link_href`` and
    ``get_selector_root``.

    The features are derived from:

    * the normalized link text (digit/alpha flags, a token-count bucket,
      character ngrams and a truncated "exact" form with digits replaced);
    * the ``target`` and ``rel`` attributes of the link element;
    * the CSS classes of the link, of everything inside it, and of its parent;
    * the names (not the values) of the href query parameters;
    * simple pattern checks on the href path and on the whole href.

    Returns a dict mapping feature names to values (floats, booleans,
    strings or lists of ngram strings).
    """
    text = normalize(get_link_text(link))

    href = get_link_href(link)
    p = urlsplit(href)
    path_lower = p.path.lower()

    # Only parameter names are used; parameter values are discarded.
    query_parsed = parse_qsl(p.query)
    query_param_names = [k.lower() for k, v in query_parsed]
    query_param_names_ngrams = _as_list(ngrams_wb(
        " ".join([normalize(name) for name in query_param_names]), 3, 5, True
    ))

    elem = get_selector_root(link)
    elem_target = _elem_attr(elem, 'target')
    elem_rel = _elem_attr(elem, 'rel')

    # Classes of link itself and all its children.
    # It is common to have e.g. span elements with fontawesome
    # arrow icon classes inside <a> links.
    self_and_children_classes = ' '.join(link.xpath(".//@class").extract())
    parent_classes = ' '.join(link.xpath('../@class').extract())
    css_classes = normalize(parent_classes + ' ' + self_and_children_classes)

    return {
        'bias': 3.0,
        'isdigit': text.isdigit(),
        'isalpha': text.isalpha(),
        'elem-target': elem_target,
        'elem-rel': elem_rel,
        # The bucket returned by _num_tokens_feature becomes part of the key.
        'num-tokens%s' % _num_tokens_feature(text): 1.0,

        # The number of text/class ngrams is capped by AUTOPAGER_LIMITS.
        'text': _as_list(ngrams_wb(replace_digits(text), 2, 5),
                         AUTOPAGER_LIMITS.max_text_features),
        'text-exact': replace_digits(text.strip()[:20].strip()),
        'class': _as_list(ngrams_wb(css_classes, 4, 5),
                          AUTOPAGER_LIMITS.max_css_features),
        'query': query_param_names_ngrams,

        'path-has-page': 'page' in path_lower,
        'path-has-pageXX': re.search(r'[/-](?:p|page\w?)/?\d+', path_lower) is not None,
        'path-has-number': any(part.isdigit() for part in p.path.split('/')),

        # Raw string: '\d' in a plain literal is an invalid escape sequence
        # and triggers a warning on recent Python versions.
        'href-has-year': re.search(r'20\d\d', href) is not None,
    }


def page_to_features(xseq):
    """
    Turn a sequence of links from one page into a list of feature dicts.

    Every link first gets its own features from ``link_to_features``; then
    5-character ngrams of the text found before and after the link are added
    under the ``text-before`` and ``text-after`` keys.
    """
    # The weight is below 1 because these ngrams carry a lot of duplicate
    # information, so we want them regularized more strongly
    # (as if they were a single feature rather than many features).
    context_weight = 0.2

    features = [link_to_features(a) for a in xseq]
    around = get_text_around_selector_list(xseq, max_length=15)

    for link_features, (before, after) in zip(features, around):
        for key, snippet in (('text-before', before), ('text-after', after)):
            context_ngrams = _as_list(ngrams_wb(normalize(snippet), 5, 5))
            link_features[key] = dict.fromkeys(context_ngrams, context_weight)
    return features


# One list of per-link feature dicts for every page in X_raw.
X = list(map(page_to_features, X_raw))

CPU times: user 9.3 s, sys: 136 ms, total: 9.44 s
Wall time: 9.44 s


In [15]:
# X[60][12]

In [16]:
# TRAIN_SIZE = 80
# X_train, y_train = X[:TRAIN_SIZE], y[:TRAIN_SIZE]
# X_test, y_test = X[TRAIN_SIZE:], y[TRAIN_SIZE:]
# A single, still unfitted CRF estimator; later cells cross-validate it
# and fit it on the whole dataset.
crf_options = {
    'algorithm': 'lbfgs',
    'c1': 0.001,
    'c2': 0.05,
    'max_iterations': 100,
    'all_possible_transitions': True,
    'verbose': False,
}
crf = sklearn_crfsuite.CRF(**crf_options)
# crf.fit(X_train, y_train, X_test, y_test)

We must be careful when splitting the dataset into training and
evaluation parts: pages from the same domain should be in the same
"bin". There could be several pages from the same domain, and these
pages may have duplicate or similar link patterns
(e.g. a particular CSS class for paginator links). If we put one such page in the training dataset and another in
the evaluation dataset, then the metrics will be too optimistic,
and they can lead us to choose the wrong features/models.

In [17]:
%%time
# Group pages by domain so that a site never contributes pages to both
# the training and the evaluation side of a fold.
groups = [get_domain(page_url) for page_url in urls]
group_kfold = GroupKFold(n_splits=6)
y_pred = cross_val_predict(crf, X, y, groups=groups, cv=group_kfold, n_jobs=-1)

report = flat_classification_report(y, y_pred, labels=['PAGE', 'NEXT', 'PREV'], digits=3)
print(report)
print("Sequence accuracy: {:0.3f}".format(sequence_accuracy_score(y, y_pred)))



              precision    recall  f1-score   support

        PAGE      0.872     0.925     0.898      1178
        NEXT      0.903     0.781     0.837       155
        PREV      0.892     0.812     0.850       112

   micro avg      0.876     0.901     0.888      1445
   macro avg      0.889     0.839     0.862      1445
weighted avg      0.877     0.901     0.888      1445

Sequence accuracy: 0.612
CPU times: user 1min 4s, sys: 1.56 s, total: 1min 5s
Wall time: 1min 20s


In [18]:
# Fit on all pages (no hold-out split); the weights of this fitted model
# are inspected in the cells below.
crf.fit(X, y)
# crf.attributes_
# The last expression is displayed as the cell output.
crf.num_attributes_

11514

In [19]:
# [a for a in sorted(crf.attributes_) if a.startswith('id')]

## What are important features?

In [21]:
import eli5



In [22]:
# XXX: weights of correlated features don't show their importance
# XXX: weights of features on different scales don't show their importance
# (e.g. coefficients of text-after and text-before features are high, but only
# because the input is scaled down for these features)

eli5.show_weights(crf, top=50)



From \ To,NEXT,O,PAGE,PREV
NEXT,-1.175,-0.265,1.202,-0.604
O,-1.77,1.929,-1.78,0.039
PAGE,1.687,-1.604,-0.262,0.918
PREV,0.545,-1.859,1.741,-0.84

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3
+2.897,class:next,,
+2.410,text:>,,
+1.929,text-exact:»,,
+1.833,text-exact:>,,
+1.454,text:»,,
+1.441,text-exact:下一页,,
+1.441,text:下一,,
+1.441,text:下一页,,
+1.153,text:一页,,
+0.999,text:ne,,

Weight?,Feature
+2.897,class:next
+2.410,text:>
+1.929,text-exact:»
+1.833,text-exact:>
+1.454,text:»
+1.441,text-exact:下一页
+1.441,text:下一
+1.441,text:下一页
+1.153,text:一页
+0.999,text:ne

Weight?,Feature
+2.258,elem-rel:nofollow
+1.688,text:..
+1.499,text-exact:>>
+1.384,query:m
+1.338,text-after:...
+1.326,href-has-year
+1.326,num-tokens>2
+1.214,text-before:下部
+1.128,text-after:seite
+1.089,text-after:eiten

Weight?,Feature
+2.909,text-exact:末页
+2.909,text:末页
+2.674,isdigit
+1.893,text-exact:首页
+1.880,text-exact:X
+1.847,text:首页
+1.450,text:X
+1.392,text:<<
+1.377,elem-rel:next
+1.357,text:>|

Weight?,Feature
+2.041,text:<
+1.764,text-exact:«
+1.656,text-exact:<
+1.653,class:prev
+1.456,text:«
+1.174,text:上一
+1.174,text:上一页
+1.174,text-exact:上一页
+1.065,text:前の
+1.014,text:一页


## Let's check errors the model is making

In [23]:
# Repeat the grouped cross-validation so that this cell does not depend on
# the predictions computed in an earlier cell.
group_kfold = GroupKFold(n_splits=6)
groups = [get_domain(url) for url in urls]
y_pred = cross_val_predict(crf, X, y, cv=group_kfold, groups=groups, n_jobs=-1)

# Per-page mask: True when at least one label on the page was mispredicted.
# The per-page sequences generally differ in length, so they are compared
# and filtered with plain Python instead of np.asarray(): recent NumPy
# versions refuse to build arrays from ragged nested sequences.
errors = [
    list(yseq_true) != list(yseq_pred)
    for yseq_true, yseq_pred in zip(y, y_pred)
]

# Keep only the pages with errors; all four lists stay aligned with each other.
error_rows = [row for row, failed in zip(storage.iter_records(), errors) if failed]
error_links = [links for links, failed in zip(X_raw, errors) if failed]
error_y_pred = [yseq for yseq, failed in zip(y_pred, errors) if failed]
error_y_true = [yseq for yseq, failed in zip(y, errors) if failed]



In [24]:
# For every page with errors, print its URL followed by each mispredicted
# link as: true label, predicted label, link HTML.
for row, links, yseq_true, yseq_pred in zip(error_rows, error_links, error_y_true, error_y_pred):
    print(row['Page URL'])
    for label_correct, label_pred, link in zip(yseq_true, yseq_pred, links.extract()):
        if label_correct == label_pred:
            continue
        print("%4s %4s %s" % (label_correct, label_pred, link))
    print("\n")

https://www.mypapershop.com/patricks-day-supplies.html
NEXT    O <a href="https://www.mypapershop.com/mm5/merchant.mvc?Session_ID=56e7cf6116038eec3b953a976519a103&amp;Store_Code=MPS&amp;Screen=CTGY&amp;Category_Code=patricks-day-supplies&amp;CatListingOffset=24&amp;Offset=24&amp;Per_Page=24&amp;Sort_By=disp_order" class="searchspring-next">▶</a>
NEXT    O <a href="https://www.mypapershop.com/mm5/merchant.mvc?Session_ID=56e7cf6116038eec3b953a976519a103&amp;Store_Code=MPS&amp;Screen=CTGY&amp;Category_Code=patricks-day-supplies&amp;CatListingOffset=24&amp;Offset=24&amp;Per_Page=24&amp;Sort_By=disp_order" class="searchspring-next">▶</a>


http://www.newschittagong24.com/?cat=1&paged=5
PAGE    O <a href="http://www.newschittagong24.com/?cat=1" class="first" title="« প্রথম">« প্রথম</a>


http://www.newschittagong24.com/?cat=1&paged=1422
PAGE PREV <a href="http://www.newschittagong24.com/?cat=1" class="first" title="« প্রথম">« প্রথম</a>


https://www.icontact.com/blog
NEXT PREV <a href="https

## Unused code

In [25]:
def _url_parts(url):
    p = urlsplit(url)
    args = parse_qsl(p.query)
    argnames = [name for name, value in args]
    return p.netloc, set(p.path.split('/')) | set(args) | set(argnames)

def url_distance(url1, url2):
    """
    Distance between two URLs in the range [0, 1].

    URLs with different netlocs are at distance 1.0. Otherwise the result
    is one minus the ratio of shared parts to all parts (as returned by
    ``_url_parts``), with 0.0 returned when neither URL has any parts.
    """
    (netloc1, parts1), (netloc2, parts2) = _url_parts(url1), _url_parts(url2)
    if netloc1 != netloc2:
        return 1.0
    all_parts = parts1 | parts2
    if not all_parts:
        # Nothing to compare: avoid dividing by zero.
        return 0.0
    return 1 - len(parts1 & parts2) / len(all_parts)

#         dist = url_distance(url, href)
#         if dist == 0:
#             feat['url-distance=0'] = 1.0
#         elif dist == 1.0:
#             feat['url-distance=1'] = 1.0
#         else:
#             feat['url-distance=k'] = dist


# Quick check: same host and path, only the value of the `page` parameter differs.
url_distance('http://example.com/foo/345?page=2', 'http://example.com/foo/345?page=4')

0.33333333333333337

In [26]:
# def guess_page_number(link):
#     text = get_link_text(link).strip()
#     if text.isdigit():
#         return int(text)
#     return None

# def number_pattern2(pattern):
#     txt = re.sub('X+', 'X', pattern)
# #     txt = re.sub('C+', 'C', txt)
#     return txt

#     pagenums = [guess_page_number(a) for a in xseq]
# #     print(pagenums)
#     for i in range(1, len(xseq)):
#         if pagenums[i-1] is None or pagenums[i] is None:
#             features[i]['page-diff:None'] = 1.0
#         else:
#             diff = pagenums[i] - pagenums[i-1]
#             if diff == 1:
#                 features[i]['page-diff==1'] = 1.0
#             else:
#                 features[i]['page-diff<>1'] = 1.0
