Merge pull request #857 from groovecoder/seo-meta-html-spaces-840550
fix bug 840550 - fix PyQuery punctuation spacing
darkwing committed Feb 12, 2013
2 parents fc70f2e + b178b59 commit 0e269c8
Showing 4 changed files with 150 additions and 84 deletions.
70 changes: 63 additions & 7 deletions apps/wiki/helpers.py
@@ -1,10 +1,13 @@
# coding=utf-8

import difflib
import re
import urllib

import constance.config
from jingo import register
import jinja2
from pyquery import PyQuery as pq
from tidylib import tidy_document
from tower import ugettext as _
import logging
@@ -13,6 +16,53 @@
from wiki import DIFF_WRAP_COLUMN


def get_seo_description(content, locale=None):
# Create an SEO summary
# TODO: Google only takes the first 180 characters, so maybe we find a
# logical way to find the end of sentence before 180?
seo_summary = ''
try:
if content:
# Need to add a BR to the page content, otherwise pyQuery won't find
# a <p></p> element if it's the only element in the doc_html
seo_analyze_doc_html = content + '<br />'
page = pq(seo_analyze_doc_html)

# Look for the SEO summary class first
summaryClasses = page.find('.seoSummary')
if len(summaryClasses):
seo_summary = summaryClasses.text()
else:
paragraphs = page.find('p')
if paragraphs.length:
for p in range(len(paragraphs)):
item = paragraphs.eq(p)
text = item.text()
# Checking for a parent length of 2 because we
# don't want p's wrapped in DIVs (e.g.
# "<div class='warning'>"); pyQuery adds an
# "<html><div>" wrapper around the entire document
if (len(text) and
not 'Redirect' in text and
text.find(u'«') == -1 and
text.find('&laquo') == -1 and
item.parents().length == 2):
seo_summary = text.strip()
break
except:
pass

# Post-extraction cleanup
# remove stray markup characters
seo_summary = seo_summary.replace('<', '').replace('>', '')
# remove spaces around some punctuation added by PyQuery
if locale == 'en-US':
seo_summary = re.sub(r' ([,\)\.])', r'\1', seo_summary)
seo_summary = re.sub(r'(\() ', r'\1', seo_summary)

return seo_summary
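
The two re.sub passes above are the heart of this fix. A standalone sketch of the cleanup, using only the stdlib re module (the input string is a hypothetical example of the spacing PyQuery's text() can introduce around inline markup):

import re

raw = u'The Document Object Model ( DOM ) is an API , for example .'
cleaned = re.sub(r' ([,\)\.])', r'\1', raw)  # drop spaces before , ) .
cleaned = re.sub(r'(\() ', r'\1', cleaned)   # drop spaces after (
print(cleaned)
# The Document Object Model (DOM) is an API, for example.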


def compare_url(doc, from_id, to_id):
return (reverse('wiki.compare_revisions', args=[doc.full_path],
locale=doc.locale)
@@ -84,10 +134,13 @@ def _massage_diff_content(content):
def bugize_text(content):
content = jinja2.escape(content)
content = re.sub(r'bug\s+#?(\d+)',
jinja2.Markup('<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=\\1" target="_blank">bug \\1</a>'),
jinja2.Markup('<a href="https://bugzilla.mozilla.org/'
'show_bug.cgi?id=\\1" '
'target="_blank">bug \\1</a>'),
content)
return content
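
bugize_text itself is behaviorally unchanged here; only its long Markup string is reflowed. A quick usage sketch (the input text is hypothetical):

from wiki.helpers import bugize_text

print(bugize_text('See bug 840550'))
# See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=840550"
#     target="_blank">bug 840550</a>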


@register.function
def format_comment(rev):
""" Massages revision comment content after the fact """
@@ -97,7 +150,10 @@ def format_comment(rev):

# If a page move, say so
if prev_rev and prev_rev.slug != rev.slug:
comment += jinja2.Markup('<span class="slug-change">Moved From <strong>%s</strong> to <strong>%s</strong></span>') % (prev_rev.slug, rev.slug)
comment += jinja2.Markup('<span class="slug-change">'
'Moved From <strong>%s</strong> '
'to <strong>%s</strong></span>') % (
prev_rev.slug, rev.slug)

return comment
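
The %-interpolation into jinja2.Markup above is worth noting: markupsafe escapes interpolated values, so a slug containing markup cannot inject HTML into the comment. A minimal sketch with hypothetical slugs:

import jinja2

tmpl = jinja2.Markup('Moved From <strong>%s</strong> to <strong>%s</strong>')
print(tmpl % ('old/<slug>', 'new/slug'))
# Moved From <strong>old/&lt;slug&gt;</strong> to <strong>new/slug</strong>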

@@ -112,11 +168,11 @@ def diff_table(content_from, content_to, prev_id, curr_id):
to_lines = tidy_to.splitlines()
try:
diff = html_diff.make_table(from_lines, to_lines,
_("Revision %s") % prev_id,
_("Revision %s") % curr_id,
context=True,
numlines=constance.config.DIFF_CONTEXT_LINES
)
_("Revision %s") % prev_id,
_("Revision %s") % curr_id,
context=True,
numlines=constance.config.DIFF_CONTEXT_LINES
)
except RuntimeError:
# some diffs hit a max recursion error
message = _(u'There was an error generating the content.')
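
The diff_table change above is indentation only; the underlying call is difflib.HtmlDiff.make_table. A standalone sketch, with literals standing in for the DIFF_WRAP_COLUMN and constance DIFF_CONTEXT_LINES settings:

import difflib

html_diff = difflib.HtmlDiff(wrapcolumn=80)
table = html_diff.make_table('a\nb\n'.splitlines(), 'a\nc\n'.splitlines(),
                             'Revision 1', 'Revision 2',
                             context=True, numlines=3)
# `table` is an HTML <table> string showing the context diff.
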
50 changes: 50 additions & 0 deletions apps/wiki/tests/test_helpers.py
@@ -0,0 +1,50 @@
from nose.tools import eq_
from test_utils import TestCase

from wiki.helpers import get_seo_description


class GetSEODescriptionTests(TestCase):

def test_html_elements_spaces(self):
# No spaces with html tags
content = (u'<p><span class="seoSummary">The <strong>Document Object '
'Model'
'</strong> (<strong>DOM</strong>) is an API for '
'<a href="/en-US/docs/HTML" title="en-US/docs/HTML">HTML</a> and '
'<a href="/en-US/docs/XML" title="en-US/docs/XML">XML</a> '
'documents. It provides a structural representation of the '
'document, enabling you to modify its content and visual '
'presentation by using a scripting language such as '
'<a href="/en-US/docs/JavaScript" '
'title="https://developer.mozilla.org/en-US/docs/JavaScript">'
'JavaScript</a>.</span></p>')
expected = ('The Document Object Model (DOM) is an API for HTML and '
'XML'
' documents. It provides a structural representation of the'
' document, enabling you to modify its content and visual'
' presentation by using a scripting language such as'
' JavaScript.')
eq_(expected, get_seo_description(content, 'en-US'))

content = (u'<p><span class="seoSummary"><strong>Cascading Style '
'Sheets</strong>, most of the time abbreviated in '
'<strong>CSS</strong>, is a '
'<a href="/en-US/docs/DOM/stylesheet">stylesheet</a> '
'language used to describe the presentation of a document '
'written in <a href="/en-US/docs/HTML" title="The '
'HyperText Mark-up Language">HTML</a></span> or <a '
'href="/en-US/docs/XML" title="en-US/docs/XML">XML</a> '
'(including various XML languages like <a '
'href="/en-US/docs/SVG" title="en-US/docs/SVG">SVG</a> or '
'<a href="/en-US/docs/XHTML" '
'title="en-US/docs/XHTML">XHTML</a>)<span '
'class="seoSummary">. CSS describes how the structured '
'element must be rendered on screen, on paper, in speech, '
'or on other media.</span></p>')
expected = ('Cascading Style Sheets, most of the time abbreviated in '
'CSS, is a stylesheet language used to describe the '
'presentation of a document written in HTML. CSS '
'describes how the structured element must be rendered on '
'screen, on paper, in speech, or on other media.')
eq_(expected, get_seo_description(content, 'en-US'))
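
The second test is the interesting one: the document carries two .seoSummary spans, and pq.text() concatenates the text of every matched element (joined, as the expected strings imply, with single spaces), after which the en-US regex pass repairs the ' .' seam. A reduced sketch of that behavior:

import re
from pyquery import PyQuery as pq

html = ('<div><span class="seoSummary">written in HTML</span> or XML '
        '<span class="seoSummary">. CSS describes rendering.</span></div>')
merged = pq(html).find('.seoSummary').text()
# merged == 'written in HTML . CSS describes rendering.'
print(re.sub(r' ([,\)\.])', r'\1', merged))
# written in HTML. CSS describes rendering.
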
47 changes: 31 additions & 16 deletions apps/wiki/tests/test_views.py
@@ -711,20 +711,11 @@ def my_post(url, timeout=None, headers=None, data=None):
ok_(False, "Data wasn't posted as utf8")


class DocumentEditingTests(TestCaseBase):
"""Tests for the document-editing view"""
class DocumentSEOTests(TestCaseBase):
"""Tests for the document seo logic"""

fixtures = ['test_users.json']

def test_noindex_post(self):
client = LocalizingClient()
client.login(username='admin', password='testpass')

# Go to new document page to ensure no-index header works
response = client.get(reverse('wiki.new_document', args=[],
locale=settings.WIKI_DEFAULT_LANGUAGE))
eq_(response['X-Robots-Tag'], 'noindex')

def test_seo_title(self):
client = LocalizingClient()
client.login(username='admin', password='testpass')
@@ -809,6 +800,21 @@ def make_page_and_compare_seo(slug, content, aught_preview):
' <a href="blah">A link</a> is also &lt;cool&gt;</p>',
'I am awesome. A link is also cool')


class DocumentEditingTests(TestCaseBase):
"""Tests for the document-editing view"""

fixtures = ['test_users.json']

def test_noindex_post(self):
client = LocalizingClient()
client.login(username='admin', password='testpass')

# Go to new document page to ensure no-index header works
response = client.get(reverse('wiki.new_document', args=[],
locale=settings.WIKI_DEFAULT_LANGUAGE))
eq_(response['X-Robots-Tag'], 'noindex')

def test_create_on_404(self):
client = LocalizingClient()
client.login(username='admin', password='testpass')
@@ -1323,19 +1329,28 @@ def test_invalid_slug_translate(inv_slug, url, data):
def _run_translate_edit_tests(edit_slug, edit_data, edit_doc):

# Hit the initial URL
response = client.get(reverse('wiki.edit_document', args=[edit_doc.slug], locale=foreign_locale))
response = client.get(reverse('wiki.edit_document',
args=[edit_doc.slug],
locale=foreign_locale))
eq_(200, response.status_code)
page = pq(response.content)
eq_(edit_data['slug'], page.find('input[name=slug]')[0].value)

# Attempt an invalid edit of the root, ensure the slug stays the same (i.e. no parent prepending)
# Attempt an invalid edit of the root, ensure the slug stays
# the same (i.e. no parent prepending)
edit_data['slug'] = invalid_slug
edit_data['form'] = 'both'
response = client.post(reverse('wiki.edit_document', args=[edit_doc.slug], locale=foreign_locale), edit_data)
response = client.post(reverse('wiki.edit_document',
args=[edit_doc.slug],
locale=foreign_locale),
edit_data)
eq_(200, response.status_code) # 200 = bad, invalid data
page = pq(response.content)
eq_(invalid_slug, page.find('input[name=slug]')[0].value) # Slug doesn't add parent
self.assertContains(response, page.find('ul.errorlist li a[href="#id_slug"]').text())
# Slug doesn't add parent
eq_(invalid_slug, page.find('input[name=slug]')[0].value)
self.assertContains(response, page.find('ul.errorlist li'
' a[href="#id_slug"]').
text())
eq_(0, len(Document.objects.filter(title=edit_data['title'] + ' Redirect 1', locale=foreign_locale))) # Ensure no redirect

# Push a valid edit, without changing the slug
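
The reflowed errorlist assertion above leans on a pq attribute selector. A reduced sketch of the pattern against hypothetical form-error HTML:

from pyquery import PyQuery as pq

html = ('<div><ul class="errorlist"><li>'
        '<a href="#id_slug">Enter a valid slug.</a></li></ul></div>')
page = pq(html)
assert (page.find('ul.errorlist li a[href="#id_slug"]').text()
        == 'Enter a valid slug.')
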
67 changes: 6 additions & 61 deletions apps/wiki/views.py
@@ -1,11 +1,7 @@
# coding=utf-8

from datetime import datetime
import time
import json
from collections import defaultdict
import base64
import httplib
import hashlib
import logging
from urllib import urlencode
@@ -16,11 +12,6 @@
except:
from StringIO import StringIO

import requests
import bleach

from taggit.utils import parse_tags, edit_string_for_tags

try:
from functools import wraps
except ImportError:
@@ -44,7 +35,6 @@
import constance.config

from waffle.decorators import waffle_flag
from waffle import flag_is_active

import jingo
from tower import ugettext_lazy as _lazy
@@ -57,9 +47,9 @@

from access.decorators import permission_required, login_required
from sumo.helpers import urlparams
from sumo.urlresolvers import Prefixer, reverse
from sumo.urlresolvers import reverse
from sumo.utils import paginate, smart_int
from wiki import (DOCUMENTS_PER_PAGE, TEMPLATE_TITLE_PREFIX, ReadOnlyException)
from wiki import (DOCUMENTS_PER_PAGE, TEMPLATE_TITLE_PREFIX)
from wiki.decorators import check_readonly
from wiki.events import (EditDocumentEvent, ReviewableRevisionInLocaleEvent,
ApproveRevisionInLocaleEvent)
@@ -68,24 +58,21 @@
TreeMoveForm)
from wiki.models import (Document, Revision, HelpfulVote, EditorToolbar,
DocumentTag, ReviewTag, Attachment,
DocumentRenderingInProgress,
DocumentRenderedContentNotAvailable,
CATEGORIES,
OPERATING_SYSTEMS, GROUPED_OPERATING_SYSTEMS,
FIREFOX_VERSIONS, GROUPED_FIREFOX_VERSIONS,
REVIEW_FLAG_TAGS_DEFAULT, ALLOWED_ATTRIBUTES,
ALLOWED_TAGS, ALLOWED_STYLES,
REVIEW_FLAG_TAGS_DEFAULT,
DOCUMENT_LAST_MODIFIED_CACHE_KEY_TMPL,
get_current_or_latest_revision)
from wiki.tasks import send_reviewed_notification, schedule_rebuild_kb
from wiki.helpers import format_comment
from wiki.tasks import send_reviewed_notification
from wiki.helpers import format_comment, get_seo_description
import wiki.content
from wiki import kumascript

from pyquery import PyQuery as pq
from django.utils.safestring import mark_safe

import logging

log = logging.getLogger('k.wiki')

@@ -303,48 +290,6 @@ def _get_document_for_json(doc, addLocaleToTitle=False):
}


def get_seo_description(content):
# Create an SEO summary
# TODO: Google only takes the first 180 characters, so maybe we find a
# logical way to find the end of sentence before 180?
seo_summary = ''
try:
if content:
# Need to add a BR to the page content otherwise pyQuery wont find
# a <p></p> element if it's the only element in the doc_html
seo_analyze_doc_html = content + '<br />'
page = pq(seo_analyze_doc_html)

# Look for the SEO summary class first
summaryClasses = page.find('.seoSummary')
if len(summaryClasses):
seo_summary = summaryClasses.text()
else:
paragraphs = page.find('p')
if paragraphs.length:
for p in range(len(paragraphs)):
item = paragraphs.eq(p)
text = item.text()
# Checking for a parent length of 2
# because we don't want p's wrapped
# in DIVs ("<div class='warning'>") and pyQuery adds
# "<html><div>" wrapping to entire document
if (len(text) and
not 'Redirect' in text and
text.find(u'«') == -1 and
text.find('&laquo') == -1 and
item.parents().length == 2):
seo_summary = text.strip()
break
except:
pass

# Post-found cleanup
seo_summary = seo_summary.replace('<', '').replace('>', '')

return seo_summary


@csrf_exempt
@require_http_methods(['GET', 'PUT', 'HEAD'])
@accepts_auth_key
@@ -578,7 +523,7 @@ def set_common_headers(r):
# Get the SEO summary
seo_summary = ''
if not doc.is_template:
seo_summary = get_seo_description(doc_html)
seo_summary = get_seo_description(doc_html, doc.locale)

# Get the additional title information, if necessary
seo_parent_title = ''
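
With the helper relocated out of views.py, the view now passes the document's locale so the punctuation cleanup only runs for en-US. A minimal usage sketch (doc_html is a hypothetical value; in the view it comes from the rendered document):

from wiki.helpers import get_seo_description

doc_html = '<p><span class="seoSummary">A short page summary.</span></p>'
seo_summary = get_seo_description(doc_html, 'en-US')
# seo_summary == 'A short page summary.'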
