Permalink
Browse files

fix bug 840550 - fix PyQuery punctuation spacing

  • Loading branch information...
1 parent 07d2cc3 commit 87c326191c7ae80c9a72f53ca331741aedc563c1 @groovecoder groovecoder committed Feb 12, 2013
Showing with 99 additions and 55 deletions.
  1. +50 −0 apps/wiki/helpers.py
  2. +30 −0 apps/wiki/tests/test_helpers.py
  3. +17 −11 apps/wiki/tests/test_views.py
  4. +2 −44 apps/wiki/views.py
View
50 apps/wiki/helpers.py
@@ -1,10 +1,13 @@
+# coding=utf-8
+
import difflib
import re
import urllib
import constance.config
from jingo import register
import jinja2
+from pyquery import PyQuery as pq
from tidylib import tidy_document
from tower import ugettext as _
import logging
@@ -13,6 +16,53 @@
from wiki import DIFF_WRAP_COLUMN
+def get_seo_description(content, locale=None):
+ # Create an SEO summary
+ # TODO: Google only takes the first 180 characters, so maybe we find a
+ # logical way to find the end of sentence before 180?
+ seo_summary = ''
+ try:
+ if content:
+ # Need to add a BR to the page content otherwise pyQuery wont find
+ # a <p></p> element if it's the only element in the doc_html
+ seo_analyze_doc_html = content + '<br />'
+ page = pq(seo_analyze_doc_html)
+
+ # Look for the SEO summary class first
+ summaryClasses = page.find('.seoSummary')
+ if len(summaryClasses):
+ seo_summary = summaryClasses.text()
+ else:
+ paragraphs = page.find('p')
+ if paragraphs.length:
+ for p in range(len(paragraphs)):
+ item = paragraphs.eq(p)
+ text = item.text()
+ # Checking for a parent length of 2
+ # because we don't want p's wrapped
+ # in DIVs ("<div class='warning'>") and pyQuery adds
+ # "<html><div>" wrapping to entire document
+ if (len(text) and
+ not 'Redirect' in text and
+ text.find(u'«') == -1 and
+ text.find('&laquo') == -1 and
+ item.parents().length == 2):
+ seo_summary = text.strip()
+ break
+ except:
+ pass
+
+ # Post-found cleanup
+ # remove markup chars
+ seo_summary = seo_summary.replace('<', '').replace('>', '')
+ # remove spaces around some punctuation added by PyQuery
+ if locale == 'en-US':
+ seo_summary = re.sub(r' ([,\)\.])', r'\1', seo_summary)
+ seo_summary = re.sub(r'(\() ', r'\1', seo_summary)
+
+ return seo_summary
+
+
def compare_url(doc, from_id, to_id):
return (reverse('wiki.compare_revisions', args=[doc.full_path],
locale=doc.locale)
View
30 apps/wiki/tests/test_helpers.py
@@ -0,0 +1,30 @@
+from nose.tools import eq_
+from test_utils import TestCase
+
+from wiki.helpers import get_seo_description
+
+
+class GetSEODescriptionTests(TestCase):
+
+ def test_html_elements_spaces(self):
+ # No spaces with html tags
+ content = (u'<p><span class="seoSummary">The <strong>Document Object Model'
+ '</strong> (<strong>DOM</strong>) is an API for '
+ '<a href="/en-US/docs/HTML" title="en-US/docs/HTML">HTML</a> and '
+ '<a href="/en-US/docs/XML" title="en-US/docs/XML">XML</a> '
+ 'documents. It provides a structural representation of the '
+ 'document, enabling you to modify its content and visual '
+ 'presentation by using a scripting language such as '
+ '<a href="/en-US/docs/JavaScript" '
+ 'title="https://developer.mozilla.org/en-US/docs/JavaScript">'
+ 'JavaScript</a>.</span></p>')
+ expected = ('The Document Object Model (DOM) is an API for HTML and XML'
+ ' documents. It provides a structural representation of the'
+ ' document, enabling you to modify its content and visual'
+ ' presentation by using a scripting language such as'
+ ' JavaScript.')
+ eq_(expected, get_seo_description(content, 'en-US'))
+
+ content = (u'<p><span class="seoSummary"><strong>Cascading Style Sheets</strong>, most of the time abbreviated in <strong>CSS</strong>, is a <a href="/en-US/docs/DOM/stylesheet">stylesheet</a> language used to describe the presentation of a document written in <a href="/en-US/docs/HTML" title="The HyperText Mark-up Language">HTML</a></span> or <a href="/en-US/docs/XML" title="en-US/docs/XML">XML</a> (including various XML languages like <a href="/en-US/docs/SVG" title="en-US/docs/SVG">SVG</a> or <a href="/en-US/docs/XHTML" title="en-US/docs/XHTML">XHTML</a>)<span class="seoSummary">. CSS describes how the structured element must be rendered on screen, on paper, in speech, or on other media.</span></p>')
+ expected = ('Cascading Style Sheets, most of the time abbreviated in CSS, is a stylesheet language used to describe the presentation of a document written in HTML. CSS describes how the structured element must be rendered on screen, on paper, in speech, or on other media.')
+ eq_(expected, get_seo_description(content, 'en-US'))
View
28 apps/wiki/tests/test_views.py
@@ -711,20 +711,11 @@ def my_post(url, timeout=None, headers=None, data=None):
ok_(False, "Data wasn't posted as utf8")
-class DocumentEditingTests(TestCaseBase):
- """Tests for the document-editing view"""
+class DocumentSEOTests(TestCaseBase):
+ """Tests for the document seo logic"""
fixtures = ['test_users.json']
- def test_noindex_post(self):
- client = LocalizingClient()
- client.login(username='admin', password='testpass')
-
- # Go to new document page to ensure no-index header works
- response = client.get(reverse('wiki.new_document', args=[],
- locale=settings.WIKI_DEFAULT_LANGUAGE))
- eq_(response['X-Robots-Tag'], 'noindex')
-
def test_seo_title(self):
client = LocalizingClient()
client.login(username='admin', password='testpass')
@@ -809,6 +800,21 @@ def make_page_and_compare_seo(slug, content, aught_preview):
' <a href="blah">A link</a> is also &lt;cool&gt;</p>',
'I am awesome. A link is also cool')
+
+class DocumentEditingTests(TestCaseBase):
+ """Tests for the document-editing view"""
+
+ fixtures = ['test_users.json']
+
+ def test_noindex_post(self):
+ client = LocalizingClient()
+ client.login(username='admin', password='testpass')
+
+ # Go to new document page to ensure no-index header works
+ response = client.get(reverse('wiki.new_document', args=[],
+ locale=settings.WIKI_DEFAULT_LANGUAGE))
+ eq_(response['X-Robots-Tag'], 'noindex')
+
def test_create_on_404(self):
client = LocalizingClient()
client.login(username='admin', password='testpass')
View
46 apps/wiki/views.py
@@ -78,7 +78,7 @@
DOCUMENT_LAST_MODIFIED_CACHE_KEY_TMPL,
get_current_or_latest_revision)
from wiki.tasks import send_reviewed_notification, schedule_rebuild_kb
-from wiki.helpers import format_comment
+from wiki.helpers import format_comment, get_seo_description
import wiki.content
from wiki import kumascript
@@ -303,48 +303,6 @@ def _get_document_for_json(doc, addLocaleToTitle=False):
}
-def get_seo_description(content):
- # Create an SEO summary
- # TODO: Google only takes the first 180 characters, so maybe we find a
- # logical way to find the end of sentence before 180?
- seo_summary = ''
- try:
- if content:
- # Need to add a BR to the page content otherwise pyQuery wont find
- # a <p></p> element if it's the only element in the doc_html
- seo_analyze_doc_html = content + '<br />'
- page = pq(seo_analyze_doc_html)
-
- # Look for the SEO summary class first
- summaryClasses = page.find('.seoSummary')
- if len(summaryClasses):
- seo_summary = summaryClasses.text()
- else:
- paragraphs = page.find('p')
- if paragraphs.length:
- for p in range(len(paragraphs)):
- item = paragraphs.eq(p)
- text = item.text()
- # Checking for a parent length of 2
- # because we don't want p's wrapped
- # in DIVs ("<div class='warning'>") and pyQuery adds
- # "<html><div>" wrapping to entire document
- if (len(text) and
- not 'Redirect' in text and
- text.find(u'«') == -1 and
- text.find('&laquo') == -1 and
- item.parents().length == 2):
- seo_summary = text.strip()
- break
- except:
- pass
-
- # Post-found cleanup
- seo_summary = seo_summary.replace('<', '').replace('>', '')
-
- return seo_summary
-
-
@csrf_exempt
@require_http_methods(['GET', 'PUT', 'HEAD'])
@accepts_auth_key
@@ -578,7 +536,7 @@ def set_common_headers(r):
# Get the SEO summary
seo_summary = ''
if not doc.is_template:
- seo_summary = get_seo_description(doc_html)
+ seo_summary = get_seo_description(doc_html, doc.locale)
# Get the additional title information, if necessary
seo_parent_title = ''

0 comments on commit 87c3261

Please sign in to comment.