Merge pull request #857 from groovecoder/seo-meta-html-spaces-840550
fix bug 840550 - fix PyQuery punctuation spacing
darkwing committed Feb 12, 2013
2 parents fc70f2e + b178b59 commit 0e269c8
Showing 4 changed files with 150 additions and 84 deletions.
70 changes: 63 additions & 7 deletions apps/wiki/helpers.py
@@ -1,10 +1,13 @@
# coding=utf-8

import difflib
import re
import urllib

import constance.config
from jingo import register
import jinja2
from pyquery import PyQuery as pq
from tidylib import tidy_document
from tower import ugettext as _
import logging
@@ -13,6 +16,53 @@
from wiki import DIFF_WRAP_COLUMN


def get_seo_description(content, locale=None):
# Create an SEO summary
# TODO: Google only takes the first 180 characters, so maybe we find a
# logical way to find the end of sentence before 180?
seo_summary = ''
try:
if content:
# Need to add a BR to the page content, otherwise pyQuery won't find
# a <p></p> element if it's the only element in the doc_html
seo_analyze_doc_html = content + '<br />'
page = pq(seo_analyze_doc_html)

# Look for the SEO summary class first
summaryClasses = page.find('.seoSummary')
if len(summaryClasses):
seo_summary = summaryClasses.text()
else:
paragraphs = page.find('p')
if paragraphs.length:
for p in range(len(paragraphs)):
item = paragraphs.eq(p)
text = item.text()
# Checking for a parent length of 2 because we
# don't want p's wrapped in DIVs (e.g.
# "<div class='warning'>"); pyQuery adds an
# "<html><div>" wrapper around the entire document
if (len(text) and
not 'Redirect' in text and
text.find(u'«') == -1 and
text.find('&laquo') == -1 and
item.parents().length == 2):
seo_summary = text.strip()
break
except:
pass

# Post-extraction cleanup
# remove stray markup characters
seo_summary = seo_summary.replace('<', '').replace('>', '')
# remove spaces around some punctuation added by PyQuery
if locale == 'en-US':
seo_summary = re.sub(r' ([,\)\.])', r'\1', seo_summary)
seo_summary = re.sub(r'(\() ', r'\1', seo_summary)

return seo_summary
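
The two re.sub passes above are the heart of this fix. A standalone sketch of the cleanup, using only the stdlib re module (the input string is a hypothetical example of the spacing PyQuery's text() can introduce around inline markup):

import re

raw = u'The Document Object Model ( DOM ) is an API , for example .'
cleaned = re.sub(r' ([,\)\.])', r'\1', raw)  # drop spaces before , ) .
cleaned = re.sub(r'(\() ', r'\1', cleaned)   # drop spaces after (
print(cleaned)
# The Document Object Model (DOM) is an API, for example.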


def compare_url(doc, from_id, to_id):
return (reverse('wiki.compare_revisions', args=[doc.full_path],
locale=doc.locale)
@@ -84,10 +134,13 @@ def _massage_diff_content(content):
def bugize_text(content):
content = jinja2.escape(content)
content = re.sub(r'bug\s+#?(\d+)',
jinja2.Markup('<a href="https://bugzilla.mozilla.org/show_bug.cgi?id=\\1" target="_blank">bug \\1</a>'),
jinja2.Markup('<a href="https://bugzilla.mozilla.org/'
'show_bug.cgi?id=\\1" '
'target="_blank">bug \\1</a>'),
content)
return content
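
bugize_text itself is behaviorally unchanged here; only its long Markup string is reflowed. A quick usage sketch (the input text is hypothetical):

from wiki.helpers import bugize_text

print(bugize_text('See bug 840550'))
# See <a href="https://bugzilla.mozilla.org/show_bug.cgi?id=840550"
#     target="_blank">bug 840550</a>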


@register.function
def format_comment(rev):
""" Massages revision comment content after the fact """
@@ -97,7 +150,10 @@ def format_comment(rev):

# If a page move, say so
if prev_rev and prev_rev.slug != rev.slug:
comment += jinja2.Markup('<span class="slug-change">Moved From <strong>%s</strong> to <strong>%s</strong></span>') % (prev_rev.slug, rev.slug)
comment += jinja2.Markup('<span class="slug-change">'
'Moved From <strong>%s</strong> '
'to <strong>%s</strong></span>') % (
prev_rev.slug, rev.slug)

return comment
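
The %-interpolation into jinja2.Markup above is worth noting: markupsafe escapes interpolated values, so a slug containing markup cannot inject HTML into the comment. A minimal sketch with hypothetical slugs:

import jinja2

tmpl = jinja2.Markup('Moved From <strong>%s</strong> to <strong>%s</strong>')
print(tmpl % ('old/<slug>', 'new/slug'))
# Moved From <strong>old/&lt;slug&gt;</strong> to <strong>new/slug</strong>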

@@ -112,11 +168,11 @@ def diff_table(content_from, content_to, prev_id, curr_id):
to_lines = tidy_to.splitlines()
try:
diff = html_diff.make_table(from_lines, to_lines,
_("Revision %s") % prev_id,
_("Revision %s") % curr_id,
context=True,
numlines=constance.config.DIFF_CONTEXT_LINES
)
_("Revision %s") % prev_id,
_("Revision %s") % curr_id,
context=True,
numlines=constance.config.DIFF_CONTEXT_LINES
)
except RuntimeError:
# some diffs hit a max recursion error
message = _(u'There was an error generating the content.')
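
The diff_table change above is indentation only; the underlying call is difflib.HtmlDiff.make_table. A standalone sketch, with literals standing in for the DIFF_WRAP_COLUMN and constance DIFF_CONTEXT_LINES settings:

import difflib

html_diff = difflib.HtmlDiff(wrapcolumn=80)
table = html_diff.make_table('a\nb\n'.splitlines(), 'a\nc\n'.splitlines(),
                             'Revision 1', 'Revision 2',
                             context=True, numlines=3)
# `table` is an HTML <table> string showing the context diff.
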
50 changes: 50 additions & 0 deletions apps/wiki/tests/test_helpers.py
@@ -0,0 +1,50 @@
from nose.tools import eq_
from test_utils import TestCase

from wiki.helpers import get_seo_description


class GetSEODescriptionTests(TestCase):

def test_html_elements_spaces(self):
# No spaces with html tags
content = (u'<p><span class="seoSummary">The <strong>Document Object '
'Model'
'</strong> (<strong>DOM</strong>) is an API for '
'<a href="/en-US/docs/HTML" title="en-US/docs/HTML">HTML</a> and '
'<a href="/en-US/docs/XML" title="en-US/docs/XML">XML</a> '
'documents. It provides a structural representation of the '
'document, enabling you to modify its content and visual '
'presentation by using a scripting language such as '
'<a href="/en-US/docs/JavaScript" '
'title="https://developer.mozilla.org/en-US/docs/JavaScript">'
'JavaScript</a>.</span></p>')
expected = ('The Document Object Model (DOM) is an API for HTML and '
'XML'
' documents. It provides a structural representation of the'
' document, enabling you to modify its content and visual'
' presentation by using a scripting language such as'
' JavaScript.')
eq_(expected, get_seo_description(content, 'en-US'))

content = (u'<p><span class="seoSummary"><strong>Cascading Style '
'Sheets</strong>, most of the time abbreviated in '
'<strong>CSS</strong>, is a '
'<a href="/en-US/docs/DOM/stylesheet">stylesheet</a> '
'language used to describe the presentation of a document '
'written in <a href="/en-US/docs/HTML" title="The '
'HyperText Mark-up Language">HTML</a></span> or <a '
'href="/en-US/docs/XML" title="en-US/docs/XML">XML</a> '
'(including various XML languages like <a '
'href="/en-US/docs/SVG" title="en-US/docs/SVG">SVG</a> or '
'<a href="/en-US/docs/XHTML" '
'title="en-US/docs/XHTML">XHTML</a>)<span '
'class="seoSummary">. CSS describes how the structured '
'element must be rendered on screen, on paper, in speech, '
'or on other media.</span></p>')
expected = ('Cascading Style Sheets, most of the time abbreviated in '
'CSS, is a stylesheet language used to describe the '
'presentation of a document written in HTML. CSS '
'describes how the structured element must be rendered on '
'screen, on paper, in speech, or on other media.')
eq_(expected, get_seo_description(content, 'en-US'))
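
The second test is the interesting one: the document carries two .seoSummary spans, and pq.text() concatenates the text of every matched element (joined, as the expected strings imply, with single spaces), after which the en-US regex pass repairs the ' .' seam. A reduced sketch of that behavior:

import re
from pyquery import PyQuery as pq

html = ('<div><span class="seoSummary">written in HTML</span> or XML '
        '<span class="seoSummary">. CSS describes rendering.</span></div>')
merged = pq(html).find('.seoSummary').text()
# merged == 'written in HTML . CSS describes rendering.'
print(re.sub(r' ([,\)\.])', r'\1', merged))
# written in HTML. CSS describes rendering.
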
47 changes: 31 additions & 16 deletions apps/wiki/tests/test_views.py
@@ -711,20 +711,11 @@ def my_post(url, timeout=None, headers=None, data=None):
ok_(False, "Data wasn't posted as utf8")


class DocumentEditingTests(TestCaseBase):
"""Tests for the document-editing view"""
class DocumentSEOTests(TestCaseBase):
"""Tests for the document seo logic"""

fixtures = ['test_users.json']

def test_noindex_post(self):
client = LocalizingClient()
client.login(username='admin', password='testpass')

# Go to new document page to ensure no-index header works
response = client.get(reverse('wiki.new_document', args=[],
locale=settings.WIKI_DEFAULT_LANGUAGE))
eq_(response['X-Robots-Tag'], 'noindex')

def test_seo_title(self):
client = LocalizingClient()
client.login(username='admin', password='testpass')
@@ -809,6 +800,21 @@ def make_page_and_compare_seo(slug, content, aught_preview):
' <a href="blah">A link</a> is also &lt;cool&gt;</p>',
'I am awesome. A link is also cool')


class DocumentEditingTests(TestCaseBase):
"""Tests for the document-editing view"""

fixtures = ['test_users.json']

def test_noindex_post(self):
client = LocalizingClient()
client.login(username='admin', password='testpass')

# Go to new document page to ensure no-index header works
response = client.get(reverse('wiki.new_document', args=[],
locale=settings.WIKI_DEFAULT_LANGUAGE))
eq_(response['X-Robots-Tag'], 'noindex')

def test_create_on_404(self):
client = LocalizingClient()
client.login(username='admin', password='testpass')
@@ -1323,19 +1329,28 @@ def test_invalid_slug_translate(inv_slug, url, data):
def _run_translate_edit_tests(edit_slug, edit_data, edit_doc):

# Hit the initial URL
response = client.get(reverse('wiki.edit_document', args=[edit_doc.slug], locale=foreign_locale))
response = client.get(reverse('wiki.edit_document',
args=[edit_doc.slug],
locale=foreign_locale))
eq_(200, response.status_code)
page = pq(response.content)
eq_(edit_data['slug'], page.find('input[name=slug]')[0].value)

# Attempt an invalid edit of the root, ensure the slug stays the same (i.e. no parent prepending)
# Attempt an invalid edit of the root, ensure the slug stays
# the same (i.e. no parent prepending)
edit_data['slug'] = invalid_slug
edit_data['form'] = 'both'
response = client.post(reverse('wiki.edit_document', args=[edit_doc.slug], locale=foreign_locale), edit_data)
response = client.post(reverse('wiki.edit_document',
args=[edit_doc.slug],
locale=foreign_locale),
edit_data)
eq_(200, response.status_code) # 200 = bad, invalid data
page = pq(response.content)
eq_(invalid_slug, page.find('input[name=slug]')[0].value) # Slug doesn't add parent
self.assertContains(response, page.find('ul.errorlist li a[href="#id_slug"]').text())
# Slug doesn't add parent
eq_(invalid_slug, page.find('input[name=slug]')[0].value)
self.assertContains(response, page.find('ul.errorlist li'
' a[href="#id_slug"]').
text())
eq_(0, len(Document.objects.filter(title=edit_data['title'] + ' Redirect 1', locale=foreign_locale))) # Ensure no redirect

# Push a valid edit, without changing the slug
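
The reflowed errorlist assertion above leans on a pq attribute selector. A reduced sketch of the pattern against hypothetical form-error HTML:

from pyquery import PyQuery as pq

html = ('<div><ul class="errorlist"><li>'
        '<a href="#id_slug">Enter a valid slug.</a></li></ul></div>')
page = pq(html)
assert (page.find('ul.errorlist li a[href="#id_slug"]').text()
        == 'Enter a valid slug.')
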
67 changes: 6 additions & 61 deletions apps/wiki/views.py
@@ -1,11 +1,7 @@
# coding=utf-8

from datetime import datetime
import time
import json
from collections import defaultdict
import base64
import httplib
import hashlib
import logging
from urllib import urlencode
@@ -16,11 +12,6 @@
except:
from StringIO import StringIO

import requests
import bleach

from taggit.utils import parse_tags, edit_string_for_tags

try:
from functools import wraps
except ImportError:
@@ -44,7 +35,6 @@
import constance.config

from waffle.decorators import waffle_flag
from waffle import flag_is_active

import jingo
from tower import ugettext_lazy as _lazy
@@ -57,9 +47,9 @@

from access.decorators import permission_required, login_required
from sumo.helpers import urlparams
from sumo.urlresolvers import Prefixer, reverse
from sumo.urlresolvers import reverse
from sumo.utils import paginate, smart_int
from wiki import (DOCUMENTS_PER_PAGE, TEMPLATE_TITLE_PREFIX, ReadOnlyException)
from wiki import (DOCUMENTS_PER_PAGE, TEMPLATE_TITLE_PREFIX)
from wiki.decorators import check_readonly
from wiki.events import (EditDocumentEvent, ReviewableRevisionInLocaleEvent,
ApproveRevisionInLocaleEvent)
@@ -68,24 +58,21 @@
TreeMoveForm)
from wiki.models import (Document, Revision, HelpfulVote, EditorToolbar,
DocumentTag, ReviewTag, Attachment,
DocumentRenderingInProgress,
DocumentRenderedContentNotAvailable,
CATEGORIES,
OPERATING_SYSTEMS, GROUPED_OPERATING_SYSTEMS,
FIREFOX_VERSIONS, GROUPED_FIREFOX_VERSIONS,
REVIEW_FLAG_TAGS_DEFAULT, ALLOWED_ATTRIBUTES,
ALLOWED_TAGS, ALLOWED_STYLES,
REVIEW_FLAG_TAGS_DEFAULT,
DOCUMENT_LAST_MODIFIED_CACHE_KEY_TMPL,
get_current_or_latest_revision)
from wiki.tasks import send_reviewed_notification, schedule_rebuild_kb
from wiki.helpers import format_comment
from wiki.tasks import send_reviewed_notification
from wiki.helpers import format_comment, get_seo_description
import wiki.content
from wiki import kumascript

from pyquery import PyQuery as pq
from django.utils.safestring import mark_safe

import logging

log = logging.getLogger('k.wiki')

@@ -303,48 +290,6 @@ def _get_document_for_json(doc, addLocaleToTitle=False):
}


def get_seo_description(content):
# Create an SEO summary
# TODO: Google only takes the first 180 characters, so maybe we find a
# logical way to find the end of sentence before 180?
seo_summary = ''
try:
if content:
# Need to add a BR to the page content otherwise pyQuery wont find
# a <p></p> element if it's the only element in the doc_html
seo_analyze_doc_html = content + '<br />'
page = pq(seo_analyze_doc_html)

# Look for the SEO summary class first
summaryClasses = page.find('.seoSummary')
if len(summaryClasses):
seo_summary = summaryClasses.text()
else:
paragraphs = page.find('p')
if paragraphs.length:
for p in range(len(paragraphs)):
item = paragraphs.eq(p)
text = item.text()
# Checking for a parent length of 2
# because we don't want p's wrapped
# in DIVs ("<div class='warning'>") and pyQuery adds
# "<html><div>" wrapping to entire document
if (len(text) and
not 'Redirect' in text and
text.find(u'«') == -1 and
text.find('&laquo') == -1 and
item.parents().length == 2):
seo_summary = text.strip()
break
except:
pass

# Post-found cleanup
seo_summary = seo_summary.replace('<', '').replace('>', '')

return seo_summary


@csrf_exempt
@require_http_methods(['GET', 'PUT', 'HEAD'])
@accepts_auth_key
@@ -578,7 +523,7 @@ def set_common_headers(r):
# Get the SEO summary
seo_summary = ''
if not doc.is_template:
seo_summary = get_seo_description(doc_html)
seo_summary = get_seo_description(doc_html, doc.locale)

# Get the additional title information, if necessary
seo_parent_title = ''
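
With the helper relocated out of views.py, the view now passes the document's locale so the punctuation cleanup only runs for en-US. A minimal usage sketch (doc_html is a hypothetical value; in the view it comes from the rendered document):

from wiki.helpers import get_seo_description

doc_html = '<p><span class="seoSummary">A short page summary.</span></p>'
seo_summary = get_seo_description(doc_html, 'en-US')
# seo_summary == 'A short page summary.'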
