Skip to content

Commit

Permalink
Fixed issue #5 from GitHub: "Exception on certain URL's - {"The prefi…
Browse files Browse the repository at this point in the history
…x '' cannot be redefined from '' to 'http://www.w3.org/1999/xhtml' within the same start element tag."}".
  • Loading branch information
marek-stoj committed Apr 17, 2012
1 parent b2231f5 commit e85013e
Show file tree
Hide file tree
Showing 3 changed files with 55 additions and 42 deletions.
2 changes: 1 addition & 1 deletion NReadability.build
@@ -1,7 +1,7 @@
<?xml version="1.0" encoding="utf-8" ?>
<project name="NReadability" default="all">

<property name="nreadability.version" value="1.4.1.0" />
<property name="nreadability.version" value="1.4.2.0" />
<property name="msbuild.path" value="C:\Windows\Microsoft.NET\Framework\v3.5\MSBuild.exe" />
<property name="git.path" value="C:\Program Files (x86)\Git\bin\git.exe" />
<property name="nuget.path" value="C:\Programs\NuGet\NuGet.exe" />
Expand Down
2 changes: 1 addition & 1 deletion NReadability.nuspec
Expand Up @@ -2,7 +2,7 @@
<package >
<metadata>
<id>NReadability</id>
<version>1.4.1</version>
<version>1.4.2</version>
<authors>Marek Stój</authors>
<owners>Immortal</owners>
<projectUrl>https://github.com/marek-stoj/NReadability</projectUrl>
Expand Down
93 changes: 53 additions & 40 deletions Src/NReadability/NReadability/NReadabilityTranscoder.cs
Expand Up @@ -264,7 +264,13 @@ public string Transcode(string htmlContent, string url, DomSerializationParams d
{
string extractedTitle;

var document = TranscodeToXml(htmlContent, url, out mainContentExtracted, out extractedTitle, out nextPageUrl);
XDocument document =
TranscodeToXml(
htmlContent,
url,
out mainContentExtracted,
out extractedTitle,
out nextPageUrl);

return _sgmlDomSerializer.SerializeDocument(document, domSerializationParams);
}
Expand Down Expand Up @@ -332,7 +338,7 @@ internal XDocument TranscodeToXml(string htmlContent, string url, out bool mainC
throw new ArgumentNullException("htmlContent");
}

var document = _sgmlDomBuilder.BuildDocument(htmlContent);
XDocument document = _sgmlDomBuilder.BuildDocument(htmlContent);

PrepareDocument(document);

Expand All @@ -349,8 +355,8 @@ internal XDocument TranscodeToXml(string htmlContent, string url, out bool mainC
nextPageUrl = FindNextPageLink(document.GetBody(), url);
}

var articleTitleElement = ExtractArticleTitle(document);
var articleContentElement = ExtractArticleContent(document);
XElement articleTitleElement = ExtractArticleTitle(document);
XElement articleContentElement = ExtractArticleContent(document);

GlueDocument(document, articleTitleElement, articleContentElement);

Expand Down Expand Up @@ -560,7 +566,7 @@ internal string FindNextPageLink(XElement body, string url)
*/
LinkData topPage = null;

foreach (var page in possiblePagesByLink.Keys)
foreach (string page in possiblePagesByLink.Keys)
{
if (possiblePagesByLink[page].Score >= 50 && (topPage == null || topPage.Score < possiblePagesByLink[page].Score))
{
Expand Down Expand Up @@ -594,7 +600,7 @@ internal string FindBaseUrl(string url)
string protocol = urlUri.Scheme;
string hostname = urlUri.Host;
string noUrlParams = urlUri.AbsolutePath + "/";
var urlSlashes = noUrlParams.Split('/').Reverse().ToList();
List<string> urlSlashes = noUrlParams.Split('/').Reverse().ToList();
var cleanedSegments = new List<string>();
int slashLen = urlSlashes.Count();

Expand Down Expand Up @@ -664,8 +670,8 @@ internal void PrepareDocument(XDocument document)
{
/* In some cases a body element can't be found (if the HTML is totally hosed for example),
* so we create a new body element and append it to the document. */
var documentBody = GetOrCreateBody(document);
var rootElement = document.Root;
XElement documentBody = GetOrCreateBody(document);
XElement rootElement = document.Root;

// TODO: handle HTML frames

Expand Down Expand Up @@ -717,7 +723,7 @@ internal void PrepareDocument(XDocument document)

internal XElement ExtractArticleTitle(XDocument document)
{
var documentBody = GetOrCreateBody(document);
XElement documentBody = GetOrCreateBody(document);
string documentTitle = document.GetTitle() ?? "";
string currentTitle = documentTitle;

Expand Down Expand Up @@ -780,7 +786,7 @@ internal XElement ExtractArticleContent(XDocument document)
StripUnlikelyCandidates(document);
CollapseRedundantParagraphDivs(document);

var candidatesForArticleContent = FindCandidatesForArticleContent(document);
IEnumerable<XElement> candidatesForArticleContent = FindCandidatesForArticleContent(document);

XElement topCandidateElement = DetermineTopCandidateElement(document, candidatesForArticleContent);
XElement articleContentElement = CreateArticleContentElement(document, topCandidateElement);
Expand All @@ -792,22 +798,22 @@ internal XElement ExtractArticleContent(XDocument document)

internal void GlueDocument(XDocument document, XElement articleTitleElement, XElement articleContentElement)
{
var documentBody = GetOrCreateBody(document);
XElement documentBody = GetOrCreateBody(document);

/* Include readability.css stylesheet. */
var headElement = document.GetElementsByTagName("head").FirstOrDefault();
XElement headElement = document.GetElementsByTagName("head").FirstOrDefault();

if (headElement == null)
{
headElement = new XElement("head");
documentBody.AddBeforeSelf(headElement);
}

var styleElement = new XElement("style");
XElement styleElement = new XElement("style");

styleElement.SetAttributeValue("type", "text/css");

var readabilityStylesheetStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(_ReadabilityStylesheetResourceName);
Stream readabilityStylesheetStream = Assembly.GetExecutingAssembly().GetManifestResourceStream(_ReadabilityStylesheetResourceName);

if (readabilityStylesheetStream == null)
{
Expand Down Expand Up @@ -862,7 +868,7 @@ internal void StripUnlikelyCandidates(XDocument document)
return;
}

var rootElement = document.Root;
XElement rootElement = document.Root;

new ElementsTraverser(
element =>
Expand All @@ -878,7 +884,7 @@ internal void StripUnlikelyCandidates(XDocument document)
&& _UnlikelyCandidatesRegex.IsMatch(unlikelyMatchString)
&& !_OkMaybeItsACandidateRegex.IsMatch(unlikelyMatchString))
{
var parentElement = element.Parent;
XElement parentElement = element.Parent;
if (parentElement != null)
{
Expand All @@ -895,7 +901,7 @@ internal void StripUnlikelyCandidates(XDocument document)
if (!_DivToPElementsRegex.IsMatch(element.GetInnerHtml()))
{
// no block elements inside - change to p
element.Name = "p";
SetElementName(element, "p");
}
else
{
Expand All @@ -909,7 +915,7 @@ internal void StripUnlikelyCandidates(XDocument document)
return;
}
var paraElement = new XElement("p");
XElement paraElement = new XElement("p");
// note that we're not using GetInnerText() here; instead we're getting raw InnerText to preserve whitespaces
paraElement.SetInnerHtml(((XText)childNode).Value);
Expand All @@ -927,7 +933,7 @@ internal void StripUnlikelyCandidates(XDocument document)

internal void CollapseRedundantParagraphDivs(XDocument document)
{
var rootElement = document.Root;
XElement rootElement = document.Root;

new ElementsTraverser(
element =>
Expand Down Expand Up @@ -960,12 +966,12 @@ internal void CollapseRedundantParagraphDivs(XDocument document)

internal IEnumerable<XElement> FindCandidatesForArticleContent(XDocument document)
{
var paraElements = document.GetElementsByTagName("p");
IEnumerable<XElement> paraElements = document.GetElementsByTagName("p");
var candidateElements = new HashSet<XElement>();

_elementsScores.Clear();

foreach (var paraElement in paraElements)
foreach (XElement paraElement in paraElements)
{
string innerText = GetInnerText(paraElement);

Expand All @@ -974,8 +980,8 @@ internal IEnumerable<XElement> FindCandidatesForArticleContent(XDocument documen
continue;
}

var parentElement = paraElement.Parent;
var grandParentElement = parentElement != null ? parentElement.Parent : null;
XElement parentElement = paraElement.Parent;
XElement grandParentElement = parentElement != null ? parentElement.Parent : null;
int score = 1; // 1 point for having a paragraph

// Add points for any comma-segments within this paragraph.
Expand Down Expand Up @@ -1006,7 +1012,7 @@ internal XElement DetermineTopCandidateElement(XDocument document, IEnumerable<X
{
XElement topCandidateElement = null;

foreach (var candidateElement in candidatesForArticleContent)
foreach (XElement candidateElement in candidatesForArticleContent)
{
float candidateScore = GetElementScore(candidateElement);

Expand All @@ -1028,7 +1034,7 @@ internal XElement DetermineTopCandidateElement(XDocument document, IEnumerable<X
{
topCandidateElement = new XElement("div");

var documentBody = GetOrCreateBody(document);
XElement documentBody = GetOrCreateBody(document);

topCandidateElement.Add(documentBody.Nodes());
}
Expand All @@ -1045,7 +1051,7 @@ internal XElement CreateArticleContentElement(XDocument document, XElement topCa

articleContentElement.SetId(ContentDivId);

var parentElement = topCandidateElement.Parent;
XElement parentElement = topCandidateElement.Parent;

if (parentElement == null)
{
Expand All @@ -1067,7 +1073,7 @@ internal XElement CreateArticleContentElement(XDocument document, XElement topCa
string topCandidateClass = topCandidateElement.GetClass();

// iterate through the sibling elements and decide whether append them
foreach (var siblingElement in siblingElements)
foreach (XElement siblingElement in siblingElements)
{
bool append = false;
string siblingElementName = GetElementName(siblingElement);
Expand Down Expand Up @@ -1167,10 +1173,10 @@ internal void PrepareArticleContentElement(XElement articleContentElement)
CleanConditionally(articleContentElement, "div");

/* Remove extra paragraphs. */
var paraElements = articleContentElement.GetElementsByTagName("p");
IEnumerable<XElement> paraElements = articleContentElement.GetElementsByTagName("p");
var elementsToRemove = new List<XElement>();

foreach (var paraElement in paraElements)
foreach (XElement paraElement in paraElements)
{
string innerText = GetInnerText(paraElement, false);
if (innerText.Length > 0) { continue; }
Expand Down Expand Up @@ -1316,12 +1322,14 @@ internal void KillBreaks(XElement element)
/// </summary>
internal void Clean(XElement rootElement, string elementName)
{
var elements = rootElement.GetElementsByTagName(elementName);
IEnumerable<XElement> elements = rootElement.GetElementsByTagName(elementName);

bool isEmbed = "object".Equals(elementName, StringComparison.OrdinalIgnoreCase)
|| "embed".Equals(elementName, StringComparison.OrdinalIgnoreCase);

var elementsToRemove = new List<XElement>();

foreach (var element in elements)
foreach (XElement element in elements)
{
/* Allow youtube and vimeo videos through as people usually want to see those. */
if (isEmbed
Expand All @@ -1348,10 +1356,10 @@ internal void CleanConditionally(XElement rootElement, string elementName)
throw new ArgumentNullException("elementName");
}

var elements = rootElement.GetElementsByTagName(elementName);
IEnumerable<XElement> elements = rootElement.GetElementsByTagName(elementName);
var elementsToRemove = new List<XElement>();

foreach (var element in elements)
foreach (XElement element in elements)
{
int weight = GetClassWeight(element);
float score = GetElementScore(element);
Expand Down Expand Up @@ -1415,9 +1423,9 @@ internal void CleanHeaders(XElement element)

for (int headerLevel = 1; headerLevel < 7; headerLevel++)
{
var headerElements = element.GetElementsByTagName("h" + headerLevel);
IEnumerable<XElement> headerElements = element.GetElementsByTagName("h" + headerLevel);

foreach (var headerElement in headerElements)
foreach (XElement headerElement in headerElements)
{
if (GetClassWeight(headerElement) < 0
|| GetLinksDensity(headerElement) > _MaxHeaderLinksDensity)
Expand Down Expand Up @@ -1487,11 +1495,11 @@ internal string GetUserStyleClass(string prefix, string enumStr)

private static XElement GetOrCreateBody(XDocument document)
{
var documentBody = document.GetBody();
XElement documentBody = document.GetBody();

if (documentBody == null)
{
var htmlElement = document.GetChildrenByTagName("html").FirstOrDefault();
XElement htmlElement = document.GetChildrenByTagName("html").FirstOrDefault();

if (htmlElement == null)
{
Expand Down Expand Up @@ -1523,11 +1531,11 @@ private static void ResolveElementsUrls(XDocument document, string tagName, stri
throw new ArgumentNullException("url");
}

var elements = document.GetElementsByTagName(tagName);
IEnumerable<XElement> elements = document.GetElementsByTagName(tagName);

foreach (var element in elements)
foreach (XElement element in elements)
{
var attributeValue = element.GetAttributeValue(attributeName, null);
string attributeValue = element.GetAttributeValue(attributeName, null);

if (attributeValue == null)
{
Expand Down Expand Up @@ -1599,6 +1607,11 @@ private static string GetElementName(XElement element)
return element.Name.LocalName ?? "";
}

/// <summary>
/// Renames <paramref name="element"/> so that its local name becomes
/// <paramref name="newLocalName"/> while the namespace of its current name is kept.
/// </summary>
/// <param name="element">Element whose name is replaced in place.</param>
/// <param name="newLocalName">Local (unqualified) name to assign to the element.</param>
private static void SetElementName(XElement element, string newLocalName)
{
  // Reuse the namespace the element's current name already belongs to.
  XNamespace currentNamespace = element.Name.Namespace;

  element.Name = currentNamespace.GetName(newLocalName);
}

private static bool ElementLooksLikeParagraphDiv(XElement element)
{
string elementName = GetElementName(element);
Expand Down

1 comment on commit e85013e

@nikhilsi
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice. Thx

Please sign in to comment.