From ddacb310a2d50472e50acbf0b0d3113e28a4b60e Mon Sep 17 00:00:00 2001 From: Sam <7800766+pigeonhands@users.noreply.github.com> Date: Wed, 29 Mar 2023 12:15:25 +0000 Subject: [PATCH] added EntityResolver to Deserializer --- src/de/mod.rs | 58 +++++++++++++++++++++++++++++++++++++++-------- src/escapei.rs | 4 ++-- src/events/mod.rs | 2 +- src/lib.rs | 1 + src/resolver.rs | 19 ++++++++++++++++ 5 files changed, 71 insertions(+), 13 deletions(-) create mode 100644 src/resolver.rs diff --git a/src/de/mod.rs b/src/de/mod.rs index 01c5d6b2..da381dad 100644 --- a/src/de/mod.rs +++ b/src/de/mod.rs @@ -1842,7 +1842,8 @@ use crate::{ errors::Error, events::{BytesCData, BytesEnd, BytesStart, BytesText, Event}, name::QName, - reader::Reader, + reader::Reader, + resolver::{EntityResolver, DefaultEntityResolver}, }; use serde::de::{self, Deserialize, DeserializeOwned, DeserializeSeed, SeqAccess, Visitor}; use std::borrow::Cow; @@ -1954,10 +1955,11 @@ impl<'a> PayloadEvent<'a> { } } + /// An intermediate reader that consumes [`PayloadEvent`]s and produces final [`DeEvent`]s. /// [`PayloadEvent::Text`] events, that followed by any event except /// [`PayloadEvent::Text`] or [`PayloadEvent::CData`], are trimmed from the end. -struct XmlReader<'i, R: XmlRead<'i>> { +struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver<'i> = DefaultEntityResolver> { /// A source of low-level XML events reader: R, /// Intermediate event, that could be returned by the next call to `next()`. @@ -1965,15 +1967,22 @@ struct XmlReader<'i, R: XmlRead<'i>> { /// trailing spaces is not. 
Before the event will be returned, trimming of /// the spaces could be necessary lookahead: Result<PayloadEvent<'i>, DeError>, + + entity_resolver: E } -impl<'i, R: XmlRead<'i>> XmlReader<'i, R> { - fn new(mut reader: R) -> Self { +impl<'i, R: XmlRead<'i>, E: EntityResolver<'i>> XmlReader<'i, R, E> { + fn new(reader: R) -> Self + where E: Default { + Self::with_resolver(reader, E::default()) + } + + fn with_resolver(mut reader: R, entity_resolver: E) -> Self { // Lookahead by one event immediately, so we do not need to check in the // loop if we need lookahead or not let lookahead = reader.next(); - Self { reader, lookahead } + Self { reader, lookahead, entity_resolver } } /// Read next event and put it in lookahead, return the current lookahead @@ -2029,7 +2038,7 @@ impl<'i, R: XmlRead<'i>> XmlReader<'i, R> { if self.need_trim_end() { e.inplace_trim_end(); } - Ok(e.unescape()?) + Ok(e.unescape_with(|ent: &str| self.entity_resolver.resolve_entity(ent))?) } PayloadEvent::CData(e) => Ok(e.decode()?), @@ -2167,12 +2176,12 @@ where //////////////////////////////////////////////////////////////////////////////////////////////////// /// A structure that deserializes XML into Rust values. -pub struct Deserializer<'de, R> +pub struct Deserializer<'de, R, S: EntityResolver<'de> = DefaultEntityResolver> where R: XmlRead<'de>, { /// An XML reader that streams events into this deserializer - reader: XmlReader<'de, R>, + reader: XmlReader<'de, R, S>, /// When deserializing sequences sometimes we have to skip unwanted events. /// That events should be stored and then replayed. This is a replay buffer, @@ -2557,17 +2566,46 @@ where /// instead, because it will borrow instead of copy. If you have `&[u8]` which /// is known to represent UTF-8, you can decode it first before using [`from_str`]. 
pub fn from_reader(reader: R) -> Self { + Self::with_resolver(reader, DefaultEntityResolver) + } +} + + +impl<'de, R, E: EntityResolver<'de>> Deserializer<'de, IoReader<R>, E> +where + R: BufRead, +{ + /// Create new deserializer that will copy data from the specified reader + /// into internal buffer and will pass entities found while unescaping text + /// events to the specified `entity_resolver`. Use [`Deserializer::from_reader`] + /// instead if the [`DefaultEntityResolver`] (which resolves nothing) is enough. + pub fn with_resolver(reader: R, entity_resolver: E) -> Self { let mut reader = Reader::from_reader(reader); reader.expand_empty_elements(true).check_end_names(true); - Self::new(IoReader { + let io_reader = IoReader { reader, start_trimmer: StartTrimmer::default(), buf: Vec::new(), - }) + }; + + Self { + reader: XmlReader::with_resolver(io_reader, entity_resolver), + + #[cfg(feature = "overlapped-lists")] + read: VecDeque::new(), + #[cfg(feature = "overlapped-lists")] + write: VecDeque::new(), + #[cfg(feature = "overlapped-lists")] + limit: None, + + #[cfg(not(feature = "overlapped-lists"))] + peek: None, + } } } + impl<'de, 'a, R> de::Deserializer<'de> for &'a mut Deserializer<'de, R> where R: XmlRead<'de>, diff --git a/src/escapei.rs b/src/escapei.rs index ce779f09..7ca5da46 100644 --- a/src/escapei.rs +++ b/src/escapei.rs @@ -159,11 +159,11 @@ pub fn unescape(raw: &str) -> Result<Cow<str>, EscapeError> { /// [HTML5 escapes]: https://dev.w3.org/html5/html-author/charref pub fn unescape_with<'input, 'entity, F>( raw: &'input str, - resolve_entity: F, + mut resolve_entity: F, ) -> Result<Cow<'input, str>, EscapeError> where // the lifetime of the output comes from a capture or is `'static` - F: Fn(&str) -> Option<&'entity str>, + F: FnMut(&str) -> Option<&'entity str>, { let bytes = raw.as_bytes(); let mut unescaped = None; diff --git a/src/events/mod.rs b/src/events/mod.rs index 605245b5..4e9a5248 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -740,7 +740,7 @@ 
impl<'a> BytesText<'a> { /// non-UTF-8 encoding. pub fn unescape_with<'entity>( &self, - resolve_entity: impl Fn(&str) -> Option<&'entity str>, + resolve_entity: impl FnMut(&str) -> Option<&'entity str>, ) -> Result<Cow<'a, str>> { let decoded = match &self.content { Cow::Borrowed(bytes) => self.decoder.decode(bytes)?, diff --git a/src/lib.rs b/src/lib.rs index 5d5d51c4..87d259ab 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,6 +51,7 @@ #[cfg(feature = "serialize")] pub mod de; pub mod encoding; +pub mod resolver; mod errors; mod escapei; pub mod escape { diff --git a/src/resolver.rs b/src/resolver.rs new file mode 100644 index 00000000..0f46e095 --- /dev/null +++ b/src/resolver.rs @@ -0,0 +1,19 @@ +//! Entity resolver module +//! + +/// Used to resolve unknown entities while parsing +pub trait EntityResolver<'entity_out> { + /// Called when an entity needs to be resolved. + /// `None` is returned if a suitable value cannot be found. + fn resolve_entity(&mut self, entity: &str) -> Option<&'entity_out str>; +} + +/// An EntityResolver that always returns None. +#[derive(Default, Copy, Clone)] +pub struct DefaultEntityResolver; + +impl<'entity_out> EntityResolver<'entity_out> for DefaultEntityResolver{ + fn resolve_entity(&mut self, _: &str) -> Option<&'entity_out str> { + None + } +}