Skip to content

Commit

Permalink
added EntityResolver to Deserializer
Browse files Browse the repository at this point in the history
  • Loading branch information
pigeonhands committed Mar 29, 2023
1 parent 2e9123a commit ddacb31
Show file tree
Hide file tree
Showing 5 changed files with 71 additions and 13 deletions.
58 changes: 48 additions & 10 deletions src/de/mod.rs
Expand Up @@ -1842,7 +1842,8 @@ use crate::{
errors::Error,
events::{BytesCData, BytesEnd, BytesStart, BytesText, Event},
name::QName,
reader::Reader,
reader::Reader,
resolver::{EntityResolver, DefaultEntityResolver},
};
use serde::de::{self, Deserialize, DeserializeOwned, DeserializeSeed, SeqAccess, Visitor};
use std::borrow::Cow;
Expand Down Expand Up @@ -1954,26 +1955,34 @@ impl<'a> PayloadEvent<'a> {
}
}


/// An intermediate reader that consumes [`PayloadEvent`]s and produces final [`DeEvent`]s.
/// [`PayloadEvent::Text`] events that are followed by any event other than
/// [`PayloadEvent::Text`] or [`PayloadEvent::CData`] are trimmed at the end.
struct XmlReader<'i, R: XmlRead<'i>> {
struct XmlReader<'i, R: XmlRead<'i>, E: EntityResolver<'i> = DefaultEntityResolver> {
/// A source of low-level XML events
reader: R,
/// Intermediate event that could be returned by the next call to `next()`.
/// If that is a `Text` event, then its leading spaces are already trimmed, but
/// its trailing spaces are not. Before the event is returned, trimming of
/// the trailing spaces may be necessary.
lookahead: Result<PayloadEvent<'i>, DeError>,

entity_resolver: E
}

impl<'i, R: XmlRead<'i>> XmlReader<'i, R> {
fn new(mut reader: R) -> Self {
impl<'i, R: XmlRead<'i>, E: EntityResolver<'i>> XmlReader<'i, R, E> {
fn new(reader: R) -> Self
where E: Default {
Self::with_resolver(reader, E::default())
}

fn with_resolver(mut reader: R, entity_resolver: E) -> Self {
// Lookahead by one event immediately, so we do not need to check in the
// loop if we need lookahead or not
let lookahead = reader.next();

Self { reader, lookahead }
Self { reader, lookahead, entity_resolver }
}

/// Read next event and put it in lookahead, return the current lookahead
Expand Down Expand Up @@ -2029,7 +2038,7 @@ impl<'i, R: XmlRead<'i>> XmlReader<'i, R> {
if self.need_trim_end() {
e.inplace_trim_end();
}
Ok(e.unescape()?)
Ok(e.unescape_with(|ent: &str| self.entity_resolver.resolve_entity(ent))?)
}
PayloadEvent::CData(e) => Ok(e.decode()?),

Expand Down Expand Up @@ -2167,12 +2176,12 @@ where
////////////////////////////////////////////////////////////////////////////////////////////////////

/// A structure that deserializes XML into Rust values.
pub struct Deserializer<'de, R>
pub struct Deserializer<'de, R, S: EntityResolver<'de> = DefaultEntityResolver>
where
R: XmlRead<'de>,
{
/// An XML reader that streams events into this deserializer
reader: XmlReader<'de, R>,
reader: XmlReader<'de, R, S>,

/// When deserializing sequences sometimes we have to skip unwanted events.
/// That events should be stored and then replayed. This is a replay buffer,
Expand Down Expand Up @@ -2557,17 +2566,46 @@ where
/// instead, because it will borrow instead of copy. If you have `&[u8]` which
/// is known to represent UTF-8, you can decode it first before using [`from_str`].
pub fn from_reader(reader: R) -> Self {
// Delegate to `with_resolver`, passing `DefaultEntityResolver`, whose
// `resolve_entity` always returns `None`.
Self::with_resolver(reader, DefaultEntityResolver)
}
}


impl<'de, R, E: EntityResolver<'de>> Deserializer<'de, IoReader<R>, E>
where
R: BufRead,
{
/// Create a new deserializer that will copy data from the specified reader
/// into an internal buffer and that uses `entity_resolver` to resolve
/// entities while unescaping text. If you already have a string use [`Self::from_str`]
/// instead, because it will borrow instead of copy. If you have `&[u8]` which
/// is known to represent UTF-8, you can decode it first before using [`from_str`].
pub fn with_resolver(reader: R, entity_resolver: E) -> Self {
let mut reader = Reader::from_reader(reader);
reader.expand_empty_elements(true).check_end_names(true);

Self::new(IoReader {
let io_reader = IoReader {
reader,
start_trimmer: StartTrimmer::default(),
buf: Vec::new(),
})
};

Self {
reader: XmlReader::with_resolver(io_reader, entity_resolver),

#[cfg(feature = "overlapped-lists")]
read: VecDeque::new(),
#[cfg(feature = "overlapped-lists")]
write: VecDeque::new(),
#[cfg(feature = "overlapped-lists")]
limit: None,

#[cfg(not(feature = "overlapped-lists"))]
peek: None,
}
}
}


impl<'de, 'a, R> de::Deserializer<'de> for &'a mut Deserializer<'de, R>
where
R: XmlRead<'de>,
Expand Down
4 changes: 2 additions & 2 deletions src/escapei.rs
Expand Up @@ -159,11 +159,11 @@ pub fn unescape(raw: &str) -> Result<Cow<str>, EscapeError> {
/// [HTML5 escapes]: https://dev.w3.org/html5/html-author/charref
pub fn unescape_with<'input, 'entity, F>(
raw: &'input str,
resolve_entity: F,
mut resolve_entity: F,
) -> Result<Cow<'input, str>, EscapeError>
where
// the lifetime of the output comes from a capture or is `'static`
F: Fn(&str) -> Option<&'entity str>,
F: FnMut(&str) -> Option<&'entity str>,
{
let bytes = raw.as_bytes();
let mut unescaped = None;
Expand Down
2 changes: 1 addition & 1 deletion src/events/mod.rs
Expand Up @@ -740,7 +740,7 @@ impl<'a> BytesText<'a> {
/// non-UTF-8 encoding.
pub fn unescape_with<'entity>(
&self,
resolve_entity: impl Fn(&str) -> Option<&'entity str>,
resolve_entity: impl FnMut(&str) -> Option<&'entity str>,
) -> Result<Cow<'a, str>> {
let decoded = match &self.content {
Cow::Borrowed(bytes) => self.decoder.decode(bytes)?,
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Expand Up @@ -51,6 +51,7 @@
#[cfg(feature = "serialize")]
pub mod de;
pub mod encoding;
pub mod resolver;
mod errors;
mod escapei;
pub mod escape {
Expand Down
19 changes: 19 additions & 0 deletions src/resolver.rs
@@ -0,0 +1,19 @@
//! Entity resolver module
//!

/// Used to resolve unknown entities while parsing.
///
/// The `'entity_out` lifetime parameter is the lifetime of the replacement
/// strings returned by [`resolve_entity`](Self::resolve_entity).
pub trait EntityResolver<'entity_out> {
/// Called when an entity needs to be resolved.
///
/// `entity` is the entity to look up. The method takes `&mut self`, so an
/// implementation may update its own state between calls.
///
/// Returns `Some` with the replacement text, or `None` if a suitable value
/// cannot be found.
fn resolve_entity(&mut self, entity: &str) -> Option<&'entity_out str>;
}

/// An `EntityResolver` that always returns `None`.
///
/// It is a zero-sized unit struct: it carries no state and is cheap to copy,
/// and `DefaultEntityResolver::default()` is equivalent to writing
/// `DefaultEntityResolver` directly.
#[derive(Debug, Default, Copy, Clone)]
pub struct DefaultEntityResolver;

impl<'entity_out> EntityResolver<'entity_out> for DefaultEntityResolver {
    /// Ignores the requested entity and reports that no replacement exists.
    fn resolve_entity(&mut self, _entity: &str) -> Option<&'entity_out str> {
        Option::None
    }
}

0 comments on commit ddacb31

Please sign in to comment.