Skip to content

Commit

Permalink
Merge pull request #83 from oscar-project/dev-move-io
Browse files Browse the repository at this point in the history
Move IO out of Ungoliant
  • Loading branch information
Uinelj committed Jan 20, 2023
2 parents 9bf4dcb + e5d21d9 commit 5368804
Show file tree
Hide file tree
Showing 20 changed files with 80 additions and 890 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ unic-ucd = "0.9.0"
oxilangtag = {version="0.1.3", features=["serde"]}
language-tags = "0.3.2"
lazy_static = "1.4.0"
oscar-io = "0.1.3"
oscar-io = "0.2.0"
tlsh = {git="https://github.com/Uinelj/tlsh-rs", branch="fix-q3-panic"}
ctclib = {git="https://github.com/Uinelj/ctclib", optional=true}

Expand Down
21 changes: 21 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,66 +23,87 @@ pub enum Error {
IncompleteLocation(IncompleteLocation),
Avro(avro_rs::Error),
Csv(csv::Error),
OscarIo(oscar_io::Error),
}

#[cfg(not(tarpaulin_include))]
impl From<oscar_io::Error> for Error {
fn from(v: oscar_io::Error) -> Self {
Self::OscarIo(v)
}
}

#[cfg(not(tarpaulin_include))]
impl From<LanguageTagParseError> for Error {
fn from(v: LanguageTagParseError) -> Self {
Self::Languagetag(v)
}
}

#[cfg(not(tarpaulin_include))]
impl From<csv::Error> for Error {
fn from(v: csv::Error) -> Self {
Self::Csv(v)
}
}

#[cfg(not(tarpaulin_include))]
impl From<avro_rs::Error> for Error {
fn from(v: avro_rs::Error) -> Self {
Self::Avro(v)
}
}

#[cfg(not(tarpaulin_include))]
impl From<ut1_blocklist::Error> for Error {
fn from(v: ut1_blocklist::Error) -> Self {
Self::Ut1(v)
}
}

#[cfg(not(tarpaulin_include))]
impl From<std::io::Error> for Error {
fn from(e: std::io::Error) -> Error {
Error::Io(e)
}
}

/// Lifts a [`glob::GlobError`] into the [`Error::Glob`] variant.
#[cfg(not(tarpaulin_include))]
impl From<glob::GlobError> for Error {
    fn from(err: glob::GlobError) -> Self {
        Self::Glob(err)
    }
}

/// Lifts a [`glob::PatternError`] into the [`Error::GlobPattern`] variant.
#[cfg(not(tarpaulin_include))]
impl From<glob::PatternError> for Error {
    fn from(err: glob::PatternError) -> Self {
        Self::GlobPattern(err)
    }
}

#[cfg(not(tarpaulin_include))]
impl From<warc::Error> for Error {
fn from(e: warc::Error) -> Error {
Error::Warc(e)
}
}

#[cfg(not(tarpaulin_include))]
impl From<String> for Error {
fn from(s: String) -> Error {
Error::Custom(s)
}
}

#[cfg(not(tarpaulin_include))]
impl From<FromUtf8Error> for Error {
fn from(e: FromUtf8Error) -> Error {
Error::MetadataConversion(e)
}
}

#[cfg(not(tarpaulin_include))]
impl From<serde_json::Error> for Error {
fn from(e: serde_json::Error) -> Error {
Error::Serde(e)
Expand Down
83 changes: 22 additions & 61 deletions src/identifiers/identification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,56 +9,30 @@ use fasttext::Prediction;

use oxilangtag::{LanguageTag, LanguageTagParseError};

use serde::{Deserialize, Serialize};

/// A language identification: a language tag paired with a probability.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct Identification<T>
where
    T: Deref<Target = str> + Clone,
{
    /// Language tag of the identification.
    label: LanguageTag<T>,
    /// Probability attached to `label` (presumably the classifier's confidence; confirm).
    prob: f32,
}
use oscar_io::common::Identification as IdentificationExternal;

/// Private, string-based mirror of `Identification`.
///
/// It carries the same two fields, with the label held as a plain `String`
/// rather than a parsed language tag.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct IdentificationSer {
// Textual form of the language tag.
label: String,
// Probability attached to the label.
prob: f32,
}
use serde::{Deserialize, Serialize};

impl<T> From<Identification<T>> for IdentificationSer
where
T: Deref<Target = str> + Clone,
{
fn from(i: Identification<T>) -> Self {
Self {
label: i.label.to_string(),
prob: i.prob,
}
/// Newtype wrapper around [`oscar_io::common::Identification`] (imported here as
/// `IdentificationExternal`).
#[derive(Debug, Clone)]
pub struct Identification<T>(IdentificationExternal<T>)
where
    T: Deref<Target = str> + Clone;
impl<T: Deref<Target = str> + Clone> Identification<T> {
pub(crate) fn new(label: LanguageTag<T>, prob: f32) -> Identification<T> {
Self(IdentificationExternal::new(label, prob))
}
}
impl TryFrom<IdentificationSer> for Identification<String> {
type Error = LanguageTagParseError;
fn try_from(i: IdentificationSer) -> Result<Self, Self::Error> {
Ok(Self {
label: LanguageTag::parse(i.label)?,
prob: i.prob,
})

pub fn into_inner(self) -> IdentificationExternal<T> {
self.0
}
}

impl<T: Deref<Target = str> + Clone> Identification<T> {
pub fn new(label: LanguageTag<T>, prob: f32) -> Self {
Self { label, prob }
}
/// Get a reference to the identification's label.
pub fn label(&self) -> &LanguageTag<T> {
&self.label
}
impl<T: Deref<Target = str> + Clone> Deref for Identification<T> {
type Target = IdentificationExternal<T>;

/// Get a reference to the identification's prob.
pub fn prob(&self) -> &f32 {
&self.prob
fn deref(&self) -> &Self::Target {
&self.0
}
}

/// for fasttext2 predictions
impl TryFrom<Prediction> for Identification<String> {
type Error = LanguageTagParseError;
Expand All @@ -69,10 +43,10 @@ impl TryFrom<Prediction> for Identification<String> {
//convert to valid bcp47
let label = label.replace('_', "-");

Ok(Self {
prob: prediction.prob,
label: LanguageTag::parse_and_normalize(&label)?,
})
Ok(Self(IdentificationExternal::new(
LanguageTag::parse_and_normalize(&label)?,
prediction.prob,
)))
// debug!("{prediction:?}");
// Self {
// prob: prediction.prob,
Expand All @@ -81,21 +55,7 @@ impl TryFrom<Prediction> for Identification<String> {
}
}

// impl From<fasttext2::Prediction> for Identification {
// fn from(prediction: fasttext2::Prediction) -> Self {
// let label = prediction
// .label()
// .chars()
// .skip(9)
// .collect::<String>(())
// .unwrap();
// todo!()
// }
// }
/// Language identifier over string-like input `T`.
pub trait Identifier<T>
where
    T: Deref<Target = str> + Clone,
{
    /// Identifies the language of `sentence`.
    ///
    /// Returns a language identification token (from [crate::lang::LANG]).
    /// The result is wrapped in `Option`, so an implementation may yield
    /// `Ok(None)`; failures are reported through the crate's [`Error`].
    fn identify(&self, sentence: T) -> Result<Option<Identification<T>>, Error>;
}
//

#[cfg(test)]
mod tests {
Expand All @@ -106,6 +66,7 @@ mod tests {
use crate::identifiers::tag_convert::Tag;

use super::Identification;
// use oscar_io::common::Identification;

#[test]
fn test_from_pred() {
Expand All @@ -126,7 +87,7 @@ mod tests {

let old: Identification<String> =
Identification::new(Tag::new(&old.label).try_into().unwrap(), old.prob);
Identification::new(Tag::new(&old.label).try_into().unwrap(), old.prob);
Identification::new(Tag::new(&old.label()).try_into().unwrap(), *old.prob());

let prob = 1.0f32;
let label = "__label__eng".to_string();
Expand Down
2 changes: 2 additions & 0 deletions src/identifiers/multilingual.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use log::debug;
use crate::filtering::Filter;

use super::identification::Identification;
// use oscar_io::common::Identification;

/// Strict Multilingual detector
///
Expand Down Expand Up @@ -303,6 +304,7 @@ mod tests {
},
};
use lazy_static::lazy_static;
// use oscar_io::common::Identification;
use oxilangtag::LanguageTag;

lazy_static! {
Expand Down
17 changes: 8 additions & 9 deletions src/io/langfiles.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@ use oxilangtag::LanguageTag;
use crate::error;
use crate::error::Error;

use super::writer::{WriterDoc, WriterTrait};
// use super::writer::{WriterDoc, WriterTrait};
use oscar_io::v3::{Writer, WriterTrait};
/// Holds references to [Writer].
// pub struct LangFiles {
// writers: HashMap<&'static str, Arc<Mutex<Writer>>>,
// }

type LanguageMap = HashMap<LanguageTag<String>, Arc<Mutex<WriterDoc>>>;
type LanguageMap = HashMap<LanguageTag<String>, Arc<Mutex<Writer>>>;
pub struct LangFilesDoc {
writers: Arc<RwLock<LanguageMap>>,
dst: PathBuf,
Expand Down Expand Up @@ -87,8 +88,8 @@ impl LangFilesDoc {
dst: &Path,
lang: LanguageTag<String>,
part_size_bytes: Option<u64>,
) -> Result<Arc<Mutex<WriterDoc>>, Error> {
let w = WriterDoc::new(dst, lang, part_size_bytes)?;
) -> Result<Arc<Mutex<Writer>>, Error> {
let w = Writer::new(dst, lang, part_size_bytes)?;

Ok(Arc::new(Mutex::new(w)))
}
Expand Down Expand Up @@ -123,7 +124,7 @@ impl LangFilesDoc {
// pub fn writers(&self) -> Arc<HashMap<LanguageTag<String>, Arc<Mutex<WriterDoc>>>> {
pub fn writers(
&self,
) -> std::sync::RwLockReadGuard<HashMap<LanguageTag<String>, Arc<Mutex<WriterDoc>>>> {
) -> std::sync::RwLockReadGuard<HashMap<LanguageTag<String>, Arc<Mutex<Writer>>>> {
self.writers.read().unwrap()
}

Expand All @@ -143,13 +144,11 @@ mod tests {

use std::{fs::File, path::PathBuf};

use crate::{
identifiers::identification::Identification,
pipelines::oscardoc::types::{Document, Metadata},
};
use crate::pipelines::oscardoc::types::{Document, Metadata};
use warc::{BufferedBody, Record, WarcHeader};

use super::*;
use oscar_io::common::Identification;
use tempfile::tempdir;

type WarcHeaders = HashMap<WarcHeader, Vec<u8>>;
Expand Down
2 changes: 0 additions & 2 deletions src/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ Textual/contextual data saving and loading.
Currently only saving is implemented, but loading is planned in order to facilitate operations on already-generated corpora.
!*/
mod langfiles;
pub mod reader;
pub mod writer;
// pub use langfiles::LangFiles;
pub use langfiles::LangFilesDoc;
// pub use writer::Writer;

0 comments on commit 5368804

Please sign in to comment.