Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move IO out of Ungoliant #83

Merged
merged 4 commits into from
Jan 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ unic-ucd = "0.9.0"
oxilangtag = {version="0.1.3", features=["serde"]}
language-tags = "0.3.2"
lazy_static = "1.4.0"
oscar-io = "0.1.3"
oscar-io = "0.2.0"
tlsh = {git="https://github.com/Uinelj/tlsh-rs", branch="fix-q3-panic"}
ctclib = {git="https://github.com/Uinelj/ctclib", optional=true}

Expand Down
21 changes: 21 additions & 0 deletions src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,66 +23,87 @@ pub enum Error {
IncompleteLocation(IncompleteLocation),
Avro(avro_rs::Error),
Csv(csv::Error),
OscarIo(oscar_io::Error),
}

#[cfg(not(tarpaulin_include))]
impl From<oscar_io::Error> for Error {
fn from(v: oscar_io::Error) -> Self {
Self::OscarIo(v)
}
}

#[cfg(not(tarpaulin_include))]
impl From<LanguageTagParseError> for Error {
fn from(v: LanguageTagParseError) -> Self {
Self::Languagetag(v)
}
}

#[cfg(not(tarpaulin_include))]
impl From<csv::Error> for Error {
fn from(v: csv::Error) -> Self {
Self::Csv(v)
}
}

#[cfg(not(tarpaulin_include))]
impl From<avro_rs::Error> for Error {
fn from(v: avro_rs::Error) -> Self {
Self::Avro(v)
}
}

#[cfg(not(tarpaulin_include))]
impl From<ut1_blocklist::Error> for Error {
fn from(v: ut1_blocklist::Error) -> Self {
Self::Ut1(v)
}
}

#[cfg(not(tarpaulin_include))]
impl From<std::io::Error> for Error {
fn from(e: std::io::Error) -> Error {
Error::Io(e)
}
}

/// Lets `glob` iteration failures be converted into the crate's [`Error`] type.
#[cfg(not(tarpaulin_include))]
impl From<glob::GlobError> for Error {
    /// Stores the glob error in the [`Error::Glob`] variant.
    fn from(source: glob::GlobError) -> Self {
        Self::Glob(source)
    }
}

/// Lets `glob` pattern failures be converted into the crate's [`Error`] type.
#[cfg(not(tarpaulin_include))]
impl From<glob::PatternError> for Error {
    /// Stores the pattern error in the [`Error::GlobPattern`] variant.
    fn from(source: glob::PatternError) -> Self {
        Self::GlobPattern(source)
    }
}

#[cfg(not(tarpaulin_include))]
impl From<warc::Error> for Error {
fn from(e: warc::Error) -> Error {
Error::Warc(e)
}
}

#[cfg(not(tarpaulin_include))]
impl From<String> for Error {
fn from(s: String) -> Error {
Error::Custom(s)
}
}

#[cfg(not(tarpaulin_include))]
impl From<FromUtf8Error> for Error {
fn from(e: FromUtf8Error) -> Error {
Error::MetadataConversion(e)
}
}

#[cfg(not(tarpaulin_include))]
impl From<serde_json::Error> for Error {
fn from(e: serde_json::Error) -> Error {
Error::Serde(e)
Expand Down
83 changes: 22 additions & 61 deletions src/identifiers/identification.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,56 +9,30 @@ use fasttext::Prediction;

use oxilangtag::{LanguageTag, LanguageTagParseError};

use serde::{Deserialize, Serialize};

/// A language identification: a language tag paired with a probability.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct Identification<T>
where
    T: Deref<Target = str> + Clone,
{
    /// Language tag attached to this identification.
    label: LanguageTag<T>,
    /// Probability value carried alongside the label.
    prob: f32,
}
use oscar_io::common::Identification as IdentificationExternal;

/// Plain-data counterpart of `Identification` in which the label is kept as an
/// owned `String` rather than a `LanguageTag`.
///
/// The conversions defined next to it turn an `Identification` into this form
/// via `label.to_string()`, and parse the string back with `LanguageTag::parse`.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct IdentificationSer {
// Textual form of the language tag.
label: String,
// Probability value copied as-is from the identification.
prob: f32,
}
use serde::{Deserialize, Serialize};

impl<T> From<Identification<T>> for IdentificationSer
where
T: Deref<Target = str> + Clone,
{
fn from(i: Identification<T>) -> Self {
Self {
label: i.label.to_string(),
prob: i.prob,
}
/// Newtype wrapper around `oscar_io::common::Identification`, which this
/// module imports under the name `IdentificationExternal`.
#[derive(Clone, Debug)]
pub struct Identification<T>(IdentificationExternal<T>)
where
    T: Deref<Target = str> + Clone;
impl<T: Deref<Target = str> + Clone> Identification<T> {
pub(crate) fn new(label: LanguageTag<T>, prob: f32) -> Identification<T> {
Self(IdentificationExternal::new(label, prob))
}
}
impl TryFrom<IdentificationSer> for Identification<String> {
type Error = LanguageTagParseError;
fn try_from(i: IdentificationSer) -> Result<Self, Self::Error> {
Ok(Self {
label: LanguageTag::parse(i.label)?,
prob: i.prob,
})

pub fn into_inner(self) -> IdentificationExternal<T> {
self.0
}
}

impl<T: Deref<Target = str> + Clone> Identification<T> {
pub fn new(label: LanguageTag<T>, prob: f32) -> Self {
Self { label, prob }
}
/// Get a reference to the identification's label.
pub fn label(&self) -> &LanguageTag<T> {
&self.label
}
impl<T: Deref<Target = str> + Clone> Deref for Identification<T> {
type Target = IdentificationExternal<T>;

/// Get a reference to the identification's prob.
pub fn prob(&self) -> &f32 {
&self.prob
fn deref(&self) -> &Self::Target {
&self.0
}
}

/// for fasttext2 predictions
impl TryFrom<Prediction> for Identification<String> {
type Error = LanguageTagParseError;
Expand All @@ -69,10 +43,10 @@ impl TryFrom<Prediction> for Identification<String> {
//convert to valid bcp47
let label = label.replace('_', "-");

Ok(Self {
prob: prediction.prob,
label: LanguageTag::parse_and_normalize(&label)?,
})
Ok(Self(IdentificationExternal::new(
LanguageTag::parse_and_normalize(&label)?,
prediction.prob,
)))
// debug!("{prediction:?}");
// Self {
// prob: prediction.prob,
Expand All @@ -81,21 +55,7 @@ impl TryFrom<Prediction> for Identification<String> {
}
}

// impl From<fasttext2::Prediction> for Identification {
// fn from(prediction: fasttext2::Prediction) -> Self {
// let label = prediction
// .label()
// .chars()
// .skip(9)
// .collect::<String>(())
// .unwrap();
// todo!()
// }
// }
/// Language identification over string-like input of type `T`.
pub trait Identifier<T>
where
    T: Deref<Target = str> + Clone,
{
    /// Returns a language identification token (from [crate::lang::LANG]) for
    /// `sentence`.
    ///
    /// The result is optional and fallible; NOTE(review): `Ok(None)` presumably
    /// means no language could be identified — confirm against implementors.
    fn identify(&self, sentence: T) -> Result<Option<Identification<T>>, Error>;
}
//

#[cfg(test)]
mod tests {
Expand All @@ -106,6 +66,7 @@ mod tests {
use crate::identifiers::tag_convert::Tag;

use super::Identification;
// use oscar_io::common::Identification;

#[test]
fn test_from_pred() {
Expand All @@ -126,7 +87,7 @@ mod tests {

let old: Identification<String> =
Identification::new(Tag::new(&old.label).try_into().unwrap(), old.prob);
Identification::new(Tag::new(&old.label).try_into().unwrap(), old.prob);
Identification::new(Tag::new(&old.label()).try_into().unwrap(), *old.prob());

let prob = 1.0f32;
let label = "__label__eng".to_string();
Expand Down
2 changes: 2 additions & 0 deletions src/identifiers/multilingual.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ use log::debug;
use crate::filtering::Filter;

use super::identification::Identification;
// use oscar_io::common::Identification;

/// Strict Multilingual detector
///
Expand Down Expand Up @@ -303,6 +304,7 @@ mod tests {
},
};
use lazy_static::lazy_static;
// use oscar_io::common::Identification;
use oxilangtag::LanguageTag;

lazy_static! {
Expand Down
17 changes: 8 additions & 9 deletions src/io/langfiles.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@ use oxilangtag::LanguageTag;
use crate::error;
use crate::error::Error;

use super::writer::{WriterDoc, WriterTrait};
// use super::writer::{WriterDoc, WriterTrait};
use oscar_io::v3::{Writer, WriterTrait};
/// Holds references to [Writer].
// pub struct LangFiles {
// writers: HashMap<&'static str, Arc<Mutex<Writer>>>,
// }

type LanguageMap = HashMap<LanguageTag<String>, Arc<Mutex<WriterDoc>>>;
type LanguageMap = HashMap<LanguageTag<String>, Arc<Mutex<Writer>>>;
pub struct LangFilesDoc {
writers: Arc<RwLock<LanguageMap>>,
dst: PathBuf,
Expand Down Expand Up @@ -87,8 +88,8 @@ impl LangFilesDoc {
dst: &Path,
lang: LanguageTag<String>,
part_size_bytes: Option<u64>,
) -> Result<Arc<Mutex<WriterDoc>>, Error> {
let w = WriterDoc::new(dst, lang, part_size_bytes)?;
) -> Result<Arc<Mutex<Writer>>, Error> {
let w = Writer::new(dst, lang, part_size_bytes)?;

Ok(Arc::new(Mutex::new(w)))
}
Expand Down Expand Up @@ -123,7 +124,7 @@ impl LangFilesDoc {
// pub fn writers(&self) -> Arc<HashMap<LanguageTag<String>, Arc<Mutex<WriterDoc>>>> {
pub fn writers(
&self,
) -> std::sync::RwLockReadGuard<HashMap<LanguageTag<String>, Arc<Mutex<WriterDoc>>>> {
) -> std::sync::RwLockReadGuard<HashMap<LanguageTag<String>, Arc<Mutex<Writer>>>> {
self.writers.read().unwrap()
}

Expand All @@ -143,13 +144,11 @@ mod tests {

use std::{fs::File, path::PathBuf};

use crate::{
identifiers::identification::Identification,
pipelines::oscardoc::types::{Document, Metadata},
};
use crate::pipelines::oscardoc::types::{Document, Metadata};
use warc::{BufferedBody, Record, WarcHeader};

use super::*;
use oscar_io::common::Identification;
use tempfile::tempdir;

type WarcHeaders = HashMap<WarcHeader, Vec<u8>>;
Expand Down
2 changes: 0 additions & 2 deletions src/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@ Textual/contextual data saving and loading.
Currently only saving is implemented but loading is planned in order to facilitate operations on already generated corpora.
!*/
mod langfiles;
pub mod reader;
pub mod writer;
// pub use langfiles::LangFiles;
pub use langfiles::LangFilesDoc;
// pub use writer::Writer;