Skip to content

Commit

Permalink
Merge pull request #82 from oscar-project/dev-housekeeping
Browse files (browse the repository at this point in the history)
refactor: remove old pipelines, old io code and old langtags
  • Loading branch information
Uinelj committed Jan 18, 2023
2 parents de47f22 + 48a60f5 commit 9bf4dcb
Show file tree
Hide file tree
Showing 40 changed files with 135 additions and 4,041 deletions.
18 changes: 9 additions & 9 deletions src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ pub enum Ungoliant {
Download(Download),
#[structopt(about = "Run pipeline")]
Pipeline(Pipeline),
#[structopt(about = "Deduplicate a generated, not split corpus.")]
Dedup(Dedup),
#[structopt(about = "Split a not split corpus")]
Split(Split),
#[structopt(about = "Compress")]
Compress(Compress),
#[structopt(about = "package")]
Package(Package),
#[structopt(about = "rebuild the corpus for a given language")]
// #[structopt(about = "Deduplicate a generated, not split corpus.")]
// Dedup(Dedup),
// #[structopt(about = "Split a not split corpus")]
// Split(Split),
// #[structopt(about = "Compress")]
// Compress(Compress),
// #[structopt(about = "package")]
// Package(Package),
// #[structopt(about = "rebuild the corpus for a given language")]
Rebuild(Rebuild),
#[structopt(about = "check for corpus validity")]
Check(Check),
Expand Down
6 changes: 0 additions & 6 deletions src/filtering/sentence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,6 @@ use super::filter::FilterMut;
use super::Filter;
use std::convert::TryInto;

/// regroups sentence filter kinds
enum FilterKind {
Length(Length),
MeanLength(MeanLength),
}

/// Simple length filter.
/// Returns `false` if provided sentence is less than [Length::min_size] unicode codepoints.
///
Expand Down
13 changes: 3 additions & 10 deletions src/identifiers/model.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,13 +60,6 @@ pub struct FastText {
pub threshold: f32,
}

impl FastText {
/// removes __label__ from identification start
fn clean_label(label: &str) -> String {
label[..9].to_string()
}
}

/// Prediction for new tags/model
impl Predict<String> for FastText {
fn predict_one(&self, line: &str) -> Result<Option<Identification<String>>, Error> {
Expand Down Expand Up @@ -215,11 +208,11 @@ impl<'a> FastTextBuilder<'a> {
}

pub fn build(&self) -> Result<FastText, Error> {
let error = if self.path == None {
let error = if self.path.is_none() {
Some("No path provided")
} else if self.k == None {
} else if self.k.is_none() {
Some("No k provided")
} else if self.threshold == None {
} else if self.threshold.is_none() {
Some("No threshold provided")
} else {
None
Expand Down
128 changes: 40 additions & 88 deletions src/io/langfiles.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,15 @@ use std::{
use log::info;
use oxilangtag::LanguageTag;

use crate::lang::LANG;
use crate::{error};
use crate::{error::Error, io::writer::Writer};
// use crate::lang::LANG;
use crate::error;
use crate::error::Error;

use super::writer::{WriterDoc, WriterTrait};
/// Holds references to [Writer].
pub struct LangFiles {
writers: HashMap<&'static str, Arc<Mutex<Writer>>>,
}
// pub struct LangFiles {
// writers: HashMap<&'static str, Arc<Mutex<Writer>>>,
// }

type LanguageMap = HashMap<LanguageTag<String>, Arc<Mutex<WriterDoc>>>;
pub struct LangFilesDoc {
Expand All @@ -31,40 +31,40 @@ pub struct LangFilesDoc {
part_size_bytes: Option<u64>,
}

impl LangFiles {
/// Create a new LangFiles. `part_size_bytes` sets an indication of the maximum size
/// by part.
/// Note that if it is set too low and a unique record can't be stored in a unique part
/// then a part will still be created, being larger than the `part_size_bytes`. This is expected behaviour.
///
/// Also keep in mind that [Self::close_meta] has to be called once every write is done.
///
// [Self::close_meta] could be integrated in an `impl Drop`
pub fn new(dst: &Path, part_size_bytes: Option<u64>) -> Result<Self, error::Error> {
let mut writers = HashMap::with_capacity(LANG.len());
let mut w;
for lang in LANG.iter() {
w = Writer::new(dst, lang, part_size_bytes)?;
writers.insert(*lang, Arc::new(Mutex::new(w)));
}

Ok(Self { writers })
}

/// Get a non-mutable reference to the writers.
pub fn writers(&self) -> &HashMap<&'static str, Arc<Mutex<Writer>>> {
&self.writers
}

/// Fix open metadata files by removing trailing comma and closing the array.
pub fn close_meta(&self) -> Result<(), error::Error> {
for writer in self.writers.values() {
let mut writer_lock = writer.lock().unwrap();
writer_lock.close_meta()?;
}
Ok(())
}
}
// impl LangFiles {
// /// Create a new LangFiles. `part_size_bytes` sets an indication of the maximum size
// /// by part.
// /// Note that if it is set too low and a unique record can't be stored in a unique part
// /// then a part will still be created, being larger than the `part_size_bytes`. This is expected behaviour.
// ///
// /// Also keep in mind that [Self::close_meta] has to be called once every write is done.
// ///
// // [Self::close_meta] could be integrated in an `impl Drop`
// pub fn new(dst: &Path, part_size_bytes: Option<u64>) -> Result<Self, error::Error> {
// let mut writers = HashMap::with_capacity(LANG.len());
// let mut w;
// for lang in LANG.iter() {
// w = Writer::new(dst, lang, part_size_bytes)?;
// writers.insert(*lang, Arc::new(Mutex::new(w)));
// }

// Ok(Self { writers })
// }

// /// Get a non-mutable reference to the writers.
// pub fn writers(&self) -> &HashMap<&'static str, Arc<Mutex<Writer>>> {
// &self.writers
// }

// /// Fix open metadata files by removing trailing comma and closing the array.
// pub fn close_meta(&self) -> Result<(), error::Error> {
// for writer in self.writers.values() {
// let mut writer_lock = writer.lock().unwrap();
// writer_lock.close_meta()?;
// }
// Ok(())
// }
// }

impl LangFilesDoc {
/// Create a new LangFiles. `part_size_bytes` sets an indication of the maximum size
Expand All @@ -88,10 +88,6 @@ impl LangFilesDoc {
lang: LanguageTag<String>,
part_size_bytes: Option<u64>,
) -> Result<Arc<Mutex<WriterDoc>>, Error> {
//TODO: remove the box leak?
// The idea is that when we encounter a new language we need to keep its
// code alive for the rest of the process
let lang: &'static str = Box::leak(lang.into_inner().into_boxed_str());
let w = WriterDoc::new(dst, lang, part_size_bytes)?;

Ok(Arc::new(Mutex::new(w)))
Expand Down Expand Up @@ -150,7 +146,6 @@ mod tests {
use crate::{
identifiers::identification::Identification,
pipelines::oscardoc::types::{Document, Metadata},
pipelines::oscarmeta::types::MergedPiece,
};
use warc::{BufferedBody, Record, WarcHeader};

Expand All @@ -159,49 +154,6 @@ mod tests {

type WarcHeaders = HashMap<WarcHeader, Vec<u8>>;

fn create_merged_piece(
sentences: String,
identification: &'static str,
headers: WarcHeaders,
) -> MergedPiece {
let nb_sentences = sentences.split('\n').count();
MergedPiece {
sentences,
identification,
headers,
nb_sentences,
}
}
#[test]
fn init() {
let dst = Path::new("dst_langfiles_init");
std::fs::create_dir(dst).unwrap();
let _ = LangFiles::new(dst, Some(10));
std::fs::remove_dir_all(dst).unwrap();
}

#[test]
fn write_one() {
let dst = Path::new("dst_langfiles_write_one");
std::fs::create_dir(dst).unwrap();
let langfiles = LangFiles::new(dst, Some(10)).unwrap();

let sentences = "essai d'écriture
de trois lignes
hehe :)"
.to_string();
let headers = vec![(WarcHeader::ContentType, Vec::from("blogpost".as_bytes()))]
.into_iter()
.collect();
let mp = vec![create_merged_piece(sentences, "fr", headers)];
// lock mutex and acquire writer
let fr_writer = langfiles.writers().get("fr").unwrap().clone();
let mut fr_writer_locked = fr_writer.lock().unwrap();

fr_writer_locked.write(mp).unwrap();
std::fs::remove_dir_all(dst).unwrap();
}

#[test]
fn init_doc() {
let dst = tempdir().unwrap();
Expand Down
4 changes: 2 additions & 2 deletions src/io/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,6 @@ Currently only saving is implemented but loading is planned in order to facilita
mod langfiles;
pub mod reader;
pub mod writer;
pub use langfiles::LangFiles;
// pub use langfiles::LangFiles;
pub use langfiles::LangFilesDoc;
pub use writer::Writer;
// pub use writer::Writer;
64 changes: 0 additions & 64 deletions src/io/reader/corpus.rs

This file was deleted.

0 comments on commit 9bf4dcb

Please sign in to comment.