Merge pull request #82 from oscar-project/dev-housekeeping

refactor: remove old pipelines, old io code and old langtags
oscar-project · Jan 18, 2023 · commit 9bf4dcb
2 parents: de47f22 + 48a60f5
Show file tree

Hide file tree

Showing 40 changed files with 135 additions and 4,041 deletions.
diff --git a/src/cli.rs b/src/cli.rs
@@ -11,15 +11,15 @@ pub enum Ungoliant {
     Download(Download),
     #[structopt(about = "Run pipeline")]
     Pipeline(Pipeline),
-    #[structopt(about = "Deduplicate a generated, not split corpus.")]
-    Dedup(Dedup),
-    #[structopt(about = "Split a not split corpus")]
-    Split(Split),
-    #[structopt(about = "Compress")]
-    Compress(Compress),
-    #[structopt(about = "package")]
-    Package(Package),
-    #[structopt(about = "rebuild the corpus for a given language")]
+    // #[structopt(about = "Deduplicate a generated, not split corpus.")]
+    // Dedup(Dedup),
+    // #[structopt(about = "Split a not split corpus")]
+    // Split(Split),
+    // #[structopt(about = "Compress")]
+    // Compress(Compress),
+    // #[structopt(about = "package")]
+    // Package(Package),
+    // #[structopt(about = "rebuild the corpus for a given language")]
     Rebuild(Rebuild),
     #[structopt(about = "check for corpus validity")]
     Check(Check),

diff --git a/src/filtering/sentence.rs b/src/filtering/sentence.rs
@@ -3,12 +3,6 @@ use super::filter::FilterMut;
 use super::Filter;
 use std::convert::TryInto;
 
-/// regroups sentence filter kinds
-enum FilterKind {
-    Length(Length),
-    MeanLength(MeanLength),
-}
-
 /// Simple length filter.
 /// Returns `false` if provided sentence is less than [Length::min_size] unicode codepoints.
 ///

diff --git a/src/identifiers/model.rs b/src/identifiers/model.rs
@@ -60,13 +60,6 @@ pub struct FastText {
     pub threshold: f32,
 }
 
-impl FastText {
-    /// removes __label__ from identification start
-    fn clean_label(label: &str) -> String {
-        label[..9].to_string()
-    }
-}
-
 /// Prediction for new tags/model
 impl Predict<String> for FastText {
     fn predict_one(&self, line: &str) -> Result<Option<Identification<String>>, Error> {
@@ -215,11 +208,11 @@ impl<'a> FastTextBuilder<'a> {
     }
 
     pub fn build(&self) -> Result<FastText, Error> {
-        let error = if self.path == None {
+        let error = if self.path.is_none() {
             Some("No path provided")
-        } else if self.k == None {
+        } else if self.k.is_none() {
             Some("No k provided")
-        } else if self.threshold == None {
+        } else if self.threshold.is_none() {
             Some("No threshold provided")
         } else {
             None

diff --git a/src/io/langfiles.rs b/src/io/langfiles.rs
@@ -14,15 +14,15 @@ use std::{
 use log::info;
 use oxilangtag::LanguageTag;
 
-use crate::lang::LANG;
-use crate::{error};
-use crate::{error::Error, io::writer::Writer};
+// use crate::lang::LANG;
+use crate::error;
+use crate::error::Error;
 
 use super::writer::{WriterDoc, WriterTrait};
 /// Holds references to [Writer].
-pub struct LangFiles {
-    writers: HashMap<&'static str, Arc<Mutex<Writer>>>,
-}
+// pub struct LangFiles {
+//     writers: HashMap<&'static str, Arc<Mutex<Writer>>>,
+// }
 
 type LanguageMap = HashMap<LanguageTag<String>, Arc<Mutex<WriterDoc>>>;
 pub struct LangFilesDoc {
@@ -31,40 +31,40 @@ pub struct LangFilesDoc {
     part_size_bytes: Option<u64>,
 }
 
-impl LangFiles {
-    /// Create a new LangFiles. `part_size_bytes` sets an indication of the maximum size
-    /// by part.
-    /// Note that if it is set too low and a unique record can't be stored in an unique part
-    /// then a part will still be created, being larger than the `part_size_bytes`. This is expected behaviour.
-    ///
-    /// Also keep in mind that [Self::close_meta] has to be called once every write is done.
-    ///
-    // [Self::close_meta] could be integrated in an `impl Drop`
-    pub fn new(dst: &Path, part_size_bytes: Option<u64>) -> Result<Self, error::Error> {
-        let mut writers = HashMap::with_capacity(LANG.len());
-        let mut w;
-        for lang in LANG.iter() {
-            w = Writer::new(dst, lang, part_size_bytes)?;
-            writers.insert(*lang, Arc::new(Mutex::new(w)));
-        }
-
-        Ok(Self { writers })
-    }
-
-    /// Get a non-mutable reference to the writers.
-    pub fn writers(&self) -> &HashMap<&'static str, Arc<Mutex<Writer>>> {
-        &self.writers
-    }
-
-    /// Fix open metadata files by removing trailing comma and closing the array.
-    pub fn close_meta(&self) -> Result<(), error::Error> {
-        for writer in self.writers.values() {
-            let mut writer_lock = writer.lock().unwrap();
-            writer_lock.close_meta()?;
-        }
-        Ok(())
-    }
-}
+// impl LangFiles {
+//     /// Create a new LangFiles. `part_size_bytes` sets an indication of the maximum size
+//     /// by part.
+//     /// Note that if it is set too low and a unique record can't be stored in an unique part
+//     /// then a part will still be created, being larger than the `part_size_bytes`. This is expected behaviour.
+//     ///
+//     /// Also keep in mind that [Self::close_meta] has to be called once every write is done.
+//     ///
+//     // [Self::close_meta] could be integrated in an `impl Drop`
+//     pub fn new(dst: &Path, part_size_bytes: Option<u64>) -> Result<Self, error::Error> {
+//         let mut writers = HashMap::with_capacity(LANG.len());
+//         let mut w;
+//         for lang in LANG.iter() {
+//             w = Writer::new(dst, lang, part_size_bytes)?;
+//             writers.insert(*lang, Arc::new(Mutex::new(w)));
+//         }
+
+//         Ok(Self { writers })
+//     }
+
+//     /// Get a non-mutable reference to the writers.
+//     pub fn writers(&self) -> &HashMap<&'static str, Arc<Mutex<Writer>>> {
+//         &self.writers
+//     }
+
+//     /// Fix open metadata files by removing trailing comma and closing the array.
+//     pub fn close_meta(&self) -> Result<(), error::Error> {
+//         for writer in self.writers.values() {
+//             let mut writer_lock = writer.lock().unwrap();
+//             writer_lock.close_meta()?;
+//         }
+//         Ok(())
+//     }
+// }
 
 impl LangFilesDoc {
     /// Create a new LangFiles. `part_size_bytes` sets an indication of the maximum size
@@ -88,10 +88,6 @@ impl LangFilesDoc {
         lang: LanguageTag<String>,
         part_size_bytes: Option<u64>,
     ) -> Result<Arc<Mutex<WriterDoc>>, Error> {
-        //TODO: remove the box leak?
-        // The idea is that when we encounter a new language we need to keep its
-        // code alive for the rest of the process
-        let lang: &'static str = Box::leak(lang.into_inner().into_boxed_str());
         let w = WriterDoc::new(dst, lang, part_size_bytes)?;
 
         Ok(Arc::new(Mutex::new(w)))
@@ -150,7 +146,6 @@ mod tests {
     use crate::{
         identifiers::identification::Identification,
         pipelines::oscardoc::types::{Document, Metadata},
-        pipelines::oscarmeta::types::MergedPiece,
     };
     use warc::{BufferedBody, Record, WarcHeader};
 
@@ -159,49 +154,6 @@ mod tests {
 
     type WarcHeaders = HashMap<WarcHeader, Vec<u8>>;
 
-    fn create_merged_piece(
-        sentences: String,
-        identification: &'static str,
-        headers: WarcHeaders,
-    ) -> MergedPiece {
-        let nb_sentences = sentences.split('\n').count();
-        MergedPiece {
-            sentences,
-            identification,
-            headers,
-            nb_sentences,
-        }
-    }
-    #[test]
-    fn init() {
-        let dst = Path::new("dst_langfiles_init");
-        std::fs::create_dir(dst).unwrap();
-        let _ = LangFiles::new(dst, Some(10));
-        std::fs::remove_dir_all(dst).unwrap();
-    }
-
-    #[test]
-    fn write_one() {
-        let dst = Path::new("dst_langfiles_write_one");
-        std::fs::create_dir(dst).unwrap();
-        let langfiles = LangFiles::new(dst, Some(10)).unwrap();
-
-        let sentences = "essai d'écriture
-de trois lignes
-hehe :)"
-            .to_string();
-        let headers = vec![(WarcHeader::ContentType, Vec::from("blogpost".as_bytes()))]
-            .into_iter()
-            .collect();
-        let mp = vec![create_merged_piece(sentences, "fr", headers)];
-        // lock mutex and acquire writer
-        let fr_writer = langfiles.writers().get("fr").unwrap().clone();
-        let mut fr_writer_locked = fr_writer.lock().unwrap();
-
-        fr_writer_locked.write(mp).unwrap();
-        std::fs::remove_dir_all(dst).unwrap();
-    }
-
     #[test]
     fn init_doc() {
         let dst = tempdir().unwrap();

diff --git a/src/io/mod.rs b/src/io/mod.rs
@@ -8,6 +8,6 @@ Currently only saving is implemented but loading is planned in order to facilita
 mod langfiles;
 pub mod reader;
 pub mod writer;
-pub use langfiles::LangFiles;
+// pub use langfiles::LangFiles;
 pub use langfiles::LangFilesDoc;
-pub use writer::Writer;
+// pub use writer::Writer;
diff --git a/src/io/reader/corpus.rs b/src/io/reader/corpus.rs