Skip to content

Commit

Permalink
Expand file tokenisation (#53)
Browse files Browse the repository at this point in the history
* Add new file and directory terms to AFSO

Add file and directory terms used for AFSO

* Update Tokenization for Directories

Update Tokenization for Directories by using the new Tokens from 5d22ada

* Update Tokenization for Files

Update Tokenization for Files by using the new Tokens from 5d22ada

* Fix typo in workflows

* Rename Tokens

Rename specific Tokens for clarity

* Rework special tokenisation

* Add arc-like structure for testing

* Add requested changes

* Add parseARCFileSystem to TopLevelParsers

* Move arcStructure test files

* Add .gitkeep to arcStructureTests

Add .gitkeep to arcStructureTests in order to track the folder

* Add Test for ARC Tokenisation

* Update ArcStructure in ARCTokenizationTests

* Update parser Functions

- Rework MetadataSheet parsers by using the new Tokens
- Add parseProcessGraphColumnsFromToken  and parseProcessGraphColumnsFromTokens

* Update ISA tests by using the new ARCTest structure

* Update TopLevelParsers.fs

Address the requested changes

* Address requested changes
  • Loading branch information
LibraChris committed Mar 2, 2024
1 parent e42eb18 commit 776ca5a
Show file tree
Hide file tree
Showing 22 changed files with 2,030 additions and 1,699 deletions.
30 changes: 29 additions & 1 deletion src/ARCTokenization/FileSystem.fs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ open ARCTokenization.StructuralOntology
open System.IO
open System
open ControlledVocabulary
open Tokenization

module internal FS =

Expand Down Expand Up @@ -51,4 +52,31 @@ module internal FS =
cvTerm = AFSO.``File Path``,
v = file.Replace("\\","/")
)
}
}


/// Returns `path` with every backslash replaced by a forward slash.
let internal normalisePath (path:string) =
    path.Replace('\\', '/')

/// Walks `rootPath` recursively and turns every directory and file found
/// beneath it into a token via `ArcFileSystem.getArcFileSystemTokens`.
/// Directory entries are emitted before file entries. All paths are
/// normalised to forward slashes before being tokenized.
let tokenizeARCFileSystem (rootPath:string) =
    // Normalise the root the same way as the entries so both use '/' separators.
    let normalisedRoot = normalisePath rootPath

    // Pairs each enumerated path with its entry kind, normalising separators on the way.
    let tagWith (kind: Tokenization.ArcFileSystem.PType) (paths: seq<string>) =
        paths
        |> Seq.map (fun entry -> kind, normalisePath entry)

    let directoryEntries =
        Directory.EnumerateDirectories(rootPath, "*", SearchOption.AllDirectories)
        |> tagWith Tokenization.ArcFileSystem.PType.Directory

    let fileEntries =
        Directory.EnumerateFiles(rootPath, "*", SearchOption.AllDirectories)
        |> tagWith Tokenization.ArcFileSystem.PType.File

    Seq.append directoryEntries fileEntries
    |> Seq.map (fun (kind, entry) ->
        ArcFileSystem.getArcFileSystemTokens normalisedRoot kind entry
    )


38 changes: 38 additions & 0 deletions src/ARCTokenization/Tokenization.fs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ open ControlledVocabulary
open FsSpreadsheet
open MetadataSheet
open ARCTokenization.Terms
open ARCtrl
open ARCtrl.ISA

module Tokenization =
Expand Down Expand Up @@ -158,3 +159,40 @@ module Tokenization =
at.Columns
|> Array.map CompositeColumn.tokenize
|> List.ofArray

module ArcFileSystem =

    /// Represents the type of file system entity (Directory or File)
    type PType =
        | File
        | Directory

    /// Wraps `relativePath` in a CvParam whose term is chosen from the entry kind
    /// and the shape of the path.
    ///
    /// Directories are classified by splitting the path on '/':
    /// a single segment equal to one of the studies/assays/runs/workflows folder
    /// names yields the corresponding plural "... Directory" term, that folder
    /// name followed by exactly one more segment yields the singular term, and
    /// anything else falls back to `Directory Path`.
    ///
    /// Files are classified by suffix, checked in this order:
    /// "isa.investigation.xlsx", "isa.assay.xlsx", "isa.dataset.xlsx",
    /// "isa.study.xlsx", ".yml", ".cwl"; anything else falls back to `File Path`.
    let convertRelativePath (pType:PType) (relativePath: string) =
        // Ordinal comparison: path suffixes are matched by exact characters,
        // independent of the current culture.
        let endsWith (suffix: string) =
            relativePath.EndsWith(suffix, System.StringComparison.Ordinal)

        let term =
            match pType with
            | PType.Directory ->
                match relativePath.Split '/' with
                | [|Path.StudiesFolderName|]        -> StructuralOntology.AFSO.``Studies Directory``
                | [|Path.StudiesFolderName; _|]     -> StructuralOntology.AFSO.``Study Directory``
                | [|Path.AssaysFolderName|]         -> StructuralOntology.AFSO.``Assays Directory``
                | [|Path.AssaysFolderName; _|]      -> StructuralOntology.AFSO.``Assay Directory``
                | [|Path.RunsFolderName|]           -> StructuralOntology.AFSO.``Runs Directory``
                | [|Path.RunsFolderName; _|]        -> StructuralOntology.AFSO.``Run Directory``
                | [|Path.WorkflowsFolderName|]      -> StructuralOntology.AFSO.``Workflows Directory``
                | [|Path.WorkflowsFolderName; _|]   -> StructuralOntology.AFSO.``Workflow Directory``
                | _                                 -> StructuralOntology.AFSO.``Directory Path``
            | PType.File ->
                if   endsWith "isa.investigation.xlsx" then StructuralOntology.AFSO.``Investigation File``
                elif endsWith "isa.assay.xlsx"         then StructuralOntology.AFSO.``Assay File``
                elif endsWith "isa.dataset.xlsx"       then StructuralOntology.AFSO.``Dataset File``
                elif endsWith "isa.study.xlsx"         then StructuralOntology.AFSO.``Study File``
                elif endsWith ".yml"                   then StructuralOntology.AFSO.``YML File``
                elif endsWith ".cwl"                   then StructuralOntology.AFSO.``CWL File``
                else StructuralOntology.AFSO.``File Path``

        CvParam(term, relativePath)

    /// Gets the CvParam for `path` after making it relative to `rootPath`.
    ///
    /// Only a leading occurrence of `rootPath` is stripped, so a path that
    /// happens to contain the root string again further down (for example
    /// root "arc" and path "arc/studies/arc/x") keeps its inner segments.
    /// A null or empty `rootPath` strips nothing. Leading '/' characters left
    /// over after stripping are removed before classification.
    let getArcFileSystemTokens (rootPath:string) (pType:PType) (path:string) =
        let withoutRoot =
            if not (System.String.IsNullOrEmpty rootPath)
               && path.StartsWith(rootPath, System.StringComparison.Ordinal) then
                path.Substring rootPath.Length
            else
                path
        convertRelativePath pType (withoutRoot.TrimStart '/')

0 comments on commit 776ca5a

Please sign in to comment.