Skip to content

Commit

Permalink
Merge pull request #49 from nfdi4plants/feature-fsTokenBasedMetadataParsing-#46
Browse files Browse the repository at this point in the history

FS token-based metadata parsing
  • Loading branch information
kMutagene committed Jan 10, 2024
2 parents 829e11b + 4d84f7f commit 49bbb27
Show file tree
Hide file tree
Showing 10 changed files with 340 additions and 67 deletions.
31 changes: 27 additions & 4 deletions playground.fsx
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,9 @@
//File.Copy(dllBasePath + "/FsSpreadsheet.ExcelIO/bin/Debug/netstandard2.0/FsSpreadsheet.ExcelIO.dll", dllBasePath + "/FsSpreadsheet.ExcelIO/bin/Debug/netstandard2.0/FsSpreadsheet.ExcelIO_Copy.dll", true)
//File.Copy(@"C:\Repos\nfdi4plants\ArcGraphModel\src\ArcGraphModel\bin\Debug\net6.0\ArcGraphModel.dll", @"C:\Repos\nfdi4plants\ArcGraphModel\src\ArcGraphModel\bin\Debug\net6.0\ArcGraphModel_Copy.dll", true)

#r "nuget: DocumentFormat.OpenXml"
#r "nuget: FSharpAux"
#r "nuget: FsOboParser"
#r "nuget: FsSpreadsheet, 3.1.1"
#r "nuget: FsSpreadsheet.ExcelIO, 3.1.1"
#r "nuget: FSharp.FGL"
#r "nuget: FsSpreadsheet.ExcelIO, 4.1.0"
#r "nuget: FSharp.FGL.ArrayAdjacencyGraph"

open DocumentFormat.OpenXml
Expand Down Expand Up @@ -45,6 +42,32 @@ open FsOboParser
open ControlledVocabulary
open type ControlledVocabulary.ParamBase
open ARCTokenization
open ARCTokenization.StructuralOntology

// Scratch section: compares the tokens parsed from the simple assay fixture with the
// mocked expected tokens. Statement order matters because `fakePath` is rebound below.

// Evaluates the current working directory (the relative fixture path further down is
// presumably resolved against it — confirm).
System.IO.Directory.GetCurrentDirectory()
// File-path token built from an absolute path to the simple investigation fixture.
let fakePath = CvParam(cvTerm = AFSO.``File Path``, v = System.IO.Directory.GetCurrentDirectory() + "/tests/ARCTokenization.Tests/Fixtures/correct/investigation_simple.xlsx")

// Rebinds `fakePath` to the simple assay fixture (relative path); this is the binding used below.
let fakePath = CvParam(cvTerm = AFSO.``File Path``, v = "tests/ARCTokenization.Tests/Fixtures/correct/assay_simple.xlsx")
// Runs the CvParam-based parser on the single path token and keeps the first result.
let actual = ParamBasedParsers.parseIsaMetadataSheetFromCvp "assay_simple.xlsx" Assay.parseMetadataSheetFromFile [fakePath] |> Seq.head
actual.Length
// Expected tokens, built from the mock and flattened into one list.
let exp =
ARCMock.AssayMetadataTokens(
Assay_File_Name = [@"measurement1\isa.assay.xlsx"],
Assay_Performer_First_Name = ["Oliver"; "Marius"],
Assay_Performer_Last_Name = ["Maus"; "Katz"],
Assay_Performer_Mid_Initials = [""; "G."],
Assay_Performer_Email = ["maus@nfdi4plants.org"],
Assay_Performer_Affiliation = ["RPTU University of Kaiserslautern"],
Assay_Performer_Roles = ["research assistant"],
Assay_Performer_Roles_Term_Accession_Number = ["http://purl.org/spar/scoro/research-assistant"],
Assay_Performer_Roles_Term_Source_REF = ["scoro"]
)
|> List.concat
exp.Length
// Print the token names of both lists, one name per line.
actual |> List.fold (fun acc ip -> $"{acc}\n{ip.Name}") "" |> printfn "%s"
exp |> List.iter (fun ip -> printfn $"{ip.Name}")
// Side-by-side print of actual vs. expected names; an index missing from either list prints as an empty string.
// NOTE(review): the upper bound 33 is hard-coded — presumably the expected token count minus one; confirm.
for i = 0 to 33 do
printfn $"{List.tryItem i actual |> Option.map (fun x -> x.Name) |> Option.defaultValue System.String.Empty}\t{List.tryItem i exp |> Option.map (fun x -> x.Name) |> Option.defaultValue System.String.Empty}"

let testAccession1 = "TO:00000001"
let testName1 = "Test"
Expand Down
1 change: 1 addition & 0 deletions src/ARCTokenization/ARCTokenization.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
<None Include="structural_ontologies/investigation_metadata_structural_ontology.yml" />
<None Include="structural_ontologies/study_metadata_structural_ontology.yml" />
<None Include="structural_ontologies/assay_metadata_structural_ontology.yml" />
<Compile Include="Globals.fs" />
<Compile Include="Address.fs" />
<Compile Include="Terms.fs" />
<Compile Include="Regex.fs" />
Expand Down
15 changes: 15 additions & 0 deletions src/ARCTokenization/Globals.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/// File names and metadata sheet names for investigation, study and assay files.
/// Every value is a compile-time literal whose value is taken from ARCtrl.
module Globals

open ARCtrl
open ARCtrl.ISA

/// Investigation file name, taken from ARCtrl's `Path.InvestigationFileName`.
[<Literal>]
let INVESTIGATION_FILE_NAME = Path.InvestigationFileName

/// Name of the investigation metadata sheet, taken from ARCtrl.
[<Literal>]
let INVESTIGATION_METADATA_SHEET_NAME = ARCtrl.ISA.Spreadsheet.ArcInvestigation.metaDataSheetName

/// Study file name, taken from ARCtrl's `Path.StudyFileName`.
[<Literal>]
let STUDY_FILE_NAME = Path.StudyFileName

/// Name of the study metadata sheet, taken from ARCtrl.
[<Literal>]
let STUDY_METADATA_SHEET_NAME = ARCtrl.ISA.Spreadsheet.ArcStudy.metaDataSheetName

/// ARCtrl's obsolete name for the study metadata sheet.
[<Literal>]
let STUDY_OBSOLETE_METADATA_SHEET_NAME = ARCtrl.ISA.Spreadsheet.ArcStudy.obsoleteMetaDataSheetName

/// Assay file name, taken from ARCtrl's `Path.AssayFileName`.
[<Literal>]
let ASSAY_FILE_NAME = Path.AssayFileName

/// Name of the assay metadata sheet, taken from ARCtrl.
[<Literal>]
let ASSAY_METADATA_SHEET_NAME = ARCtrl.ISA.Spreadsheet.ArcAssay.metaDataSheetName

/// ARCtrl's obsolete name for the assay metadata sheet.
[<Literal>]
let ASSAY_OBSOLETE_METADATA_SHEET_NAME = ARCtrl.ISA.Spreadsheet.ArcAssay.obsoleteMetaDataSheetName
269 changes: 217 additions & 52 deletions src/ARCTokenization/TopLevelParsers.fs

Large diffs are not rendered by default.

14 changes: 7 additions & 7 deletions src/ARCTokenization/Workbook.fs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ module Workbook =

let getInvestigationMetadataSheet (useLastSheetOnIncorrectName: bool) investigation =
try
FsWorkbook.getWorksheetByName "isa_investigation" investigation
FsWorkbook.getWorksheetByName Globals.INVESTIGATION_METADATA_SHEET_NAME investigation
with _ ->
if useLastSheetOnIncorrectName then
FsWorkbook.getWorksheets investigation
Expand All @@ -19,22 +19,22 @@ module Workbook =

/// Returns the metadata worksheet of a study workbook.
///
/// The sheet named `Globals.STUDY_OBSOLETE_METADATA_SHEET_NAME` is tried first; only when it
/// is absent is the sheet named `Globals.STUDY_METADATA_SHEET_NAME` looked up.
/// If the lookup raises and `useLastSheetOnIncorrectName` is true, the last worksheet of the
/// workbook is returned instead; otherwise the function fails with a message naming both
/// expected sheet names.
let getStudyMetadataSheet (useLastSheetOnIncorrectName: bool) study =
    try
        // `Option.defaultWith` defers the second lookup until it is actually needed.
        // With `Option.defaultValue` the fallback `getWorksheetByName` call was evaluated
        // eagerly, so a workbook containing only the obsolete sheet name still triggered it
        // and (when that lookup raises) ended up in the handler below, discarding the sheet
        // that had already been found.
        FsWorkbook.tryGetWorksheetByName Globals.STUDY_OBSOLETE_METADATA_SHEET_NAME study
        |> Option.defaultWith (fun () -> FsWorkbook.getWorksheetByName Globals.STUDY_METADATA_SHEET_NAME study)
    with _ ->
        if useLastSheetOnIncorrectName then
            FsWorkbook.getWorksheets study
            |> Seq.last
        else
            failwith $"No worksheet named {Globals.STUDY_OBSOLETE_METADATA_SHEET_NAME} or {Globals.STUDY_METADATA_SHEET_NAME} found in the workbook"

/// Returns the metadata worksheet of an assay workbook.
///
/// The sheet named `Globals.ASSAY_OBSOLETE_METADATA_SHEET_NAME` is tried first; only when it
/// is absent is the sheet named `Globals.ASSAY_METADATA_SHEET_NAME` looked up.
/// If the lookup raises and `useLastSheetOnIncorrectName` is true, the last worksheet of the
/// workbook is returned instead; otherwise the function fails with a message naming both
/// expected sheet names.
let getAssayMetadataSheet (useLastSheetOnIncorrectName: bool) assay =
    try
        // `Option.defaultWith` defers the second lookup until it is actually needed.
        // With `Option.defaultValue` the fallback `getWorksheetByName` call was evaluated
        // eagerly, so a workbook containing only the obsolete sheet name still triggered it
        // and (when that lookup raises) ended up in the handler below, discarding the sheet
        // that had already been found.
        FsWorkbook.tryGetWorksheetByName Globals.ASSAY_OBSOLETE_METADATA_SHEET_NAME assay
        |> Option.defaultWith (fun () -> FsWorkbook.getWorksheetByName Globals.ASSAY_METADATA_SHEET_NAME assay)
    with _ ->
        if useLastSheetOnIncorrectName then
            FsWorkbook.getWorksheets assay
            |> Seq.last
        else
            failwith $"No worksheet named {Globals.ASSAY_OBSOLETE_METADATA_SHEET_NAME} or {Globals.ASSAY_METADATA_SHEET_NAME} found in the workbook"
Binary file not shown.
Binary file not shown.
31 changes: 30 additions & 1 deletion tests/ARCTokenization.Tests/IntegrationTests/AssayMetadata.fs
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,33 @@ module AssayMetadata =

// One CvParam per non-obsolete, non-root assay metadata term; each is given the
// "Metadata Section Key" CvTerm (AGMO:00000001) as its value and an empty list as the
// third constructor argument.
// NOTE(review): the two identical `List.map` lines below appear to be the removed and the
// added side of the same diff line (presumably a whitespace/end-of-file change) — confirm.
let allExpectedMetadataTermsEmpty =
Terms.AssayMetadata.nonObsoleteNonRootCvTerms
|> List.map (fun p -> CvParam(p, ParamValue.CvValue (CvTerm.create(accession = "AGMO:00000001", name = "Metadata Section Key", ref = "AGMO")), []))
|> List.map (fun p -> CvParam(p, ParamValue.CvValue (CvTerm.create(accession = "AGMO:00000001", name = "Metadata Section Key", ref = "AGMO")), []))

open ARCTokenization.StructuralOntology

/// Expected tokens for the simple assay fixture: the mocked assay metadata tokens with the
/// values listed below, flattened into a single list.
let allExpectedMetadataTermsFull =
    let groupedTokens =
        ARCMock.AssayMetadataTokens(
            Assay_File_Name = [@"measurement1\isa.assay.xlsx"],
            Assay_Performer_First_Name = ["Oliver"; "Marius"],
            Assay_Performer_Last_Name = ["Maus"; "Katz"],
            Assay_Performer_Mid_Initials = [""; "G."],
            Assay_Performer_Email = ["maus@nfdi4plants.org"],
            Assay_Performer_Affiliation = ["RPTU University of Kaiserslautern"],
            Assay_Performer_Roles = ["research assistant"],
            Assay_Performer_Roles_Term_Accession_Number = ["http://purl.org/spar/scoro/research-assistant"],
            Assay_Performer_Roles_Term_Source_REF = ["scoro"]
        )
    // The mock yields a list of lists; the tests compare against one flat list.
    List.concat groupedTokens

// Parses the simple assay fixture through the token-based entry point and compares the
// first parsed result pairwise, in order, with the expected tokens.
// The test was previously named "Simple study ..." although it exercises the assay fixture
// and the Assay parser; the name now matches what is tested.
[<Fact>]
let ``Simple assay is parsed from filepath CvParam with all structural ontology terms in order`` () =
    // File-path token pointing at the assay fixture.
    let fakePath = CvParam(cvTerm = AFSO.``File Path``, v = "Fixtures/correct/assay_simple.xlsx")
    let actual =
        [fakePath]
        |> Assay.parseMetadataSheetsFromTokens(
            FileName = "assay_simple.xlsx"
        )
        |> Seq.head
    // List.zip raises when the two lists differ in length, so a count mismatch also fails the test.
    Assert.All((List.zip allExpectedMetadataTermsFull actual), (fun (expected,actual) ->
        CvParam.structuralEquality (expected) (actual :?> CvParam)
    ))
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ module InvestigationMetadata =

open TestUtils

// Module-level parse results for the empty (incorrect) and the simple (correct)
// investigation fixtures.
// NOTE(review): each binding appears twice because this span shows both sides of the diff;
// the `parseMetadataSheetFromFile()` forms look like the post-commit call shape — confirm.
let parsedInvestigationMetadataEmpty = Investigation.parseMetadataSheetFromFile "Fixtures/incorrect/investigation_empty.xlsx"
let parsedInvestigationMetadataSimple = Investigation.parseMetadataSheetFromFile "Fixtures/correct/investigation_simple.xlsx"
let parsedInvestigationMetadataEmpty = Investigation.parseMetadataSheetFromFile() "Fixtures/incorrect/investigation_empty.xlsx"
let parsedInvestigationMetadataSimple = Investigation.parseMetadataSheetFromFile() "Fixtures/correct/investigation_simple.xlsx"

let allExpectedMetadataTermsEmpty =
Terms.InvestigationMetadata.nonObsoleteNonRootCvTerms
Expand Down Expand Up @@ -55,4 +55,20 @@ module InvestigationMetadata =
let ``Simple investigation is parsed with all structural ontology terms in order`` () =
Assert.All((List.zip allExpectedMetadataTermsFull parsedInvestigationMetadataSimple), (fun (expected,actual) ->
CvParam.structuralEquality (expected) (actual :?> CvParam)
))

open ARCTokenization.StructuralOntology

// Parses the simple investigation fixture through the token-based entry point and compares
// the first parsed result pairwise, in order, with the expected tokens.
[<Fact>]
let ``Simple investigation is parsed from filepath CvParam with all structural ontology terms in order`` () =
    // File-path token pointing at the investigation fixture.
    let filePathToken = CvParam(cvTerm = AFSO.``File Path``, v = "Fixtures/correct/investigation_simple.xlsx")
    let parsedTokens =
        [filePathToken]
        |> Investigation.parseMetadataSheetsFromTokens(FileName = "investigation_simple.xlsx")
        |> Seq.head
    // List.zip raises when the two lists differ in length.
    let expectedWithParsed = List.zip allExpectedMetadataTermsFull parsedTokens
    Assert.All(expectedWithParsed, (fun (expectedToken, parsedToken) ->
        CvParam.structuralEquality expectedToken (parsedToken :?> CvParam)
    ))
26 changes: 25 additions & 1 deletion tests/ARCTokenization.Tests/IntegrationTests/StudyMetadata.fs
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,28 @@ module StudyMetadata =

// One CvParam per non-obsolete, non-root study metadata term; each is given the
// "Metadata Section Key" CvTerm (AGMO:00000001) as its value and an empty list as the
// third constructor argument.
// NOTE(review): the two identical `List.map` lines below appear to be the removed and the
// added side of the same diff line (presumably a whitespace/end-of-file change) — confirm.
let allExpectedMetadataTermsEmpty =
Terms.StudyMetadata.nonObsoleteNonRootCvTerms
|> List.map (fun p -> CvParam(p, ParamValue.CvValue (CvTerm.create(accession = "AGMO:00000001", name = "Metadata Section Key", ref = "AGMO")), []))
|> List.map (fun p -> CvParam(p, ParamValue.CvValue (CvTerm.create(accession = "AGMO:00000001", name = "Metadata Section Key", ref = "AGMO")), []))

/// Expected tokens for the simple study fixture: the mocked study metadata tokens with the
/// values listed below, flattened into a single list.
let allExpectedMetadataTermsFull =
    let groupedTokens =
        ARCMock.StudyMetadataTokens(
            Study_Identifier = ["experiment1_material"],
            Study_Title = ["Prototype for experimental data"],
            Study_Description = ["In this a devised study to have an exemplary experimental material description."],
            Study_File_Name = [@"experiment1_material\isa.study.xlsx"]
        )
    // The mock yields a list of lists; the tests compare against one flat list.
    List.concat groupedTokens

open ARCTokenization.StructuralOntology

// Parses the simple study fixture through the token-based entry point and compares the
// first parsed result pairwise, in order, with the expected tokens.
[<Fact>]
let ``Simple study is parsed from filepath CvParam with all structural ontology terms in order`` () =
    // File-path token pointing at the study fixture.
    let filePathToken = CvParam(cvTerm = AFSO.``File Path``, v = "Fixtures/correct/study_simple.xlsx")
    let parsedTokens =
        [filePathToken]
        |> Study.parseMetadataSheetsFromTokens(FileName = "study_simple.xlsx")
        |> Seq.head
    // List.zip raises when the two lists differ in length.
    let expectedWithParsed = List.zip allExpectedMetadataTermsFull parsedTokens
    Assert.All(expectedWithParsed, (fun (expectedToken, parsedToken) ->
        CvParam.structuralEquality expectedToken (parsedToken :?> CvParam)
    ))

0 comments on commit 49bbb27

Please sign in to comment.