Skip to content

Commit

Permalink
Tagging: throw out old data
Browse files: browse the repository at this point in the history
Morphology: automatically compress resources on build
  • Loading branch information
pdonald committed Feb 28, 2014
1 parent dae419d commit 4fa47d9
Show file tree
Hide file tree
Showing 22 changed files with 1,662 additions and 50,603 deletions.
22 changes: 8 additions & 14 deletions Latvian.LuMii.Tests/Latvian.LuMii.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -74,33 +74,27 @@
<Compile Include="Morphology\LuMiiMorphologyTests.cs" />
<Compile Include="Tagging\LuMiiTaggerTests.cs" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="Tagging\Resources\MorphoTest.txt" />
<EmbeddedResource Include="Tagging\Resources\MorphoTrain.txt" />
<EmbeddedResource Include="Tagging\Resources\TaggerTest.txt" />
<EmbeddedResource Include="Tagging\Resources\TaggerTrain.txt" />
</ItemGroup>
<ItemGroup>
<Service Include="{82A7F48D-3B50-4B1E-B82E-3ADA8210C358}" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="Tagging\Resources\Morpho2Test.txt" />
<EmbeddedResource Include="Tagging\Resources\Morpho2Train.txt" />
<EmbeddedResource Include="Tagging\Resources\Tagger2Test.txt" />
<EmbeddedResource Include="Tagging\Resources\Tagger2Train.txt" />
<EmbeddedResource Include="Tagging\Resources\Analyzed1Test.txt" />
<EmbeddedResource Include="Tagging\Resources\Analyzed2Train.txt" />
</ItemGroup>
<ItemGroup>
<Compile Include="Tokenization\LuMiiTokenizerTests.cs" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="Tagging\Resources\Tagger3Test.txt" />
<EmbeddedResource Include="Tagging\Resources\Tagger3Train.txt" />
<EmbeddedResource Include="Tagging\Resources\Analyzed1Train.txt" />
<EmbeddedResource Include="Tagging\Resources\Analyzed2Test.txt" />
<EmbeddedResource Include="Tagging\Resources\LVTaggerTest.txt" />
<EmbeddedResource Include="Tagging\Resources\LVTaggerTrain.txt" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<Target Name="BeforeBuild">
<Copy SourceFiles="$(SolutionDir)\LU MII Morphology\morphology\src\main\resources\train.txt" DestinationFiles="$(MSBuildProjectDirectory)\Tagging\Resources\MorphoTrain.txt" Condition="!Exists('$(MSBuildProjectDirectory)\Tagging\Resources\MorphoTrain.txt')" />
<!-- <Copy SourceFiles="$(SolutionDir)\LU MII Morphology\morphology\src\main\resources\train.txt" DestinationFiles="$(MSBuildProjectDirectory)\Tagging\Resources\MorphoTrain.txt" Condition="!Exists('$(MSBuildProjectDirectory)\Tagging\Resources\MorphoTrain.txt')" />
<Copy SourceFiles="$(SolutionDir)\LU MII Morphology\morphology\src\main\resources\test.txt" DestinationFiles="$(MSBuildProjectDirectory)\Tagging\Resources\MorphoTest.txt" Condition="!Exists('$(MSBuildProjectDirectory)\Tagging\Resources\MorphoTest.txt')" />
<!-- <Copy SourceFiles="$(SolutionDir)\LU MII Tagger\MorphoCRF\train.txt" DestinationFiles="$(MSBuildProjectDirectory)\Resources\TaggerTrain.txt" Condition="!Exists('$(MSBuildProjectDirectory)\Resources\TaggerTrain.txt')" />
<Copy SourceFiles="$(SolutionDir)\LU MII Tagger\MorphoCRF\train.txt" DestinationFiles="$(MSBuildProjectDirectory)\Resources\TaggerTrain.txt" Condition="!Exists('$(MSBuildProjectDirectory)\Resources\TaggerTrain.txt')" />
<Copy SourceFiles="$(SolutionDir)\LU MII Tagger\MorphoCRF\test.txt" DestinationFiles="$(MSBuildProjectDirectory)\Resources\TaggerTest.txt" Condition="!Exists('$(MSBuildProjectDirectory)\Resources\TaggerTest.txt')" /> -->
</Target>
</Project>
64 changes: 26 additions & 38 deletions Latvian.LuMii.Tests/Tagging/LuMiiTaggerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -30,35 +30,32 @@ namespace Latvian.Tests.Tagging
[TestFixture]
public class LuMiiTaggerTests
{
private const string MorphoTrain = "Latvian.LuMii.Tests.Tagging.Resources.MorphoTrain.txt";
private const string MorphoTest = "Latvian.LuMii.Tests.Tagging.Resources.MorphoTest.txt";
private const string Morpho2Train = "Latvian.LuMii.Tests.Tagging.Resources.Morpho2Train.txt";
private const string Morpho2Test = "Latvian.LuMii.Tests.Tagging.Resources.Morpho2Test.txt";
private const string TaggerTrain = "Latvian.LuMii.Tests.Tagging.Resources.TaggerTrain.txt";
private const string TaggerTest = "Latvian.LuMii.Tests.Tagging.Resources.TaggerTest.txt";
private const string Tagger2Train = "Latvian.LuMii.Tests.Tagging.Resources.Tagger2Train.txt";
private const string Tagger2Test = "Latvian.LuMii.Tests.Tagging.Resources.Tagger2Test.txt";
private const string Tagger3Train = "Latvian.LuMii.Tests.Tagging.Resources.Tagger3Train.txt";
private const string Tagger3Test = "Latvian.LuMii.Tests.Tagging.Resources.Tagger3Test.txt";

private static readonly string[] ModelLatestData = new[] { Morpho2Train, Morpho2Test };
private const string Analyzed1Train = "Latvian.LuMii.Tests.Tagging.Resources.Analyzed1Train.txt";
private const string Analyzed1Test = "Latvian.LuMii.Tests.Tagging.Resources.Analyzed1Test.txt";
private const string Analyzed2Train = "Latvian.LuMii.Tests.Tagging.Resources.Analyzed2Train.txt";
private const string Analyzed2Test = "Latvian.LuMii.Tests.Tagging.Resources.Analyzed2Test.txt";
private const string LVTaggerTrain = "Latvian.LuMii.Tests.Tagging.Resources.LVTaggerTrain.txt";
private const string LVTaggerTest = "Latvian.LuMii.Tests.Tagging.Resources.LVTaggerTest.txt";
private const string LVTaggerDev = "Latvian.LuMii.Tests.Tagging.Resources.LVTaggerDev.txt";

private static readonly string[] ModelLatestData = new[] { Analyzed2Train, Analyzed2Test };

[Test]
public void CrossValidation_10Fold_Morpho()
{
CrossValidation("morpho", 0.93, 10, LoadAnalyzedCorpus(MorphoTrain), LoadAnalyzedCorpus(MorphoTest));
CrossValidation("morpho", 0.93, 10, LoadAnalyzedCorpus(Analyzed1Train), LoadAnalyzedCorpus(Analyzed1Test));
}

[Test]
public void CrossValidation_10Fold_Morpho2()
{
CrossValidation("morpho2", 0.93, 10, LoadAnalyzedCorpus(Morpho2Train), LoadAnalyzedCorpus(Morpho2Test));
CrossValidation("morpho2", 0.93, 10, LoadAnalyzedCorpus(Analyzed2Train), LoadAnalyzedCorpus(Analyzed2Test));
}

[Test]
public void CrossValidation_10Fold_Tagger()
{
CrossValidation("tagger data", 0.92, 10, LoadUnanalyzedCorpus(TaggerTrain), LoadUnanalyzedCorpus(TaggerTest));
CrossValidation("tagger data", 0.92, 10, LoadUnanalyzedCorpus(LVTaggerTrain), LoadUnanalyzedCorpus(LVTaggerTest));
}

public void CrossValidation(string name, double minAccuracy, int folds, params IEnumerable<Sentence>[] sentences)
Expand Down Expand Up @@ -86,33 +83,27 @@ public void CrossValidation(string name, double minAccuracy, int folds, params I
}

[Test]
public void Split_Morpho()
public void Analyzed1_TrainTest()
{
Split("morpho", 0.93, LoadAnalyzedCorpus(MorphoTrain), LoadAnalyzedCorpus(MorphoTest));
Split("analyzed1", 0.93, LoadAnalyzedCorpus(Analyzed1Train), LoadAnalyzedCorpus(Analyzed1Test));
}

[Test]
public void Split_Morpho2()
public void Analyzed2_TrainTest()
{
Split("morpho2", 0.93, LoadAnalyzedCorpus(Morpho2Train), LoadAnalyzedCorpus(Morpho2Test));
Split("analyzed2", 0.93, LoadAnalyzedCorpus(Analyzed2Train), LoadAnalyzedCorpus(Analyzed2Test));
}

[Test]
public void Split_Tagger()
public void LVTagger_TrainTest()
{
Split("tagger", 0.92, LoadUnanalyzedCorpus(TaggerTrain), LoadUnanalyzedCorpus(TaggerTest));
Split("lvtagger train/test", 0.92, LoadUnanalyzedCorpus(LVTaggerTrain), LoadUnanalyzedCorpus(LVTaggerTest));
}

[Test]
public void Split_Tagger2()
public void LVTagger_TrainTest_ValidOnly()
{
Split("tagger2", 0.92, LoadAnalyzedCorpus(Tagger2Train), LoadAnalyzedCorpus(Tagger2Test));
}

[Test]
public void Split_Tagger3()
{
Split("tagger3", 0.92, LoadUnanalyzedCorpus(Tagger3Train), LoadUnanalyzedCorpus(Tagger3Test));
Split("lvtagger train/test (correct possible tags only)", 0.92, LoadUnanalyzedCorpus(LVTaggerTrain, true), LoadUnanalyzedCorpus(LVTaggerTest, true));
}

public void Split(string name, double minAccuracy, Sentence[] train, Sentence[] test)
Expand Down Expand Up @@ -159,8 +150,8 @@ public void Split(string name, double minAccuracy, Sentence[] train, Sentence[]
[Test]
public void LoadSave()
{
string trainResource = Morpho2Train;
string testResource = Morpho2Test;
string trainResource = Analyzed2Train;
string testResource = Analyzed2Test;
double minAccuracy = 0.93;

Sentence[] train = LoadAnalyzedCorpus(trainResource);
Expand Down Expand Up @@ -216,8 +207,8 @@ public void Model_Latest()
[Test]
public void TagSpeed()
{
string trainResource = Morpho2Train;
string testResource = Morpho2Test;
string trainResource = Analyzed2Train;
string testResource = Analyzed2Test;
int maxTokenCount = 1000000;
double minAccuracy = 0.93;

Expand Down Expand Up @@ -278,7 +269,7 @@ private Sentence[] LoadAnalyzedCorpus(string resourceName)
return corpus.Load(stream).ToArray();
}

private Sentence[] LoadUnanalyzedCorpus(string resourceName)
private Sentence[] LoadUnanalyzedCorpus(string resourceName, bool ignoreIncorrect = false)
{
LuMiiCorpus corpus = new LuMiiCorpus();
LuMiiMorphology morphology = new LuMiiMorphology();
Expand All @@ -303,16 +294,13 @@ private Sentence[] LoadUnanalyzedCorpus(string resourceName)
Tag[] possibleTags = morphology.Analyze(token.TextTrueCase).ToArray();

if (!possibleTags.Any(t => t.Equals(token.CorrectTag)))
{
ignore = true;
break;
}

Token analyzedToken = new Token(token.TextTrueCase, possibleTags, token.CorrectTag, analyzedSentence);
analyzedSentence.Add(analyzedToken);
}

if (!ignore)
if (!ignoreIncorrect || !ignore)
{
goodSentences.Add(analyzedSentence);
}
Expand Down
Loading

0 comments on commit 4fa47d9

Please sign in to comment.