-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
16 changed files
with
391 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
############################################################################### | ||
# Set default behavior to automatically normalize line endings. | ||
############################################################################### | ||
* text=auto | ||
|
||
############################################################################### | ||
# Set default behavior for command prompt diff. | ||
# | ||
# This is needed for earlier builds of msysgit that do not have it on by | ||
# default for csharp files. | ||
# Note: This is only used by command line | ||
############################################################################### | ||
#*.cs diff=csharp | ||
|
||
############################################################################### | ||
# Set the merge driver for project and solution files | ||
# | ||
# Merging from the command prompt will add diff markers to the files if there | ||
# are conflicts (Merging from VS is not affected by the settings below, in VS | ||
# the diff markers are never inserted). Diff markers may cause the following | ||
# file extensions to fail to load in VS. An alternative would be to treat | ||
# these files as binary and thus will always conflict and require user | ||
# intervention with every merge. To do so, just uncomment the entries below | ||
############################################################################### | ||
#*.sln merge=binary | ||
#*.csproj merge=binary | ||
#*.vbproj merge=binary | ||
#*.vcxproj merge=binary | ||
#*.vcproj merge=binary | ||
#*.dbproj merge=binary | ||
#*.fsproj merge=binary | ||
#*.lsproj merge=binary | ||
#*.wixproj merge=binary | ||
#*.modelproj merge=binary | ||
#*.sqlproj merge=binary | ||
#*.wwaproj merge=binary | ||
|
||
############################################################################### | ||
# behavior for image files | ||
# | ||
# image files are treated as binary by default. | ||
############################################################################### | ||
#*.jpg binary | ||
#*.png binary | ||
#*.gif binary | ||
|
||
############################################################################### | ||
# diff behavior for common document formats | ||
# | ||
# Convert binary document formats to text before diffing them. This feature | ||
# is only available from the command line. Turn it on by uncommenting the | ||
# entries below. | ||
############################################################################### | ||
#*.doc diff=astextplain | ||
#*.DOC diff=astextplain | ||
#*.docx diff=astextplain | ||
#*.DOCX diff=astextplain | ||
#*.dot diff=astextplain | ||
#*.DOT diff=astextplain | ||
#*.pdf diff=astextplain | ||
#*.PDF diff=astextplain | ||
#*.rtf diff=astextplain | ||
#*.RTF diff=astextplain |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
|
||
Microsoft Visual Studio Solution File, Format Version 12.00 | ||
# Visual Studio Version 16 | ||
VisualStudioVersion = 16.0.28803.452 | ||
MinimumVisualStudioVersion = 10.0.40219.1 | ||
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DataConveyer_AggregateTokens", "DataConveyer_AggregateTokens\DataConveyer_AggregateTokens.csproj", "{23C7A24F-EBF7-4EB6-A5E8-5697B5371416}" | ||
EndProject | ||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{8621A13C-0694-4941-BA8E-5E4191C25466}" | ||
ProjectSection(SolutionItems) = preProject | ||
README.md = README.md | ||
EndProjectSection | ||
EndProject | ||
Global | ||
GlobalSection(SolutionConfigurationPlatforms) = preSolution | ||
Debug|Any CPU = Debug|Any CPU | ||
Release|Any CPU = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(ProjectConfigurationPlatforms) = postSolution | ||
{23C7A24F-EBF7-4EB6-A5E8-5697B5371416}.Debug|Any CPU.ActiveCfg = Debug|Any CPU | ||
{23C7A24F-EBF7-4EB6-A5E8-5697B5371416}.Debug|Any CPU.Build.0 = Debug|Any CPU | ||
{23C7A24F-EBF7-4EB6-A5E8-5697B5371416}.Release|Any CPU.ActiveCfg = Release|Any CPU | ||
{23C7A24F-EBF7-4EB6-A5E8-5697B5371416}.Release|Any CPU.Build.0 = Release|Any CPU | ||
EndGlobalSection | ||
GlobalSection(SolutionProperties) = preSolution | ||
HideSolutionNode = FALSE | ||
EndGlobalSection | ||
GlobalSection(ExtensibilityGlobals) = postSolution | ||
SolutionGuid = {8A91F2F5-CE7F-4D13-BCCB-EB85DB2C3A2E} | ||
EndGlobalSection | ||
EndGlobal |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
<Tokens> | ||
<Token color="blue">5</Token> | ||
<Token color="red">2</Token> | ||
<Token color="white">11</Token> | ||
<Token color="green">7</Token> | ||
<Token color="orange">14</Token> | ||
</Tokens> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
<Tokens> | ||
<Token color="white">15</Token> | ||
<Token color="purple">12</Token> | ||
<Token color="yellow">3</Token> | ||
<Token color="red">8</Token> | ||
<Token color="brown">12</Token> | ||
<Token color="green">4</Token> | ||
<Token color="orange">16</Token> | ||
</Tokens> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
<Tokens> | ||
<Token color="black">4</Token> | ||
<Token color="silver">18</Token> | ||
<Token color="bronze">1</Token> | ||
<Token color="gold">3</Token> | ||
</Tokens> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
<Tokens> | ||
<Token color="brown">2</Token> | ||
<Token color="green">14</Token> | ||
<Token color="black">7</Token> | ||
<Token color="orange">5</Token> | ||
<Token color="blue">5</Token> | ||
<Token color="white">10</Token> | ||
</Tokens> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
<Tokens> | ||
<Token color="blue">12</Token> | ||
<Token color="white">6</Token> | ||
<Token color="green">17</Token> | ||
<Token color="orange">9</Token> | ||
<Token color="purple">16</Token> | ||
<Token color="yellow">13</Token> | ||
<Token color="red">18</Token> | ||
<Token color="brown">11</Token> | ||
<Token color="black">14</Token> | ||
<Token color="gold">8</Token> | ||
<Token color="silver">6</Token> | ||
<Token color="bronze">2</Token> | ||
</Tokens> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
<Tokens> | ||
<Token color="purple">6</Token> | ||
<Token color="yellow">3</Token> | ||
<Token color="white">8</Token> | ||
</Tokens> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
<Tokens> | ||
<Token color="yellow">4</Token> | ||
<Token color="red">1</Token> | ||
<Token color="brown">13</Token> | ||
<Token color="silver">4</Token> | ||
</Tokens> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
<Tokens> | ||
<Token color="gold">12</Token> | ||
<Token color="white">14</Token> | ||
</Tokens> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
<Tokens> | ||
<Token color="yellow">1</Token> | ||
<Token color="white">9</Token> | ||
<Token color="green">7</Token> | ||
<Token color="gold">18</Token> | ||
<Token color="silver">15</Token> | ||
</Tokens> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
<Tokens> | ||
<Token color="orange">4</Token> | ||
<Token color="purple">6</Token> | ||
<Token color="yellow">6</Token> | ||
<Token color="blue">11</Token> | ||
<Token color="white">16</Token> | ||
</Tokens> |
12 changes: 12 additions & 0 deletions
12
DataConveyer_AggregateTokens/DataConveyer_AggregateTokens.csproj
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
|
||
<PropertyGroup> | ||
<OutputType>Exe</OutputType> | ||
<TargetFramework>netcoreapp2.1</TargetFramework> | ||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<PackageReference Include="DataConveyer" Version="3.0.1" /> | ||
</ItemGroup> | ||
|
||
</Project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
// Copyright © 2019 Mavidian Technologies Limited Liability Company. All Rights Reserved. | ||
|
||
using Mavidian.DataConveyer.Common; | ||
using Mavidian.DataConveyer.Entities.KeyVal; | ||
using Mavidian.DataConveyer.Logging; | ||
using Mavidian.DataConveyer.Orchestrators; | ||
using System.Collections.Concurrent; | ||
using System.Collections.Generic; | ||
using System.IO; | ||
using System.Linq; | ||
using System.Threading.Tasks; | ||
|
||
namespace DataConveyer_AggregateTokens | ||
{ | ||
/// <summary>
/// Represents Data Conveyer functionality specific to aggregating values in a series of XML files.
/// </summary>
internal class FileProcessor
{
    // Name of the single global cache element; it holds a dictionary keyed by token color.
    private const string TokenSummaryKey = "TokenSummary";

    private readonly IOrchestrator _orchestrator;

    private readonly string _inLocation;

    /// <summary>
    /// Configures a Data Conveyer orchestrator that reads all *.xml files found in the input
    /// folder and sends aggregated token data to a delimited output file.
    /// </summary>
    /// <param name="inLocation">Folder that is searched for *.xml input files.</param>
    /// <param name="outFile">Name of the delimited output file.</param>
    internal FileProcessor(string inLocation, string outFile)
    {
        _inLocation = inLocation;

        var config = new OrchestratorConfig()
        //To facilitate troubleshooting logging, data can be sent to a DataConveyer.log file:
        //var config = new OrchestratorConfig(LoggerCreator.CreateLogger(LoggerType.LogFile, "AggregateTokens", LogEntrySeverity.Information))
        {
            GlobalCacheElements = new string[] { TokenSummaryKey },  //a single element - a dictionary - Dict<string,(int count,int total)>
            InputDataKind = KindOfTextData.XML,
            IntakeReaders = () => Directory.GetFiles(_inLocation, "*.xml").Select(f => File.OpenText(f)),  //note that we're neglecting to dispose the stream readers here (not a production code)
            XmlJsonIntakeSettings = "RecordNode|Token,IncludeExplicitText|true",
            ExplicitTypeDefinitions = "__explicitText__|I",  //in our case, explicit text in Token node contains integer value
            ClusterMarker = (rec, pRec, n) => pRec == null || rec.SourceNo != pRec.SourceNo,  //each file (source) constitutes a cluster
            MarkerStartsCluster = true,  //predicate (marker) matches the first record in cluster
            AppendFootCluster = true,  //to contain summarized token data
            AllowOnTheFlyInputFields = true,
            ConcurrencyLevel = 4,
            TransformerType = TransformerType.Universal,
            UniversalTransformer = CumulateTokenData,
            AllowTransformToAlterFields = true,
            OutputDataKind = KindOfTextData.Delimited,
            HeadersInFirstOutputRow = true,
            OutputFileName = outFile
        };

        _orchestrator = OrchestratorCreator.GetEtlOrchestrator(config);
    }

    /// <summary>
    /// Execute Data Conveyer process. The orchestrator is disposed afterwards, so this
    /// method is intended to be called once per instance.
    /// </summary>
    /// <returns>Task containing the process results.</returns>
    internal async Task<ProcessResult> ProcessFileAsync()
    {
        try
        {
            return await _orchestrator.ExecuteAsync();
        }
        finally
        {
            //Dispose in finally, so that the orchestrator is released even if execution throws.
            _orchestrator.Dispose();
        }
    }

    /// <summary>
    /// Universal transformer to cumulate token data in global cache and remove cluster from output.
    /// In case of foot cluster, obtain summary data from global cache and prepare output.
    /// </summary>
    /// <param name="cluster">Cluster to process: either the records of one input file or the foot cluster.</param>
    /// <returns>Nothing (i.e. clusters are filtered out), except for the foot cluster (which contains summary data).</returns>
    private IEnumerable<ICluster> CumulateTokenData(ICluster cluster)
    {
        //A single element global cache containing a dictionary, where
        // Key = color
        // Value = tuple consisting of count and total (i.e. cumulated __explicitText__)
        var gc = cluster.GlobalCache;

        //Initialize the dictionary during the 1st pass (an already existing dictionary is left in place)
        gc.ReplaceValue(TokenSummaryKey, (ConcurrentDictionary<string, (int count, int total)> t) => t ?? new ConcurrentDictionary<string, (int count, int total)>());

        var tokenSummary = (ConcurrentDictionary<string, (int count, int total)>)gc[TokenSummaryKey];

        if (cluster.StartRecNo == Constants.FootClusterRecNo)
        {  //Foot cluster
            //Note that Data Conveyer guarantees that foot cluster will be processed AFTER all other clusters have been processed.

            //Populate the foot cluster with summary data to output (one record per color)
            foreach (var entry in tokenSummary)
            {
                (int count, int total) = entry.Value;

                var footRec = cluster.ObtainEmptyRecord();
                footRec.AddItem("Color", entry.Key);
                footRec.AddItem("Count", count);
                footRec.AddItem("Total", total);
                //Invariant culture keeps the decimal point a '.', regardless of the machine's regional
                //settings (a ',' decimal separator would clash with the delimited output).
                footRec.AddItem("Average", ((double)total / count).ToString("#0.0", System.Globalization.CultureInfo.InvariantCulture));

                cluster.AddRecord(footRec);
            }

            return Enumerable.Repeat(cluster, 1);
        }

        //Regular cluster - cumulate data in the global cache
        foreach (var rec in cluster.Records)
        {
            var color = (string)rec["color"];
            var value = (int)rec["__explicitText__"];

            //First token of a given color starts at (1, value); subsequent ones bump the count and add to the total
            tokenSummary.AddOrUpdate(color, (1, value), (c, t) => (t.count + 1, t.total + value));
        }

        return Enumerable.Empty<ICluster>();  //no data from regular cluster is sent to output
    }
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
// Copyright © 2019 Mavidian Technologies Limited Liability Company. All Rights Reserved. | ||
|
||
using Mavidian.DataConveyer.Common; | ||
using Mavidian.DataConveyer.Orchestrators; | ||
using System; | ||
using System.Diagnostics; | ||
using System.IO; | ||
|
||
namespace DataConveyer_AggregateTokens | ||
{ | ||
/// <summary>
/// Console entry point: reports the input/output locations, runs the token aggregation
/// and prints a summary of the outcome.
/// </summary>
class Program
{
    // Location of Data Conveyer input (relative path, resolved by Path.GetFullPath in Main).
    // Built with Path.Combine, so that the separators are correct on every platform;
    // on Windows this yields the same "..\..\..\Data" value as before.
    private static readonly string InputFolder = Path.Combine("..", "..", "..", "Data");

    static void Main()
    {
        var asmName = System.Reflection.Assembly.GetExecutingAssembly().GetName();
        var inputLocation = Path.GetFullPath(InputFolder);
        var outputFile = Path.Combine(inputLocation, "TokenAggregates.csv");
        Console.WriteLine($"{asmName.Name} v{asmName.Version} started execution on {DateTime.Now:MM-dd-yyyy a\\t hh:mm:ss tt}");
        Console.WriteLine($"DataConveyer library used: {ProductInfo.CurrentInfo.ToString()}");
        Console.WriteLine();
        Console.WriteLine("This application reads all XML files located in and input folder and aggregates tokens they contain.");
        Console.WriteLine();
        Console.WriteLine($"Input location : {inputLocation}");
        Console.WriteLine($"Output file: {outputFile}");
        Console.WriteLine();

        Console.WriteLine("Hit any key to start.");
        Console.ReadKey();

        Console.WriteLine("Processing started...'.");

        var processor = new FileProcessor(inputLocation, outputFile);

        var stopWatch = Stopwatch.StartNew();
        // GetAwaiter().GetResult() blocks like .Result does, but surfaces the original
        // exception instead of wrapping it in an AggregateException.
        var result = processor.ProcessFileAsync().GetAwaiter().GetResult();
        stopWatch.Stop();

        if (result.CompletionStatus == CompletionStatus.IntakeDepleted)
        {
            Console.WriteLine($"Processing completed in {stopWatch.Elapsed.TotalSeconds.ToString("##0.000")}s; there were {result.RowsRead} tokens identified in {result.ClustersRead - 1} files.");  // -1 for foot cluster
        }
        else
        {
            Console.WriteLine("Oops! Processing resulted in unexpected status of " + result.CompletionStatus.ToString());
        }
        Console.WriteLine();

        Console.WriteLine("Hit Enter key to exit.");
        Console.ReadLine();
    }
}
} |
Oops, something went wrong.