Skip to content

Commit

Permalink
Complete solution files.
Browse files Browse the repository at this point in the history
  • Loading branch information
mavidian committed Jun 10, 2019
1 parent 041282e commit 4772cef
Show file tree
Hide file tree
Showing 16 changed files with 391 additions and 1 deletion.
63 changes: 63 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
###############################################################################
# Set default behavior to automatically normalize line endings.
###############################################################################
* text=auto

###############################################################################
# Set default behavior for command prompt diff.
#
# This is needed for earlier builds of msysgit that do not have it on by
# default for csharp files.
# Note: This is only used by the command line
###############################################################################
#*.cs diff=csharp

###############################################################################
# Set the merge driver for project and solution files
#
# Merging from the command prompt will add diff markers to the files if there
# are conflicts (Merging from VS is not affected by the settings below, in VS
# the diff markers are never inserted). Diff markers may cause the following
# file extensions to fail to load in VS. An alternative would be to treat
# these files as binary and thus will always conflict and require user
# intervention with every merge. To do so, just uncomment the entries below
###############################################################################
#*.sln merge=binary
#*.csproj merge=binary
#*.vbproj merge=binary
#*.vcxproj merge=binary
#*.vcproj merge=binary
#*.dbproj merge=binary
#*.fsproj merge=binary
#*.lsproj merge=binary
#*.wixproj merge=binary
#*.modelproj merge=binary
#*.sqlproj merge=binary
#*.wwaproj merge=binary

###############################################################################
# behavior for image files
#
# image files are treated as binary by default.
###############################################################################
#*.jpg binary
#*.png binary
#*.gif binary

###############################################################################
# diff behavior for common document formats
#
# Convert binary document formats to text before diffing them. This feature
# is only available from the command line. Turn it on by uncommenting the
# entries below.
###############################################################################
#*.doc diff=astextplain
#*.DOC diff=astextplain
#*.docx diff=astextplain
#*.DOCX diff=astextplain
#*.dot diff=astextplain
#*.DOT diff=astextplain
#*.pdf diff=astextplain
#*.PDF diff=astextplain
#*.rtf diff=astextplain
#*.RTF diff=astextplain
30 changes: 30 additions & 0 deletions DataConveyer_AggregateTokens.sln
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 16
VisualStudioVersion = 16.0.28803.452
MinimumVisualStudioVersion = 10.0.40219.1
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "DataConveyer_AggregateTokens", "DataConveyer_AggregateTokens\DataConveyer_AggregateTokens.csproj", "{23C7A24F-EBF7-4EB6-A5E8-5697B5371416}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{8621A13C-0694-4941-BA8E-5E4191C25466}"
ProjectSection(SolutionItems) = preProject
README.md = README.md
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{23C7A24F-EBF7-4EB6-A5E8-5697B5371416}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{23C7A24F-EBF7-4EB6-A5E8-5697B5371416}.Debug|Any CPU.Build.0 = Debug|Any CPU
{23C7A24F-EBF7-4EB6-A5E8-5697B5371416}.Release|Any CPU.ActiveCfg = Release|Any CPU
{23C7A24F-EBF7-4EB6-A5E8-5697B5371416}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {8A91F2F5-CE7F-4D13-BCCB-EB85DB2C3A2E}
EndGlobalSection
EndGlobal
7 changes: 7 additions & 0 deletions DataConveyer_AggregateTokens/Data/File_01.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<Tokens>
<Token color="blue">5</Token>
<Token color="red">2</Token>
<Token color="white">11</Token>
<Token color="green">7</Token>
<Token color="orange">14</Token>
</Tokens>
9 changes: 9 additions & 0 deletions DataConveyer_AggregateTokens/Data/File_02.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<Tokens>
<Token color="white">15</Token>
<Token color="purple">12</Token>
<Token color="yellow">3</Token>
<Token color="red">8</Token>
<Token color="brown">12</Token>
<Token color="green">4</Token>
<Token color="orange">16</Token>
</Tokens>
6 changes: 6 additions & 0 deletions DataConveyer_AggregateTokens/Data/File_03.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<Tokens>
<Token color="black">4</Token>
<Token color="silver">18</Token>
<Token color="bronze">1</Token>
<Token color="gold">3</Token>
</Tokens>
8 changes: 8 additions & 0 deletions DataConveyer_AggregateTokens/Data/File_04.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<Tokens>
<Token color="brown">2</Token>
<Token color="green">14</Token>
<Token color="black">7</Token>
<Token color="orange">5</Token>
<Token color="blue">5</Token>
<Token color="white">10</Token>
</Tokens>
14 changes: 14 additions & 0 deletions DataConveyer_AggregateTokens/Data/File_05.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
<Tokens>
<Token color="blue">12</Token>
<Token color="white">6</Token>
<Token color="green">17</Token>
<Token color="orange">9</Token>
<Token color="purple">16</Token>
<Token color="yellow">13</Token>
<Token color="red">18</Token>
<Token color="brown">11</Token>
<Token color="black">14</Token>
<Token color="gold">8</Token>
<Token color="silver">6</Token>
<Token color="bronze">2</Token>
</Tokens>
5 changes: 5 additions & 0 deletions DataConveyer_AggregateTokens/Data/File_06.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<Tokens>
<Token color="purple">6</Token>
<Token color="yellow">3</Token>
<Token color="white">8</Token>
</Tokens>
6 changes: 6 additions & 0 deletions DataConveyer_AggregateTokens/Data/File_07.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
<Tokens>
<Token color="yellow">4</Token>
<Token color="red">1</Token>
<Token color="brown">13</Token>
<Token color="silver">4</Token>
</Tokens>
4 changes: 4 additions & 0 deletions DataConveyer_AggregateTokens/Data/File_08.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<Tokens>
<Token color="gold">12</Token>
<Token color="white">14</Token>
</Tokens>
7 changes: 7 additions & 0 deletions DataConveyer_AggregateTokens/Data/File_09.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<Tokens>
<Token color="yellow">1</Token>
<Token color="white">9</Token>
<Token color="green">7</Token>
<Token color="gold">18</Token>
<Token color="silver">15</Token>
</Tokens>
7 changes: 7 additions & 0 deletions DataConveyer_AggregateTokens/Data/File_10.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<Tokens>
<Token color="orange">4</Token>
<Token color="purple">6</Token>
<Token color="yellow">6</Token>
<Token color="blue">11</Token>
<Token color="white">16</Token>
</Tokens>
12 changes: 12 additions & 0 deletions DataConveyer_AggregateTokens/DataConveyer_AggregateTokens.csproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>netcoreapp2.1</TargetFramework>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="DataConveyer" Version="3.0.1" />
</ItemGroup>

</Project>
116 changes: 116 additions & 0 deletions DataConveyer_AggregateTokens/FileProcessor.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
// Copyright © 2019 Mavidian Technologies Limited Liability Company. All Rights Reserved.

using Mavidian.DataConveyer.Common;
using Mavidian.DataConveyer.Entities.KeyVal;
using Mavidian.DataConveyer.Logging;
using Mavidian.DataConveyer.Orchestrators;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Threading.Tasks;

namespace DataConveyer_AggregateTokens
{
/// <summary>
/// Represents Data Conveyer functionality specific to aggregating values in a series of XML files.
/// Every *.xml file found in the input location is read as a separate cluster; the Token values
/// are cumulated per color and the resulting summary is written to a delimited output file.
/// </summary>
internal class FileProcessor
{
    /// <summary>
    /// Name of the (only) global cache element; it holds the dictionary of per-color aggregates.
    /// </summary>
    private const string TokenSummaryKey = "TokenSummary";

    private readonly IOrchestrator _orchestrator;

    private readonly string _inLocation;

    /// <summary>
    /// Configure the Data Conveyer process and create the orchestrator to execute it.
    /// </summary>
    /// <param name="inLocation">Folder that is searched for the *.xml input files.</param>
    /// <param name="outFile">Name of the delimited output file to receive the summary data.</param>
    internal FileProcessor(string inLocation, string outFile)
    {
        _inLocation = inLocation;

        //To facilitate troubleshooting logging, data can be sent to a DataConveyer.log file:
        //var config = new OrchestratorConfig(LoggerCreator.CreateLogger(LoggerType.LogFile, "AggregateTokens", LogEntrySeverity.Information))
        var config = new OrchestratorConfig()
        {
            GlobalCacheElements = new string[] { TokenSummaryKey },  //a single element - a dictionary - ConcurrentDictionary<string,(int count,int total)>
            InputDataKind = KindOfTextData.XML,
            IntakeReaders = () => Directory.GetFiles(_inLocation, "*.xml").Select(f => File.OpenText(f)),  //note that we're neglecting to dispose the stream readers here (not a production code)
            XmlJsonIntakeSettings = "RecordNode|Token,IncludeExplicitText|true",
            ExplicitTypeDefinitions = "__explicitText__|I",  //in our case, explicit text in Token node contains integer value
            ClusterMarker = (rec, pRec, n) => pRec == null || rec.SourceNo != pRec.SourceNo,  //each file (source) constitutes a cluster
            MarkerStartsCluster = true,  //predicate (marker) matches the first record in cluster
            AppendFootCluster = true,  //to contain summarized token data
            AllowOnTheFlyInputFields = true,
            ConcurrencyLevel = 4,
            TransformerType = TransformerType.Universal,
            UniversalTransformer = CumulateTokenData,
            AllowTransformToAlterFields = true,
            OutputDataKind = KindOfTextData.Delimited,
            HeadersInFirstOutputRow = true,
            OutputFileName = outFile
        };

        _orchestrator = OrchestratorCreator.GetEtlOrchestrator(config);
    }

    /// <summary>
    /// Execute Data Conveyer process. The orchestrator is disposed once the execution ends
    /// (whether it succeeded or threw), so this method is intended to be called only once.
    /// </summary>
    /// <returns>Task containing the process results.</returns>
    internal async Task<ProcessResult> ProcessFileAsync()
    {
        try
        {
            return await _orchestrator.ExecuteAsync();
        }
        finally
        {
            //Dispose also in case of exception, so that the orchestrator is never left undisposed
            _orchestrator.Dispose();
        }
    }


    /// <summary>
    /// Universal transformer to cumulate token data in global cache and remove cluster from output.
    /// In case of foot cluster, obtain summary data from global cache and prepare output.
    /// </summary>
    /// <param name="cluster">Cluster to process: either all records from a single file or the foot cluster.</param>
    /// <returns>Nothing (i.e. clusters are filtered out), except for the foot cluster (which contains summary data).</returns>
    private IEnumerable<ICluster> CumulateTokenData(ICluster cluster)
    {
        //A single element global cache containing a dictionary ConcurrentDictionary<string,(int count,int total)>, where
        // Key = color
        // Value = tuple consisting of count and total (i.e. cumulated __explicitText__)
        var gc = cluster.GlobalCache;

        gc.ReplaceValue(TokenSummaryKey, (ConcurrentDictionary<string, (int count, int total)> t) => t ?? new ConcurrentDictionary<string, (int count, int total)>());  //initialize the dictionary during the 1st pass

        var tokenSummary = (ConcurrentDictionary<string, (int count, int total)>)gc[TokenSummaryKey];

        if (cluster.StartRecNo == Constants.FootClusterRecNo)
        {  //Foot cluster
            //Note that Data Conveyer guarantees that foot cluster will be processed AFTER all other clusters have been processed.

            //Fill the foot cluster with summary data to output - one record per color
            foreach (var entry in tokenSummary)
            {
                var (count, total) = entry.Value;

                var footRec = cluster.ObtainEmptyRecord();
                footRec.AddItem("Color", entry.Key);
                footRec.AddItem("Count", count);
                footRec.AddItem("Total", total);
                //Invariant culture keeps the decimal point in the output file regardless of regional settings of the machine
                footRec.AddItem("Average", ((double)total / count).ToString("#0.0", System.Globalization.CultureInfo.InvariantCulture));

                cluster.AddRecord(footRec);
            }

            return Enumerable.Repeat(cluster, 1);
        }

        //Regular cluster - cumulate data in the global cache
        foreach (var rec in cluster.Records)
        {
            var color = (string)rec["color"];
            var value = (int)rec["__explicitText__"];

            //First token of a given color starts at (1, value); subsequent ones bump the count and add to the total
            tokenSummary.AddOrUpdate(color, (1, value), (c, t) => (t.count + 1, t.total + value));
        }

        return Enumerable.Empty<ICluster>();  //no data from regular cluster is sent to output
    }

}
}
54 changes: 54 additions & 0 deletions DataConveyer_AggregateTokens/Program.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
// Copyright © 2019 Mavidian Technologies Limited Liability Company. All Rights Reserved.

using Mavidian.DataConveyer.Common;
using Mavidian.DataConveyer.Orchestrators;
using System;
using System.Diagnostics;
using System.IO;

namespace DataConveyer_AggregateTokens
{
/// <summary>
/// Console host: displays basic information, runs the token aggregation process
/// and reports its outcome.
/// </summary>
internal class Program
{
    // Location of Data Conveyer input. It is anchored at the folder containing the executable
    // (and not at the current directory), so that it resolves to the same place no matter where
    // the application is launched from. Path.Combine is used, so that correct directory
    // separators are applied on every platform.
    private static readonly string InputFolder = Path.Combine(AppContext.BaseDirectory, "..", "..", "..", "Data");

    private static void Main()
    {
        var asmName = System.Reflection.Assembly.GetExecutingAssembly().GetName();
        var inputLocation = Path.GetFullPath(InputFolder);
        var outputFile = Path.Combine(inputLocation, "TokenAggregates.csv");
        Console.WriteLine($"{asmName.Name} v{asmName.Version} started execution on {DateTime.Now:MM-dd-yyyy a\\t hh:mm:ss tt}");
        Console.WriteLine($"DataConveyer library used: {ProductInfo.CurrentInfo.ToString()}");
        Console.WriteLine();
        Console.WriteLine("This application reads all XML files located in an input folder and aggregates tokens they contain.");
        Console.WriteLine();
        Console.WriteLine($"Input location : {inputLocation}");
        Console.WriteLine($"Output file: {outputFile}");
        Console.WriteLine();

        Console.WriteLine("Hit any key to start.");
        Console.ReadKey();

        Console.WriteLine("Processing started...");

        var processor = new FileProcessor(inputLocation, outputFile);

        var stopWatch = Stopwatch.StartNew();
        // GetAwaiter().GetResult() (unlike .Result) surfaces the original exception instead of an AggregateException wrapper
        var result = processor.ProcessFileAsync().GetAwaiter().GetResult();
        stopWatch.Stop();

        if (result.CompletionStatus == CompletionStatus.IntakeDepleted)
        {
            Console.WriteLine($"Processing completed in {stopWatch.Elapsed.TotalSeconds:##0.000}s; there were {result.RowsRead} tokens identified in {result.ClustersRead - 1} files.");  // -1 for foot cluster
        }
        else
        {
            Console.WriteLine($"Oops! Processing resulted in unexpected status of {result.CompletionStatus}");
        }
        Console.WriteLine();

        Console.WriteLine("Hit Enter key to exit.");
        Console.ReadLine();
    }

}
}
Loading

0 comments on commit 4772cef

Please sign in to comment.