In [1]:
using System;
using System.IO;
using System.Linq;

In [2]:
// Locate the 03-SparkMLNET output folders relative to the notebook's working directory.
var solutionPath = Directory.GetParent("..").ToString();
var dataPath = Path.Join(solutionPath, "03-SparkMLNET", "output");
var dailyCountsPath = Path.Join(dataPath, "dailycounts");
var totalCountsPath = Path.Join(dataPath, "totalcounts");

// Echo both resolved paths so a wrong working directory is obvious right away.
foreach (var outputPath in new[] { dailyCountsPath, totalCountsPath })
{
    Console.WriteLine(outputPath);
}

C:\Users\lqdev\Development\Presentations\dotnetdataml042020\code\03-SparkMLNET\output\dailycounts
C:\Users\lqdev\Development\Presentations\dotnetdataml042020\code\03-SparkMLNET\output\totalcounts


In [3]:
/// <summary>
/// A counter identifier paired with its total number of entries.
/// </summary>
class Counter
{
    /// <summary>Identifier of the counter, kept as text.</summary>
    public string Id { get; set; }

    /// <summary>Total number of entries attributed to this counter.</summary>
    public int TotalEntries { get; set; }
}

In [4]:
// Load every total-counts CSV file into Counter rows, highest totals first.
// The result is materialized once with ToList(): `counters` is enumerated several
// times, and a lazy query would re-read and re-sort every file on each pass.
var counters =
    Directory.GetFiles(totalCountsPath)
        // Match the extension exactly so look-alikes (for example ".csvx") are skipped.
        .Where(path => string.Equals(Path.GetExtension(path), ".csv", StringComparison.OrdinalIgnoreCase))
        .SelectMany(path => File.ReadLines(path))
        // Skip blank lines instead of failing on the missing second field.
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .Select(line =>
        {
            // Field 0 is the counter id, field 1 the total entries.
            var data = line.Split(',');
            return new Counter
            {
                Id = data[0],
                // Machine-written data: parse independently of the current culture.
                TotalEntries = int.Parse(data[1], System.Globalization.CultureInfo.InvariantCulture)
            };
        })
        .OrderByDescending(counter => counter.TotalEntries)
        .ToList();

// Show the ten counters with the highest totals.
display(counters.Take(10));

index,Id,TotalEntries
0,101051865,2442
1,102051865,2442
2,100051865,2442
3,102009427,2296
4,100009427,2296
5,102009428,2296
6,101009427,2296
7,101009428,2296
8,100009426,2295
9,101009426,2295


In [5]:
#r "nuget:XPlot.Plotly"

In [6]:
using XPlot.Plotly;

In [7]:
// Histogram of the TotalEntries values across all loaded counters.
var counterDataHistogram = Chart.Plot(new Graph.Histogram
{
    x = counters.Select(c => c.TotalEntries)
});

display(counterDataHistogram);

In [8]:
// Bar chart with one bar per counter, in the order of `counters`.
// The x values are left unset; counter ids are supplied through `text` instead.
var counterDataChart = Chart.Plot(
    new Graph.Bar
    {
        y = counters.Select(c => c.TotalEntries),
        text = counters.Select(c => c.Id)
    },
    new Layout.Layout
    {
        xaxis = new Graph.Xaxis { tickangle = -45 }
    });

display(counterDataChart);

In [9]:
/// <summary>
/// One daily observation for a counter: its id, the day, that day's count,
/// and the coordinates carried on the row.
/// </summary>
class CounterData
{
    /// <summary>Identifier of the counter, kept as text.</summary>
    public string Id { get; set; }

    /// <summary>Day the count applies to.</summary>
    public DateTime Day { get; set; }

    /// <summary>Count recorded for <see cref="Day"/>.</summary>
    public float DailyCount { get; set; }

    /// <summary>Latitude value carried on the row.</summary>
    public float Latitude { get; set; }

    /// <summary>Longitude value carried on the row.</summary>
    public float Longitude { get; set; }
}

In [10]:
// Keep only the counters with more than 1000 total entries.
var topCounters = counters
    .Where(c => c.TotalEntries > 1000)
    .ToList();

// First and last elements of the filtered list, in the order `counters` provides.
var topCounter = topCounters.First();
var bottomCounter = topCounters.Last();

display(topCounter);
display(bottomCounter);

Id,TotalEntries
101051865,2442


Id,TotalEntries
100010019,1211


In [11]:
// Ids of the counters kept in `topCounters`, hashed so each row is filtered with a
// set lookup instead of a linear scan over the list.
var topCounterIds = topCounters.Select(counter => counter.Id).ToHashSet();

// Load the daily-count CSV rows that belong to one of the top counters.
// The result is materialized once with ToList(): `counterData` is enumerated more
// than once, and a lazy query would re-read and re-parse every file on each pass.
var counterData =
    Directory.GetFiles(dailyCountsPath)
        // Match the extension exactly so look-alikes (for example ".csvx") are skipped.
        .Where(path => string.Equals(Path.GetExtension(path), ".csv", StringComparison.OrdinalIgnoreCase))
        .SelectMany(path => File.ReadLines(path))
        // Skip blank lines instead of failing on the missing fields.
        .Where(line => !string.IsNullOrWhiteSpace(line))
        .Select(line => line.Split(','))
        // Filter on the id first so rows that will be discarded are never parsed.
        .Where(data => topCounterIds.Contains(data[0]))
        .Select(data => new CounterData
        {
            // Field order: id, day (yyyy-MM-dd), daily count, latitude, longitude.
            // Machine-written data: parse independently of the current culture, so a
            // decimal point is always a decimal point and the calendar is fixed.
            Id = data[0],
            Day = DateTime.ParseExact(data[1], "yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture),
            DailyCount = float.Parse(data[2], System.Globalization.CultureInfo.InvariantCulture),
            Latitude = float.Parse(data[3], System.Globalization.CultureInfo.InvariantCulture),
            Longitude = float.Parse(data[4], System.Globalization.CultureInfo.InvariantCulture)
        })
        .ToList();

Console.WriteLine(counterData.Count);

62932


In [12]:
// All daily rows whose id matches `topCounter`.
var topCounterData = counterData
    .Where(row => row.Id == topCounter.Id)
    .ToList();

// Show the first and last rows of that subset.
display(topCounterData.First());
display(topCounterData.Last());

Id,Day,DailyCount,Latitude,Longitude
101051865,2012-08-30 00:00:00Z,213,40.69981,-73.98589


Id,Day,DailyCount,Latitude,Longitude
101051865,2019-05-07 00:00:00Z,3115,40.69981,-73.98589


In [13]:
// All daily rows whose id matches `bottomCounter`.
var bottomCounterData = counterData
    .Where(row => row.Id == bottomCounter.Id)
    .ToList();

// Show the first and last rows of that subset.
display(bottomCounterData.First());
display(bottomCounterData.Last());

Id,Day,DailyCount,Latitude,Longitude
100010019,2016-11-21 00:00:00Z,0,40.72096,-73.96093


Id,Day,DailyCount,Latitude,Longitude
100010019,2020-03-15 00:00:00Z,2604,40.72096,-73.96093


In [14]:
// Daily counts over time for the first of the top counters; the trace is named after its id.
var firstCounter = new Graph.Scatter
{
    x = topCounterData.Select(row => row.Day),
    y = topCounterData.Select(row => row.DailyCount),
    name = topCounterData.First().Id
};

// Same trace shape for the last of the top counters.
var lastCounter = new Graph.Scatter
{
    x = bottomCounterData.Select(row => row.Day),
    y = bottomCounterData.Select(row => row.DailyCount),
    name = bottomCounterData.First().Id
};

// Plot both traces on one chart.
var myChart = Chart.Plot(new[] { firstCounter, lastCounter });
display(myChart)

In [15]:
// Split the rows in `bottomCounterData` into a training set (days after 2018-12-31
// and before 2020-01-01) and an evaluation set (days after 2019-12-31).
// Both sets are sorted by day so the series is chronological whatever order the
// rows arrived in. They are materialized so the filter is not re-run on every use.
var trainingData =
    bottomCounterData
        .Where(x => x.Day > new DateTime(2018,12,31) && x.Day < new DateTime(2020,1,1))
        .OrderBy(x => x.Day)
        .ToList();

var evaluationData =
    bottomCounterData
        .Where(x => x.Day > new DateTime(2019,12,31))
        .OrderBy(x => x.Day)
        .ToList();

Console.WriteLine($"Training data size: {trainingData.Count}");
Console.WriteLine($"Evaluation data size: {evaluationData.Count}");

Training data size: 365
Evaluation data size: 75


In [16]:
#r "nuget:Microsoft.ML"
#r "nuget:Microsoft.ML.TimeSeries"

In [17]:
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms.TimeSeries;

In [18]:
// ML.NET context, created with its default arguments.
var ctx = new MLContext();

In [19]:
// Expose the in-memory training and evaluation rows as data views.
var trainingDv = ctx.Data.LoadFromEnumerable(trainingData);
var evaluationDv = ctx.Data.LoadFromEnumerable(evaluationData);

In [20]:
// SSA forecasting estimator: reads "DailyCount" and writes the forecast plus lower
// and upper confidence bounds into three named output columns.
var forecastingPipeline = ctx.Forecasting.ForecastBySsa(
    // Column wiring.
    inputColumnName: "DailyCount",
    outputColumnName: "ForecastedCount",
    confidenceLowerBoundColumn: "LowerBoundCount",
    confidenceUpperBoundColumn: "UpperBoundCount",
    confidenceLevel: 0.95f,
    // Series configuration; the training size is the number of training rows.
    windowSize: 7,
    seriesLength: 30,
    trainSize: trainingData.Count(),
    horizon: 7);

In [21]:
// Fit the forecasting pipeline on the training data view.
var model = forecastingPipeline.Fit(trainingDv);

In [22]:
// Apply the fitted model to the evaluation data view.
var evaluationForecast = model.Transform(evaluationDv);

In [23]:
// Read the "Day" and "DailyCount" columns back out of the transformed evaluation data.
var dates = evaluationForecast.GetColumn<DateTime>("Day");
var actual = evaluationForecast.GetColumn<float>("DailyCount");

In [24]:
// Each forecast column holds an array per row; keep only element 0 of each array.
var forecast = evaluationForecast
    .GetColumn<float[]>("ForecastedCount")
    .Select(values => values[0]);

// The lower bound is clamped so it never drops below zero.
var lowerBound = evaluationForecast
    .GetColumn<float[]>("LowerBoundCount")
    .Select(values => Math.Max(0f, values[0]));

var upperBound = evaluationForecast
    .GetColumn<float[]>("UpperBoundCount")
    .Select(values => values[0]);

In [25]:
// Print the number of actual values and forecast values so the two can be compared.
Console.WriteLine($"Actual Count: {actual.Count()}");
Console.WriteLine($"Forecast Count: {forecast.Count()}");

Actual Count: 75
Forecast Count: 75


In [26]:
// Overlay the actual counts, the forecast, and both confidence bounds on a shared date axis.
var comparisonChart = Chart.Plot(new[]
{
    new Graph.Scatter { x = dates, y = actual, name = "Actual" },
    new Graph.Scatter { x = dates, y = forecast, name = "Forecast" },
    new Graph.Scatter { x = dates, y = lowerBound, name = "Lower Bound" },
    new Graph.Scatter { x = dates, y = upperBound, name = "Upper Bound" }
});

display(comparisonChart);

In [27]:
// Per-row error: actual value minus forecast value.
var metrics = actual.Zip(forecast, (observed, predicted) => observed - predicted);

// Mean Absolute Error
var MAE = metrics.Average(delta => Math.Abs(delta));

// Root Mean Squared Error
var RMSE = Math.Sqrt(metrics.Average(delta => Math.Pow(delta, 2)));

Console.WriteLine($"MAE:{MAE} | RMSE:{RMSE}");

MAE:289.70346 | RMSE:390.325856145608


In [28]:
/// <summary>
/// Output row for a forecast: the observed count plus the forecast values and
/// their lower and upper bounds.
/// </summary>
class CounterForecast
{
    /// <summary>Observed daily count carried on the row.</summary>
    public float DailyCount { get; set; }

    /// <summary>Forecast values.</summary>
    public float[] ForecastedCount { get; set; }

    /// <summary>Lower bound for each forecast value.</summary>
    public float[] LowerBoundCount { get; set; }

    /// <summary>Upper bound for each forecast value.</summary>
    public float[] UpperBoundCount { get; set; }
}

In [29]:
// Create a time-series engine from the fitted model: CounterData rows in, CounterForecast rows out.
var predEngine = model.CreateTimeSeriesEngine<CounterData,CounterForecast>(ctx);

In [30]:
// Pick the evaluation row with the latest day as the input for the forecast.
var forecastInput = evaluationData
    .OrderByDescending(row => row.Day)
    .First();
forecastInput

Id,Day,DailyCount,Latitude,Longitude
100010019,2020-03-15 00:00:00Z,2604,40.72096,-73.96093


In [31]:
// Run the prediction engine on the selected row; the trailing expression shows the result as cell output.
// NOTE(review): only this single observation is passed to the engine before forecasting —
// confirm that not feeding it the earlier evaluation rows is intended.
var futureForecast = predEngine.Predict(forecastInput);
futureForecast

DailyCount,ForecastedCount,LowerBoundCount,UpperBoundCount
2604,"[ 2520.617, 2535.3936, 2532.2236, 2525.2344, 2522.3835, 2520.4834, 2515.917 ]","[ 1388.4159, 1361.5823, 1310.5896, 1258.9706, 1218.3303, 1175.3291, 1132.293 ]","[ 3652.818, 3709.2048, 3753.8577, 3791.4983, 3826.4368, 3865.6377, 3899.541 ]"


In [32]:
// The seven days that follow the input row's day, used as the shared x axis.
var futureDates = Enumerable
    .Range(1, 7)
    .Select(offset => forecastInput.Day.AddDays(offset));

// Plot the forecast together with its lower and upper bounds.
var futureForecastPlot = Chart.Plot(new[]
{
    new Graph.Scatter { x = futureDates, y = futureForecast.ForecastedCount, name = "Forecast" },
    new Graph.Scatter { x = futureDates, y = futureForecast.LowerBoundCount, name = "Lower Bound" },
    new Graph.Scatter { x = futureDates, y = futureForecast.UpperBoundCount, name = "Upper Bound" }
});

display(futureForecastPlot)