# mnist

[dotnet/machinelearning-samples](https://github.com/dotnet/machinelearning-samples/tree/master/samples/csharp/getting-started/MulticlassClassification_MNIST)を参考にmnistをML.NETでやってみる。

In [1]:
#r "nuget:Microsoft.ML.Vision, 1.4.0"

In [18]:
using Microsoft.ML;
using Microsoft.ML.Data;
using System;
using System.Net;
using System.IO;
using System.IO.Compression;
using Microsoft.ML.Transforms;

## データの準備

### ダウンロード

`System.Net.WebClient`を使ってMNISTデータをダウンロードする。

In [91]:
var urlBase = "http://archive.ics.uci.edu/ml/machine-learning-databases/optdigits/";
var downloadDirectory = "./data/";
var mnistFiles = new List<string>(){"optdigits.tra", "optdigits.tes"};

using(var wc = new WebClient())
{
    foreach(var one in mnistFiles)
    {
        var url = urlBase + one;
        var downloadPath = downloadDirectory + one;
        wc.DownloadFile(url, downloadPath);  
    }
}

### データクラスの定義

In [92]:
// Input
class InputData
{
    [ColumnName("PixelValues")]
    [VectorType(64)]
    public float[] PixelValues;

    [LoadColumn(64)]
    public float Number;
}
// Output
class OutPutData
{
    [ColumnName("Score")]
    public float[] Score;
}

display("[InputData]");
display(new InputData());
display("[OutPutData]");
display(new OutPutData());

[InputData]

PixelValues,Number
<null>,0


[OutPutData]

Score
<null>


`IDataView`にデータをロードする。`IDataView`はLINQでいうところの`IEnumerable<T>`に相当する、ML.NETのデータの中心となる存在。 [参考](https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.idataview?view=ml-dotnet)

データは65の列をもつ。1-64列はそれぞれ、32x32のビットマップを4x4のブロックによって64個（8x8）に分割したうちの1つを示し、0-16の整数値を持つ。この整数値は各4x4ブロックのうちONのピクセルの数を示す。65列目はラベル（0-9の数字）である。

In [139]:
MLContext mlContext = new MLContext();

var projectDir = Path.GetFullPath(Environment.CurrentDirectory);
var trainDataPath = Path.Combine(projectDir,"data", "optdigits.tra");
var testDataPath = Path.Combine(projectDir,"data", "optdigits.tes");

// 訓練データのロード
IDataView trainData = mlContext.Data.LoadFromTextFile(
    path: trainDataPath,
    columns : new[]
    {
        new TextLoader.Column(nameof(InputData.PixelValues), DataKind.Single, 0, 63), //データの0-63列は4x4ブロックのONピクセルの数
        new TextLoader.Column(nameof(InputData.Number), DataKind.Single, 64) //64列は数値
    },
    hasHeader : false,
    separatorChar : ','
    );

// テストデータのロード
IDataView testData = mlContext.Data.LoadFromTextFile(
    path: testDataPath,
    columns: new[]
    {
        new TextLoader.Column(nameof(InputData.PixelValues), DataKind.Single, 0, 63),
        new TextLoader.Column(nameof(InputData.Number), DataKind.Single, 64)
    },
    hasHeader: false,
    separatorChar: ','
    );

var trainList = mlContext.Data.CreateEnumerable<InputData>(trainData, reuseRowObject:false);
var testList = mlContext.Data.CreateEnumerable<InputData>(testData, reuseRowObject:false);
display("[TrainData]");
display(trainList.Take(5));
display("[TestData]");
display(testList.Take(5));

[TrainData]

index,PixelValues,Number
0,"[ 0, 1, 6, 15, 12, 1, 0, 0, 0, 7 ... (54 more) ]",0
1,"[ 0, 0, 10, 16, 6, 0, 0, 0, 0, 7 ... (54 more) ]",0
2,"[ 0, 0, 8, 15, 16, 13, 0, 0, 0, 1 ... (54 more) ]",7
3,"[ 0, 0, 0, 3, 11, 16, 0, 0, 0, 0 ... (54 more) ]",4
4,"[ 0, 0, 5, 14, 4, 0, 0, 0, 0, 0 ... (54 more) ]",6


[TestData]

index,PixelValues,Number
0,"[ 0, 0, 5, 13, 9, 1, 0, 0, 0, 0 ... (54 more) ]",0
1,"[ 0, 0, 0, 12, 13, 5, 0, 0, 0, 0 ... (54 more) ]",1
2,"[ 0, 0, 0, 4, 15, 12, 0, 0, 0, 0 ... (54 more) ]",2
3,"[ 0, 0, 7, 15, 13, 1, 0, 0, 0, 8 ... (54 more) ]",3
4,"[ 0, 0, 0, 1, 11, 0, 0, 0, 0, 0 ... (54 more) ]",4


In [135]:
var dataProcessPipeline = 
    mlContext.
    Transforms.
    Conversion.
    MapValueToKey(
        outputColumnName: "Label",  // 出力列名
        inputColumnName: "Number",  // 入力列名
        keyOrdinality: ValueToKeyMappingEstimator.KeyOrdinality.ByValue).
    Append(mlContext.Transforms.Concatenate("Features", nameof(InputData.PixelValues)).AppendCacheCheckpoint(mlContext));

// public class TransformedData
// {
//     public float[] Features { get; set; }
// }

// var processedData = dataProcessPipeline.Fit(trainData).Transform(trainData);
// var featuresColumn = mlContext.Data.CreateEnumerable<TransformedData>(processedData, reuseRowObject: false);
// display(featuresColumn.First().Features);

var trainer = 
    mlContext.
    MulticlassClassification.
    Trainers.
    SdcaMaximumEntropy(labelColumnName: "Label", featureColumnName: "Features");

var trainingPipeline = 
    dataProcessPipeline.
    Append(trainer).
    Append(mlContext.Transforms.Conversion.MapKeyToValue("Number","Label"));

index,value
0,0
1,1
2,6
3,15
4,12
5,1
6,0
7,0
8,0
9,7


In [101]:
Console.WriteLine("=============== Training the model ===============");
ITransformer trainedModel = trainingPipeline.Fit(trainData);

Console.WriteLine("===== Evaluating Model's accuracy with Test data =====");
var predictions = trainedModel.Transform(testData);
var metrics = mlContext.MulticlassClassification.Evaluate(data:predictions, labelColumnName:"Number", scoreColumnName:"Score");

public static void PrintMultiClassClassificationMetrics(string name, MulticlassClassificationMetrics metrics)
{
    Console.WriteLine($"************************************************************");
    Console.WriteLine($"*    Metrics for {name} multi-class classification model   ");
    Console.WriteLine($"*-----------------------------------------------------------");
    Console.WriteLine($"    AccuracyMacro = {metrics.MacroAccuracy:0.####}, a value between 0 and 1, the closer to 1, the better");
    Console.WriteLine($"    AccuracyMicro = {metrics.MicroAccuracy:0.####}, a value between 0 and 1, the closer to 1, the better");
    Console.WriteLine($"    LogLoss = {metrics.LogLoss:0.####}, the closer to 0, the better");
    Console.WriteLine($"    LogLoss for class 1 = {metrics.PerClassLogLoss[0]:0.####}, the closer to 0, the better");
    Console.WriteLine($"    LogLoss for class 2 = {metrics.PerClassLogLoss[1]:0.####}, the closer to 0, the better");
    Console.WriteLine($"    LogLoss for class 3 = {metrics.PerClassLogLoss[2]:0.####}, the closer to 0, the better");
    Console.WriteLine($"************************************************************");
}

PrintMultiClassClassificationMetrics(trainer.ToString(), metrics);

===== Evaluating Model's accuracy with Test data =====
************************************************************
*    Metrics for Microsoft.ML.Trainers.SdcaMaximumEntropyMulticlassTrainer multi-class classification model   
*-----------------------------------------------------------
    AccuracyMacro = 0.9543, a value between 0 and 1, the closer to 1, the better
    AccuracyMicro = 0.9544, a value between 0 and 1, the closer to 1, the better
    LogLoss = 0.1806, the closer to 0, the better
    LogLoss for class 1 = 0.0785, the closer to 0, the better
    LogLoss for class 2 = 0.0504, the closer to 0, the better
    LogLoss for class 3 = 0.1554, the closer to 0, the better
************************************************************


In [89]:
private class InputData
{
    public float Feature1;
    [VectorType(3)]
    public float[] Feature2;
    public int Feature3;
}
private class TransformedData
{
    public float[] Features { get; set; }
}

var mlContext = new MLContext();
var samples = new List<InputData>()
{
    new InputData(){ Feature1 = 0.1f, Feature2 = new[]{ 1.1f, 2.1f,3.1f }, Feature3 = 1 },
    new InputData(){ Feature1 = 0.2f, Feature2 = new[]{ 1.2f, 2.2f,3.2f }, Feature3 = 2 },
};

var dataview = mlContext.Data.LoadFromEnumerable(samples);
var pipeline = 
    mlContext.
    Transforms.
    Conversion.
    ConvertType("Feature3", outputKind: DataKind.Single).
    Append(mlContext.Transforms.Concatenate("Features", new[]{ "Feature1", "Feature2", "Feature3" }));

var transformedData = pipeline.Fit(dataview).Transform(dataview);
var featuresColumn = mlContext.Data.CreateEnumerable<TransformedData>(transformedData, reuseRowObject: false);

foreach (var featureRow in featuresColumn)
{
    display(featureRow.Features);
    //Console.WriteLine(string.Join(" ", featureRow.Features));
}

index,value
0,0.1
1,1.1
2,2.1
3,3.1
4,1.0


index,value
0,0.2
1,1.2
2,2.2
3,3.2
4,2.0
