# Statistics for Performance Engineers I: Descriptive Statistics and Visualization

The following is the accompanying notebook for my learning paper: "Statistics For Performance Engineers I: Descriptive Statistics and Visualization".

In [None]:
#r "nuget: Newtonsoft.Json"
#r "nuget: XPlot.Plotly"
#r "nuget: XPlot.Plotly.Interactive"
#r "nuget: BenchmarkDotNet"
#r "nuget: MathNet.Numerics"

using Newtonsoft.Json;
using XPlot.Plotly;
using System.IO;
using MathNet.Numerics;
using MathNet.Numerics.Distributions;
using MathNet.Numerics.Statistics;

Loading extensions from `XPlot.Plotly.Interactive.dll`

Configuring PowerShell Kernel for XPlot.Plotly integration.

Installed support for XPlot.Plotly.

## Why Should A Performance Engineer Learn About Statistics?

In [None]:
/// <summary>
/// Type of the <c>error_y</c> property on <see cref="BenchmarkData"/>.
/// NOTE(review): the lower-case member names presumably mirror the keys of the
/// benchmark JSON file so that deserialization binds by name — confirm against the file.
/// </summary>
public class ErrorY
{
    /// <summary>Kind of error value as recorded in the data; its allowed values are not defined in this notebook.</summary>
    public string type { get; set; }
    /// <summary>Error magnitudes; presumably one entry per point of the owning series — TODO confirm.</summary>
    public List<double> array { get; set; }
    /// <summary>Visibility flag carried over from the data.</summary>
    public bool visible { get; set; }
}

/// <summary>
/// One benchmark series. The notebook deserializes the benchmark JSON file into a
/// list of these and works with the first entry.
/// NOTE(review): the lower-case member names presumably mirror the JSON keys — confirm against the file.
/// </summary>
public class BenchmarkData 
{
    /// <summary>Date/time values of the series; presumably when each measurement was taken — TODO confirm.</summary>
    public List<DateTime> x { get; set; }
    /// <summary>Numeric values of the series; these are the values the notebook normalizes and plots.</summary>
    public List<double> y { get; set; }
    /// <summary>Presumably the runtime repository commit hash per measurement — verify against the data.</summary>
    public List<string> gitHashruntime { get; set; }
    /// <summary>Name of the series.</summary>
    public string name { get; set; }
    /// <summary>Error information attached to the series.</summary>
    public ErrorY error_y { get; set; }
    /// <summary>Presumably the performance repository commit hash per measurement — verify against the data.</summary>
    public List<string> perfRepoHash { get; set; }
    /// <summary>Visibility setting stored as a string (unlike <c>ErrorY.visible</c>, which is a bool).</summary>
    public string visible { get; set; }
}

In [None]:
// Read the Double.Max benchmark file from the local Data folder and parse it
// into the list of series it contains.
string doubleMaxJson = File.ReadAllText(Path.Combine("./Data", "Double.Max_Windows10_x64.json"));
List<BenchmarkData> doubleMax = JsonConvert.DeserializeObject<List<BenchmarkData>>(doubleMaxJson);

// The rest of the notebook works with the first series only.
BenchmarkData benchmarkResults = doubleMax[0];

### 1. Comparing Real-World vs. Theoretical Distributions

In [None]:
List<double> y = benchmarkResults.y;

// Compute the mean and standard deviation once up front. Evaluating them inside
// the projection would rescan the whole list for every single element.
double yMean = y.Average();
double yStandardDeviation = y.StandardDeviation();

// Z-score normalization: (value - mean) / standard deviation, materialized so the
// histogram works on a fixed set of values.
IEnumerable<double> normalizedY = y.Select(d => (d - yMean) / yStandardDeviation).ToList();

var layout = new Layout.Layout
{
    title = "Comparing A Real World Distribution with the Normal Distribution"
};

var doubleMaxHistogram = new XPlot.Plotly.Histogram
{
    x = normalizedY,
    name = "ZScore Normalized Benchmark: Double.Max" 
};

// Reference series: 1000 draws from the standard normal distribution, materialized
// so the trace holds one fixed sample rather than a lazy generator.
var normalHistogram = new XPlot.Plotly.Histogram
{
    x = Normal.Samples(mean: 0, stddev: 1).Take(1000).ToArray(),
    name = "Standard Normal Distribution"
};

Chart.Plot(new [] { doubleMaxHistogram, normalHistogram }, layout)

### 2. Knowing when you need more data, or if your experiment needs to be changed 

In [None]:
// Materialize the draws exactly once. Normal.Samples is a lazy generator, so each
// enumeration of the un-materialized sequence would yield a different random sample,
// and the printed standard error would not describe the data shown in the histogram.
double[] lowSampleSizeNormalDistribution = Normal.Samples(mean: 0, stddev: 1).Take(10).ToArray();

var layout = new Layout.Layout
{
    title = "Low Sample Size Normal Distribution (n = 10)"
};

var lowSampleSizeNormalHistogram = new XPlot.Plotly.Histogram
{
    x = lowSampleSizeNormalDistribution, 
};

// Standard error of the mean: sample standard deviation / sqrt(sample size).
Console.WriteLine($"Standard Error: {lowSampleSizeNormalDistribution.StandardDeviation() / Math.Sqrt(lowSampleSizeNormalDistribution.Length)}");
Chart.Plot(lowSampleSizeNormalHistogram, layout)

Standard Error: 0.26456271316902447


In [None]:
// Materialize the draws exactly once. Normal.Samples is a lazy generator, so each
// enumeration of the un-materialized sequence would yield a different random sample,
// and the printed standard error would not describe the data shown in the histogram.
double[] highSampleSizeNormalDistribution = Normal.Samples(mean: 0, stddev: 1).Take(10000).ToArray();

var layout = new Layout.Layout
{
    title = "High Sample Size Normal Distribution (n = 10,000)"
};

var highSampleSizeNormalHistogram = new XPlot.Plotly.Histogram
{
    x = highSampleSizeNormalDistribution, 
};

// Standard error of the mean: sample standard deviation / sqrt(sample size).
Console.WriteLine($"Standard Error: {highSampleSizeNormalDistribution.StandardDeviation() / Math.Sqrt(highSampleSizeNormalDistribution.Length)}");
Chart.Plot(highSampleSizeNormalHistogram, layout)

Standard Error: 0.010028870777494415


## Descriptive Statistics

### Sample Size

In [None]:
// For each sample size from 1 up to n - 1, draw that many standard-normal values
// and record the sample mean next to the size it was computed from.
int n = 1000;
List<(double mean, double sampleSize)> data = Enumerable.Range(1, n - 1)
    .Select(size => (
        mean: Normal.Samples(mean: 0, stddev: 1).Take(size).Mean(),
        sampleSize: (double)size))
    .ToList();

var layout = new Layout.Layout
{
    title = "How Sample Size Increases Affect the Mean",
    xaxis = new Xaxis { title = "Number of Samples (n)" },
    yaxis = new Yaxis { title = "Mean" },
};

// Sample size on the horizontal axis, observed mean on the vertical axis.
var scatter = new Scatter
{
    x = data.Select(point => point.sampleSize),
    y = data.Select(point => point.mean),
};

Chart.Plot(scatter, layout)

### Minimum, Maximum and Range

In [None]:
// The original cell re-plotted the mean-vs-sample-size data under this title.
// This version plots what the title announces: the minimum, maximum and range of
// equally sized normal samples as their variance grows.
int samplesPerSpread = 1000;
List<(double variance, double min, double max)> spreadSummary = new();

for (int stdDev = 1; stdDev <= 20; stdDev++)
{
    // Materialize so Min and Max are computed over the same draw.
    double[] samples = Normal.Samples(mean: 0, stddev: stdDev).Take(samplesPerSpread).ToArray();
    spreadSummary.Add((variance: stdDev * stdDev, min: samples.Min(), max: samples.Max()));
}

var layout = new Layout.Layout
{
    title = "How Variance Affects Min., Max. and the Range",
    xaxis = new Xaxis { title = "Variance" },
    yaxis = new Yaxis { title = "Value" }
};

var minScatter = new Scatter
{
    x = spreadSummary.Select(s => s.variance),
    y = spreadSummary.Select(s => s.min),
    name = "Min."
};

var maxScatter = new Scatter
{
    x = spreadSummary.Select(s => s.variance),
    y = spreadSummary.Select(s => s.max),
    name = "Max."
};

// Range = max - min of the same sample.
var rangeScatter = new Scatter
{
    x = spreadSummary.Select(s => s.variance),
    y = spreadSummary.Select(s => s.max - s.min),
    name = "Range"
};

Chart.Plot(new [] { minScatter, maxScatter, rangeScatter }, layout)

### 5 Number Summary via a BoxPlot

In [None]:
var layout = new Layout.Layout { title = "Box Plots" };

// Two normal samples with the same spread but different centers (0 and 4).
// The first is materialized into a list, the second stays a lazy sequence.
var centeredAtZero = Normal.Samples(mean: 0, stddev: 1).Take(10_000).ToList();
var centeredAtFour = Normal.Samples(mean: 4, stddev: 1).Take(10_000);

var box1 = new Box { y = centeredAtZero, name = "Box Plot 1" };
var box2 = new Box { y = centeredAtFour, name = "Box Plot 2" };

Chart.Plot(new [] { box1, box2 }, layout)

### Mode

#### Unimodal

In [None]:
var layout = new Layout.Layout { title = "Unimodal Distribution" };

// Histogram of 10,000 draws from a single standard normal distribution.
var unimodal = new XPlot.Plotly.Histogram
{
    x = Normal.Samples(mean: 0, stddev: 1).Take(10_000),
};

Chart.Plot(unimodal, layout)

#### Bimodal

In [None]:
// Pool two equally sized normal samples centered at 0 and at 4 into one list.
List<double> pooledSamples = Normal.Samples(mean: 0, stddev: 1).Take(10_000)
    .Concat(Normal.Samples(mean: 4, stddev: 1).Take(10_000))
    .ToList();

var layout = new Layout.Layout { title = "Bimodal Distribution" };

var bimodal = new XPlot.Plotly.Histogram
{
    x = pooledSamples,
};

Chart.Plot(bimodal, layout)

#### Multimodal

In [None]:
// Pool three equally sized normal samples centered at 0, 4 and -4 into one list,
// in the same order the original appended them.
List<double> pooledSamples = Normal.Samples(mean: 0, stddev: 1).Take(10_000)
    .Concat(Normal.Samples(mean: 4, stddev: 1).Take(10_000))
    .Concat(Normal.Samples(mean: -4, stddev: 1).Take(10_000))
    .ToList();

var layout = new Layout.Layout { title = "Multimodal Distribution" };

var multimodal = new XPlot.Plotly.Histogram
{
    x = pooledSamples,
};

Chart.Plot(multimodal, layout)

### Standard Deviation and Variance

In [None]:
var layout = new Layout.Layout
{
    title = "High Spread vs. Low Spread Distribution",
    barmode = "stack",
};

// Same mean, very different standard deviations: 10 versus 0.1.
var highSpreadHistogram = new XPlot.Plotly.Histogram
{
    x = Normal.Samples(mean: 0, stddev: 10).Take(10000),
    name = "High Spread",
};

var lowSpreadHistogram = new XPlot.Plotly.Histogram
{
    x = Normal.Samples(mean: 0, stddev: 0.1).Take(10000),
    name = "Low Spread",
};

Chart.Plot(new [] { highSpreadHistogram, lowSpreadHistogram }, layout)

### Skewness

#### Right Skewness

In [None]:
var layout = new Layout.Layout { title = "Right Skewed Distribution" };

// Squaring standard-normal draws makes every value non-negative and stretches
// the large magnitudes, which produces a long tail to the right.
var rightSkewed = Normal.Samples(mean: 0, stddev: 1)
    .Take(10_000)
    .Select(z => z * z);

var rightSkewedHistogram = new XPlot.Plotly.Histogram
{
    x = rightSkewed,
};

Chart.Plot(rightSkewedHistogram, layout)

#### Left Skewness

In [None]:
var normalDistribution = Normal.Samples(mean: 0, stddev: 1).Take(10000);

// Math.Log returns NaN for negative inputs, and roughly half of all standard-normal
// draws are negative. Taking the log of the magnitude keeps every draw usable while
// preserving the long left tail (values near zero map to large negative logs).
// Exact zeros are skipped because Math.Log(0) is negative infinity.
var leftSkew = normalDistribution
    .Select(d => Math.Abs(d))
    .Where(magnitude => magnitude > 0)
    .Select(magnitude => Math.Log(magnitude));

var layout = new Layout.Layout
{
    title = "Left Skewed Distribution"
};

var leftSkewedHistogram = new XPlot.Plotly.Histogram
{
    x = leftSkew, 
};

Chart.Plot(leftSkewedHistogram, layout)

#### Zero Skewness

In [None]:
var layout = new Layout.Layout { title = "Zero Skewed Distribution" };

// Untransformed standard-normal draws: symmetric around the mean.
var zeroSkewedDistribution = new XPlot.Plotly.Histogram
{
    x = Normal.Samples(mean: 0, stddev: 1).Take(10_000),
};

Chart.Plot(zeroSkewedDistribution, layout)

### Kurtosis

In [None]:
// Reference: the normal distribution has zero excess kurtosis.
var stdNormalDistribution = Normal.Samples(mean: 0, stddev: 1).Take(100_000);

// A normal distribution has zero excess kurtosis regardless of its standard
// deviation, so shrinking stddev does not make it leptokurtic. The Laplace
// distribution has positive excess kurtosis (heavier tails, sharper peak).
// Scale 1/sqrt(2) gives it unit variance, matching the standard normal above.
var leptokurticDistribution = Laplace.Samples(0.0, 1.0 / Math.Sqrt(2.0)).Take(100_000);

// The continuous uniform distribution has negative excess kurtosis (no tails).
var platykurticDistribution = ContinuousUniform.Samples(lower: -3, upper: 3).Take(100_000);

var layout = new Layout.Layout
{
    title = "Different Types of Kurtosis",
    showlegend = true,
};

var zeroKurtosis = new XPlot.Plotly.Histogram
{
    x = stdNormalDistribution,
    name = "Zero Kurtosis",

};

var leptokurtic = new XPlot.Plotly.Histogram
{
    x = leptokurticDistribution,
    name = "Leptokurtic - Positive Kurtosis",
};

var platykurtic = new XPlot.Plotly.Histogram
{
    x = platykurticDistribution, 
    name = "Platykurtic - Negative Kurtosis",
};

Chart.Plot(new [] { zeroKurtosis, leptokurtic, platykurtic }, layout)

## Visualization

### Line Chart

In [None]:
var layout = new Layout.Layout { title = "Comparative Line Chart" };

// Each series spans the same five consecutive days: 6 days ago through 2 days ago.
DateTime[] x0 = Enumerable.Range(-6, 5).Select(offset => DateTime.Today.AddDays(offset)).ToArray();
// One chi-squared draw (5 degrees of freedom) per date.
var y0 = ChiSquared.Samples(5).Take(x0.Length);

var scatter0 = new Scatter { x = x0, y = y0 };

DateTime[] x1 = Enumerable.Range(-6, 5).Select(offset => DateTime.Today.AddDays(offset)).ToArray();
var y1 = ChiSquared.Samples(5).Take(x1.Length);

var scatter1 = new Scatter { x = x1, y = y1 };

Chart.Plot(new [] { scatter0, scatter1 }, layout)

### Area Chart 

In [None]:
var layout = new Layout.Layout { title = "Comparative Area Chart" };

// Each series spans the same five consecutive days: 6 days ago through 2 days ago.
DateTime[] x0 = Enumerable.Range(-6, 5).Select(offset => DateTime.Today.AddDays(offset)).ToArray();
// One chi-squared draw (5 degrees of freedom) per date.
var y0 = ChiSquared.Samples(5).Take(x0.Length);

// First series is filled down to the zero line.
var scatter0 = new Scatter
{
    name = "Series 1",
    x = x0,
    y = y0,
    fill = "tozeroy",
};

DateTime[] x1 = Enumerable.Range(-6, 5).Select(offset => DateTime.Today.AddDays(offset)).ToArray();
var y1 = ChiSquared.Samples(5).Take(x1.Length);

// Second series is filled towards the previous trace.
var scatter1 = new Scatter
{
    name = "Series 2",
    x = x1,
    y = y1,
    fill = "tonexty",
};

Chart.Plot(new [] { scatter0, scatter1 }, layout)

### Bubble Chart

In [None]:
// Bubble chart: a marker-only scatter whose marker sizes carry a third dimension.
// The first series uses a per-point size array.
var trace1 = new Scatter
{
    x = new [] { 1,2,3,4 },
    y = new [] { 10, 11, 12, 13 },
    name = "Series 1", // the missing comma here previously made the cell fail to compile
    mode = "markers",
    marker = new Marker
    {
        color = "red",
        size = new [] { 12, 22, 32, 42 }
    }
};

// The second series uses one constant size for all of its points.
var trace2 = new Scatter 
{
    x = new [] { 1,2,3,4 },
    y = new [] { 10, 11, 12, 16 },
    name = "Series 2", // named so the legend labels both series consistently
    mode = "markers",
    marker = new Marker
    {
        color = "blue",
        size = 18, 
    }
};

Chart.Plot(new [] { trace1, trace2 })

### Bar Chart

#### Unstacked

In [None]:
// Result counts per category for the two experiments being compared.
List<(string category, double results)> categoricalResults_Experiment1 = new()
{
    (category: "Regressions", results: 23),
    (category: "Improvements", results: 10),
    (category: "Stale", results: 5),
};

List<(string category, double results)> categoricalResults_Experiment2 = new()
{
    (category: "Regressions", results: 10),
    (category: "Improvements", results: 23),
    (category: "Stale", results: 5),
};

// No barmode is set, so the library's default bar arrangement applies.
var layout = new Layout.Layout
{
    title = "Bar Chart Comparing Regressions vs. Improvements vs. Stale Results",
};

// One bar trace per experiment: categories on x, counts on y.
var barChart_Experiment1 = new Bar
{
    name = "Experiment 1",
    x = categoricalResults_Experiment1.Select(entry => entry.category),
    y = categoricalResults_Experiment1.Select(entry => entry.results),
};

var barChart_Experiment2 = new Bar
{
    name = "Experiment 2",
    x = categoricalResults_Experiment2.Select(entry => entry.category),
    y = categoricalResults_Experiment2.Select(entry => entry.results),
};

Chart.Plot(new [] { barChart_Experiment1, barChart_Experiment2 }, layout)


#### Stacked

In [None]:
// Result counts per category for the two experiments being compared.
List<(string category, double results)> categoricalResults_Experiment1 = new()
{
    (category: "Regressions", results: 23),
    (category: "Improvements", results: 10),
    (category: "Stale", results: 5),
};

List<(string category, double results)> categoricalResults_Experiment2 = new()
{
    (category: "Regressions", results: 10),
    (category: "Improvements", results: 23),
    (category: "Stale", results: 5),
};

// barmode "stack" places the experiments' bars on top of each other per category.
var layout = new Layout.Layout
{
    barmode = "stack",
    title = "Bar Chart Comparing Regressions vs. Improvements vs. Stale Results - Stacked",
};

// One bar trace per experiment: categories on x, counts on y.
var barChart_Experiment1 = new Bar
{
    name = "Experiment 1",
    x = categoricalResults_Experiment1.Select(entry => entry.category),
    y = categoricalResults_Experiment1.Select(entry => entry.results),
};

var barChart_Experiment2 = new Bar
{
    name = "Experiment 2",
    x = categoricalResults_Experiment2.Select(entry => entry.category),
    y = categoricalResults_Experiment2.Select(entry => entry.results),
};

Chart.Plot(new [] { barChart_Experiment1, barChart_Experiment2 }, layout)


### Scatterplot

In [None]:
// Number of points in each series; also the length of the shared 0-based index axis.
int pointCount = 100;

var layout = new Layout.Layout { title = "Scatter of 2 different variables" };

// Two zero-mean normal variables with standard deviations 1 and 19.
var first = Normal.Samples(mean: 0, stddev: 1).Take(pointCount);
var second = Normal.Samples(mean: 0, stddev: 19).Take(pointCount);

var scatter1 = new Scatter
{
    name = "Scatter - 1",
    mode = "markers",
    x = Enumerable.Range(0, pointCount),
    y = first,
};

var scatter2 = new Scatter
{
    name = "Scatter - 2",
    mode = "markers",
    x = Enumerable.Range(0, pointCount),
    y = second,
};

Chart.Plot(new [] { scatter1, scatter2 }, layout)

### Heat Map

In [None]:
var layout = new Layout.Layout { title = "Example Heat Map" };

// 3 x 3 symmetric grid of values with 1 on the diagonal; each inner list is one row.
var values = new List<List<double>>(5)
{
    new() { 1, 20, 30 },
    new() { 20, 1, 60 },
    new() { 30, 60, 1 },
};

var heatMap = new XPlot.Plotly.Heatmap { z = values };

Chart.Plot(heatMap, layout)

In [None]:
#!about

0,1
,.NET Interactive© 2020 Microsoft CorporationVersion: 1.0.345202+1f7cb23c53cee7fedf47418a0b0321090034af16Library version: 1.0.0-beta.22452.2+1f7cb23c53cee7fedf47418a0b0321090034af16Build date: 2022-09-05T21:14:08.9219906Zhttps://github.com/dotnet/interactive
