# Noise analysis

Analyzing the noise level of the data.

## Load packages:

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

% matplotlib notebook

## Read & clean the data:

In [None]:
# Paths of the reference measurements, indexed as files[sample][run]:
# 4 samples (Sample1..Sample4), each measured in 3 runs (Run1..Run3).
files = [['ReferenceData/Sample%dRun%d.csv' % (sample_nr, run_nr)
          for run_nr in range(1, 4)]
         for sample_nr in range(1, 5)]

In [None]:
# Containers indexed as [sample][run]. Every placeholder 0 is overwritten in
# the loop below by a NumPy array holding one column of the matching CSV file.
time = [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
SiBar = [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
AlBar = [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]

for i in range(4):
    for j in range(3):
        # The files are ';'-separated. header = 0 combined with names makes
        # pandas replace the header row of the file by the names given here.
        data = pd.read_csv(files[i][j], delimiter = ';', header = 0, names = ['time', 'SiBar', 'AlBar'])
        # Drop every row that has a missing value in any of the three columns:
        data = data.dropna(how = 'any')
        # DataFrame.as_matrix() is deprecated and was removed in pandas 1.0;
        # .values yields the same NumPy array on both old and new versions.
        data_np = data.values
        time[i][j] = data_np[:, 0]
        SiBar[i][j] = data_np[:, 1]
        AlBar[i][j] = data_np[:, 2]

## Calculate the average concentration:

In [None]:
# Boolean masks, one per [sample][run]: True where the measurement time of that
# run also occurs in BOTH other runs of the same sample.
index = [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
for i in range(4):
    for j in range(3):
        in_next_run = np.in1d(time[i][j], time[i][(j + 1)%3])
        in_other_run = np.in1d(time[i][j], time[i][(j + 2)%3])
        # Both masks are boolean, so & is the element-wise AND of the two.
        index[i][j] = in_next_run & in_other_run

# Run-averaged concentrations per sample, evaluated at the common times only:
timeAv = [0, 0, 0, 0]
SiAv = [0, 0, 0, 0]
AlAv = [0, 0, 0, 0]
for i in range(4):
    # The common measurement times, taken from the first run of the sample.
    timeAv[i] = time[i][0][index[i][0]]
    n_common = len(timeAv[i])

    # Accumulate one third of every run, in run order, starting from zero.
    si_mean = np.zeros(n_common)
    al_mean = np.zeros(n_common)
    for j in range(3):
        mask = index[i][j]
        si_mean = si_mean + SiBar[i][j][mask]/3
        al_mean = al_mean + AlBar[i][j][mask]/3
    SiAv[i] = si_mean
    AlAv[i] = al_mean

## Plot silicium data:

In [None]:
# One figure per sample: the raw Si measurements of every run as pixel markers
# and the run-averaged Si curve drawn on top as a thick line.
for sample in range(4):
    plt.figure()
    for run in range(3):
        run_label = 'Run ' + str(run + 1)
        plt.plot(time[sample][run], SiBar[sample][run], ',', label=run_label)
    plt.plot(timeAv[sample], SiAv[sample], linewidth=2.5, label='Average value')
    plt.title('Data from sample ' + str(sample + 1))
    plt.xlabel('t')
    plt.ylabel('Si(t)')
    plt.legend(loc=4)

## Plot Aluminum data:

In [None]:
# One figure per sample: the raw Al measurements of every run as pixel markers
# and the run-averaged Al curve drawn on top as a thick line.
for i in range(4):
    plt.figure()
    for j in range(3):
        plt.plot(time[i][j], AlBar[i][j], ',', label = 'Run ' + str(j + 1))
    plt.plot(timeAv[i], AlAv[i], linewidth = 2.5, label = 'Average value')
    plt.title('Data from sample ' + str(i + 1))
    plt.xlabel('t')
    # These figures show the Aluminum data, so the axis is Al(t), not Si(t).
    plt.ylabel('Al(t)')
    plt.legend(loc = 4)

## Plot Silicium and Aluminum data:

In [None]:
# One figure per sample with both elements: Si in red, Al in blue. Raw runs
# are pixel markers, the run-averaged curves are thick solid lines.
for sample in range(4):
    plt.figure()
    for run in range(3):
        t_run = time[sample][run]
        plt.plot(t_run, SiBar[sample][run], 'r,')
        plt.plot(t_run, AlBar[sample][run], 'b,')
    t_common = timeAv[sample]
    plt.plot(t_common, SiAv[sample], 'r', linewidth=2.5)
    plt.plot(t_common, AlAv[sample], 'b', linewidth=2.5)
    plt.title('Data from sample ' + str(sample + 1))
    plt.xlabel('t')
    plt.ylabel('Si(t) (red) & Al(t) (blue)')

## Calculate standard deviation for the Silicium concentrations:

In [None]:
varSi = [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
stdSi = [0, 0, 0, 0]
for i in range(4):
    for j in range(3):
        varSi[i][j] = np.sum((SiBar[i][j][index[i][j]] - SiAv[i])**2)/len(SiAv[i])
    stdSi[i] = sum(np.sqrt(varSi[i]))/3
stdSiAv = sum(stdSi)/4
stdSiMax = max(stdSi)

print 'Standard deviation of the Silicium concentrations per sample:'
print '\t\t', stdSi
print '=> Average standard deviation of the Silicium concentrations:', stdSiAv
print '=> Maximum standard deviation of the Silicium concentrations:', stdSiMax

## Plot deviation from the mean for the Silicium concentrations:

In [None]:
# One figure per sample: the deviation of every run from the run-averaged Si
# curve at the common measurement times; the title shows the sample's std.
for sample in range(4):
    plt.figure()
    for run in range(3):
        mask = index[sample][run]
        deviation = SiBar[sample][run][mask] - SiAv[sample]
        plt.plot(time[sample][run][mask], deviation, ',', label='Run ' + str(run + 1))
    plt.title('Data from sample ' + str(sample + 1) + ': std = ' + str(stdSi[sample]))
    plt.xlabel('t')
    plt.ylabel('Si(t)')
    plt.legend(loc=4)

## Calculate standard deviation for the Aluminum concentrations:

In [None]:
varAl = [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
stdAl = [0, 0, 0, 0]
for i in range(4):
    for j in range(3):
        varAl[i][j] = np.sum((AlBar[i][j][index[i][j]] - AlAv[i])**2)/len(AlAv[i])
    stdAl[i] = sum(np.sqrt(varAl[i]))/3
stdAlAv = sum(stdAl)/4
stdAlMax = max(stdAl)

print 'Standard deviation of the Aluminum concentrations per sample:'
print '\t\t', stdAl
print '=> Average standard deviation of the Aluminum concentrations:', stdAlAv
print '=> Maximum standard deviation of the Aluminum concentrations:', stdAlMax

## Plot deviation from the mean for the Aluminum concentrations:

In [None]:
# One figure per sample: the deviation of every run from the run-averaged Al
# curve at the common measurement times; the title shows the sample's std.
for sample in range(4):
    plt.figure()
    for run in range(3):
        mask = index[sample][run]
        deviation = AlBar[sample][run][mask] - AlAv[sample]
        plt.plot(time[sample][run][mask], deviation, ',', label='Run ' + str(run + 1))
    plt.title('Data from sample ' + str(sample + 1) + ': std = ' + str(stdAl[sample]))
    plt.xlabel('t')
    plt.ylabel('Al(t)')
    plt.legend(loc=4)

## Calculate average standard deviation:

In [None]:
var = [[0, 0, 0], [0, 0, 0], [0, 0, 0], [0, 0, 0]]
std = [0, 0, 0, 0]
for i in range(4):
    for j in range(3):
        var[i][j] = (np.sum((SiBar[i][j][index[i][j]] - SiAv[i])**2) + np.sum((AlBar[i][j][index[i][j]] - AlAv[i])**2)
                    )/(len(SiAv[i]) + len(AlAv[i]))
    std[i] = sum(np.sqrt(var[i]))/3
stdAv = sum(std)/4
stdMax = max(std)

print 'Standard deviation per sample:', std
print '=> Average standard deviation:', stdAv
print '=> Average standard deviation:', stdMax

## Summarize the results:

In [None]:
print 'Standard deviation of the Silicium concentrations per sample:'
print '\t\t', stdSi
print '=> Average standard deviation of the Silicium concentrations:', stdSiAv
print '=> Maximum standard deviation of the Silicium concentrations:', stdSiMax
print ''
print 'Standard deviation of the Aluminum concentrations per sample:'
print '\t\t', stdAl
print '=> Average standard deviation of the Aluminum concentrations:', stdAlAv
print '=> Maximum standard deviation of the Aluminum concentrations:', stdAlMax
print ''
print 'Standard deviation per sample:', std
print '=> Average standard deviation:', stdAv
print '=> Maximum standard deviation:', stdMax

# Update

## The noise is not normally distributed:

In [None]:
from scipy.stats.mstats import normaltest

In [None]:
# Choose i = 0, 1, 2 or 3: (This is the number of the sample - 1)
i = 1

# Choose j = 0, 1 or 2: (This is the number of the run - 1 for that sample)
j = 1

noiseSi = SiBar[i][j][index[i][j]] - SiAv[i]
noiseAl = AlBar[i][j][index[i][j]] - AlAv[i]

plt.figure()
plt.hist(noiseSi, bins = 100)
plt.title('Histogram of the noise in the Si data')
plt.show()

plt.figure()
plt.hist(noiseAl, bins = 100)
plt.title('Histogram of the noise in the Al data')
plt.show()

print normaltest(noiseSi)
print normaltest(noiseAl)

## Percentage errors:

In [None]:
noiseSiperc = 100*noiseSi/SiAv[i]
noiseAlperc = 100*noiseAl/AlAv[i]

plt.figure()
plt.hist(noiseSiperc, bins = 100, range = (-25, 25))
plt.title('Histogram of the noise in the Si data')
plt.show()

plt.figure()
plt.hist(noiseAlperc, bins = 100, range = (-25, 25))
plt.title('Histogram of the noise in the Al data')
plt.show()

print normaltest(noiseSiperc)
print normaltest(noiseAlperc)

## Let's try something:

Looking at the histograms of the percentage errors for some of the samples, it seems that most of the errors fall within 10% of the averaged curve. Can we justify this?

First, let's look at the data again and plot bands of 10% around the averaged curve (the bands are given by the thick dashed lines).

In [None]:
# One figure per sample: raw Si (red) and Al (blue) measurements, the averaged
# curves (solid) and a fixed +/- 10% band around each average (dashed).
for sample in range(4):
    plt.figure()
    for run in range(3):
        plt.plot(time[sample][run], SiBar[sample][run], 'r,')
        plt.plot(time[sample][run], AlBar[sample][run], 'b,')
    # Per element: upper band, average, lower band - Si first, then Al.
    for average, colour in ((SiAv[sample], 'r'), (AlAv[sample], 'b')):
        plt.plot(timeAv[sample], 1.1*average, colour + '--', linewidth=2.5)
        plt.plot(timeAv[sample], average, colour, linewidth=2.5)
        plt.plot(timeAv[sample], 0.9*average, colour + '--', linewidth=2.5)
    plt.title('Data from sample ' + str(sample + 1))
    plt.xlabel('t')
    plt.ylabel('Si(t) (red) & Al(t) (blue)')

A 10% error doesn't seem to be a bad bound on the error. Only at the very beginning does this not seem to hold up. However, to really test this theory, we would need to compare it to more runs of the same sample. Furthermore, the 10% bound we chose is a little bit ad hoc at the moment, so let's find a more robust and mathematically motivated way to get an estimate for the percentage error, which we'll call $\gamma$.

## Idea:

Let's calculate the percentage error in each point and then choose $\gamma$ such that a fraction $(1 - \alpha)$ of the measurements falls within these bands.

In [None]:
def _smallest_band(noise_perc, alpha):
    """Return the smallest whole percentage g in 1..100 for which more than a
    fraction (1 - alpha) of |noise_perc| is strictly below g.

    If no g in 1..100 satisfies this, 100 is returned.
    """
    abs_noise = np.abs(noise_perc)
    for g in range(1, 101):
        inband = abs_noise < g
        conf = float(inband.sum())/inband.size
        if 1 - alpha < conf:
            break
    return g


# One estimate per [sample, run]: 4 samples with 3 runs each, for both elements.
gammaSi = np.zeros((4, 3))
gammaAl = np.zeros((4, 3))

alpha = 0.10        # So 90% of the measurement fall within the percentage error band.

for i in range(4):
    for j in range(3):
        # Errors: deviation of the run from the sample average at the common times.
        noiseSi = SiBar[i][j][index[i][j]] - SiAv[i]
        noiseAl = AlBar[i][j][index[i][j]] - AlAv[i]

        # Percentage errors:
        noiseSiperc = 100*noiseSi/SiAv[i]
        noiseAlperc = 100*noiseAl/AlAv[i]

        # Band width estimation for Si and Al:
        gammaSi[i, j] = _smallest_band(noiseSiperc, alpha)
        gammaAl[i, j] = _smallest_band(noiseAlperc, alpha)

In [None]:
print "For Si we found the following percentage errors:"
for i in range(4):
    print "\t Sample", i + 1, ": "
    for j in range(3):
        print "\t\t Run", j + 1, ": ", gammaSi[i, j]
    print "\t\t Averaged: ", gammaSi[i].sum()/3.0
gammaSiAv = gammaSi.sum()/12.0
gammaSimax = gammaSi.max()
print "The average percentage error over all Si measurements: ", gammaSiAv
print "The maximum percentage error over all Si measurements: ", gammaSimax

print ""

print "For Al we found the following percentage errors:"
for i in range(4):
    print "\t Sample", i + 1, ": "
    for j in range(3):
        print "\t\t Run", j + 1, ": ", gammaAl[i, j]
    print "\t\t Averaged: ", gammaAl[i].sum()/3.0
gammaAlAv = gammaAl.sum()/12.0
gammaAlmax = gammaAl.max()
print "The average percentage error over all Al measurements: ", gammaAlAv
print "The maximum percentage error over all Al measurements: ", gammaAlmax

print ""

gammaAv = (gammaSiAv + gammaAlAv)/2
gammamax = max([gammaSimax, gammaAlmax])

print "The averge percentage error over all measurements: ", gammaAv
print "The maximum percentage error over all measurements: ", gammamax


## Results:

From this analysis we find that $\gamma\approx 6.4167$ for the Si measurements and $\gamma\approx 7.3333$ for the Al measurements. Using these values we get the following bands:

In [None]:
# One figure per sample: raw Si (red) and Al (blue) measurements, the averaged
# curves (solid) and the per-element band around each average (dashed).
# The band widths are the AVERAGE percentage errors per element (gammaSiAv and
# gammaAlAv), which are the values quoted in the text introducing these plots;
# the bands based on the overall maximum are drawn in the next set of figures.
for i in range(4):
    plt.figure()
    for j in range(3):
        plt.plot(time[i][j], SiBar[i][j], 'r,')
        plt.plot(time[i][j], AlBar[i][j], 'b,')
    plt.plot(timeAv[i], (1 + gammaSiAv/100)*SiAv[i], 'r--', linewidth = 2.5)
    plt.plot(timeAv[i], SiAv[i], 'r', linewidth = 2.5)
    plt.plot(timeAv[i], (1 - gammaSiAv/100)*SiAv[i], 'r--', linewidth = 2.5)
    plt.plot(timeAv[i], (1 + gammaAlAv/100)*AlAv[i], 'b--', linewidth = 2.5)
    plt.plot(timeAv[i], AlAv[i], 'b', linewidth = 2.5)
    plt.plot(timeAv[i], (1 - gammaAlAv/100)*AlAv[i], 'b--', linewidth = 2.5)
    plt.title('Data from sample ' + str(i + 1))
    plt.xlabel('t')
    plt.ylabel('Si(t) (red) & Al(t) (blue)')

Using the maximum $\gamma = 11.0\%$ over all the measurements we find the following bands:

In [None]:
# One figure per sample: raw Si (red) and Al (blue) measurements, the averaged
# curves (solid) and a band of +/- gammamax percent around each (dashed).
for sample in range(4):
    plt.figure()
    for run in range(3):
        plt.plot(time[sample][run], SiBar[sample][run], 'r,')
        plt.plot(time[sample][run], AlBar[sample][run], 'b,')
    # Per element: upper band, average, lower band - Si first, then Al.
    for average, colour in ((SiAv[sample], 'r'), (AlAv[sample], 'b')):
        plt.plot(timeAv[sample], (1 + gammamax/100)*average, colour + '--', linewidth=2.5)
        plt.plot(timeAv[sample], average, colour, linewidth=2.5)
        plt.plot(timeAv[sample], (1 - gammamax/100)*average, colour + '--', linewidth=2.5)
    plt.title('Data from sample ' + str(sample + 1))
    plt.xlabel('t')
    plt.ylabel('Si(t) (red) & Al(t) (blue)')

## Extra verification:

We can now check whether, for a given set of measurements, these measurements fall within the $\gamma$ bound of the average measurements. If $\gamma$ was found for a certain value of $\alpha$, then a fraction $(1 - \alpha)$ of the measurements should fall within this bound.

Let's check this using $\alpha = 10\%$ and thus $\gamma = 11.0\%$.

In [None]:
gamma = 11
alpha = 0.10

for i in range(4):
    for j in range(3):
        #
        print "Case", i, j, ":"
        
        # Errors:
        noiseSi = SiBar[i][j][index[i][j]] - SiAv[i]
        noiseAl = AlBar[i][j][index[i][j]] - AlAv[i]
        
        # Percentage errors:
        noiseSiperc = 100*noiseSi/SiAv[i]
        noiseAlperc = 100*noiseAl/AlAv[i]
        
        # Verification for Si:
        inband = np.abs(noiseSiperc) < gammamax
        conf = float(inband.sum())/inband.size
        print "\t", conf, "percent of the Si data fall within the gamma bound."
        
        # Verification for Al
        inband = np.abs(noiseAlperc) < gammamax
        conf = float(inband.sum())/inband.size
        print "\t", conf, "percent of the Al data fall within the gamma bound."