## Data analysis for the bachelor thesis: using the scattering transform for image classification by Marius Hobbhahn (2019)

1. Baseline tests: 3 datasets (VOC, Kitti, toy_data) with the additional parameters Batchnorm, Augmentations, and Pretrained. The dependent variable is AP (average precision).

In [11]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf  

In [12]:
# Read the baseline results from the comma-separated file and display the frame.
baselinedf = pd.read_csv("baselines.csv", sep=",")
baselinedf

Unnamed: 0,VOC,Toy_data,Kitti,Augmentations,Batchnorm,Pretrained,Accuracy
0,1,0,0,1,1,1,0.6250
1,1,0,0,1,1,1,0.6205
2,1,0,0,1,1,1,0.6062
3,1,0,0,1,1,1,0.6164
4,1,0,0,1,1,1,0.6271
5,1,0,0,0,1,1,0.3470
6,1,0,0,0,1,1,0.3411
7,1,0,0,0,1,1,0.3686
8,1,0,0,0,1,1,0.3191
9,1,0,0,0,1,1,0.3287


In [13]:
# Data cleaning: remove the rows treated as outliers, addressed by index label.
# A new frame is produced; baselinedf itself is left untouched.
outlier_rows = [20, 36, 38, 44, 53, 57]

baseline_df_clean = baselinedf.drop(index=outlier_rows)
baseline_df_clean

Unnamed: 0,VOC,Toy_data,Kitti,Augmentations,Batchnorm,Pretrained,Accuracy
0,1,0,0,1,1,1,0.6250
1,1,0,0,1,1,1,0.6205
2,1,0,0,1,1,1,0.6062
3,1,0,0,1,1,1,0.6164
4,1,0,0,1,1,1,0.6271
5,1,0,0,0,1,1,0.3470
6,1,0,0,0,1,1,0.3411
7,1,0,0,0,1,1,0.3686
8,1,0,0,0,1,1,0.3191
9,1,0,0,0,1,1,0.3287


In [14]:
# Fit a binomial GLM of Accuracy on the dataset columns and the training options.
# The link is left at the Binomial family's default, which is the logit link.
# Passing the link *class* (sm.families.links.logit) is rejected by recent
# statsmodels releases, which expect a link instance; relying on the default
# gives the same model on old and new versions alike.
# NOTE(review): VOC, Toy_data and Kitti look like one-hot dataset indicators; if
# so, they are collinear with the intercept -- confirm this is intended.
baselinefit = smf.glm(
    formula='Accuracy~VOC + Toy_data + Kitti + Augmentations + Batchnorm + Pretrained',
    data=baseline_df_clean,
    family=sm.families.Binomial(),
).fit()
print(baselinefit.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               Accuracy   No. Observations:                   88
Model:                            GLM   Df Residuals:                       82
Model Family:                Binomial   Df Model:                            5
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -29.873
Date:                Mon, 03 Jun 2019   Deviance:                       2.4687
Time:                        22:30:25   Pearson chi2:                     2.45
No. Iterations:                     8   Covariance Type:             nonrobust
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -1.2916      0.447     -2.892      0.004      -2.167      -0.416
VOC               0.1038      0.369      0.

In [15]:
# Build a table with the mean and standard deviation of Accuracy for every
# combination of dataset, augmentation, batchnorm and pretraining setting,
# print one line per combination, and finally emit the table as LaTeX.
df = baseline_df_clean
datasets = ['VOC', 'Kitti', 'Toy_data']
augmentations = [0, 1]
batchnorms = [0, 1]
pretrained = [0, 1]
data = []

# Flattened settings in nested-loop order: dataset outermost, pretrained innermost.
settings = [
    (name, aug, bn, pre)
    for name in datasets
    for aug in augmentations
    for bn in batchnorms
    for pre in pretrained
]

row_template = "dataset: {} \t augmentation: {} \t batchnorm: {} \t pretrained: {} \t mean: {:.03f} \t std_dev: {:.03f}"

for name, aug, bn, pre in settings:
    # Rows belonging to this dataset column with exactly this option combination.
    selected = (
        (df[name] == 1)
        & (df['Augmentations'] == aug)
        & (df['Batchnorm'] == bn)
        & (df['Pretrained'] == pre)
    )
    accuracies = df.loc[selected, 'Accuracy']
    avg = np.mean(accuracies)
    spread = np.std(accuracies)
    print(row_template.format(name, aug, bn, pre, avg, spread))
    data.append([name, aug, bn, pre, np.around(avg, 3), np.around(spread, 3)])

columns = ['Dataset', 'Augmentations', 'Batchnorm', 'Pretrained', 'Mean', 'Std_dev']
final_df = pd.DataFrame(data, columns=columns)
print(final_df.to_latex(index=False))


dataset: VOC 	 augmentation: 0 	 batchnorm: 0 	 pretrained: 0 	 mean: 0.108 	 std_dev: 0.008
dataset: VOC 	 augmentation: 0 	 batchnorm: 0 	 pretrained: 1 	 mean: 0.363 	 std_dev: 0.055
dataset: VOC 	 augmentation: 0 	 batchnorm: 1 	 pretrained: 0 	 mean: 0.329 	 std_dev: 0.041
dataset: VOC 	 augmentation: 0 	 batchnorm: 1 	 pretrained: 1 	 mean: 0.341 	 std_dev: 0.017
dataset: VOC 	 augmentation: 1 	 batchnorm: 0 	 pretrained: 0 	 mean: 0.364 	 std_dev: 0.025
dataset: VOC 	 augmentation: 1 	 batchnorm: 0 	 pretrained: 1 	 mean: 0.630 	 std_dev: 0.003
dataset: VOC 	 augmentation: 1 	 batchnorm: 1 	 pretrained: 0 	 mean: 0.568 	 std_dev: 0.002
dataset: VOC 	 augmentation: 1 	 batchnorm: 1 	 pretrained: 1 	 mean: 0.619 	 std_dev: 0.007
dataset: Kitti 	 augmentation: 0 	 batchnorm: 0 	 pretrained: 0 	 mean: 0.027 	 std_dev: 0.024
dataset: Kitti 	 augmentation: 0 	 batchnorm: 0 	 pretrained: 1 	 mean: 0.032 	 std_dev: 0.009
dataset: Kitti 	 augmentation: 0 	 batchnorm: 1 	 pretrained: 0 	 

# 2. Tests for the invariance datasets

In [16]:
# Read the invariance results from the comma-separated file and display the frame.
invariantdf = pd.read_csv("invariant_data.csv", sep=",")
invariantdf

Unnamed: 0,Deformation_data,Rotation_data,Translation_data,Scale_data,Pretrained,Accuracy
0,1,0,0,0,1,0.9237
1,1,0,0,0,1,0.8462
2,1,0,0,0,1,0.8947
3,1,0,0,0,1,0.9047
4,1,0,0,0,1,0.9088
5,1,0,0,0,0,0.9237
6,1,0,0,0,0,0.9302
7,1,0,0,0,0,0.9267
8,1,0,0,0,0,0.932
9,1,0,0,0,0,0.9293


In [17]:
# Fit a binomial GLM of Accuracy on the invariance-dataset columns and Pretrained.
# The link is left at the Binomial family's default, which is the logit link.
# Passing the link *class* (sm.families.links.logit) is rejected by recent
# statsmodels releases, which expect a link instance; relying on the default
# gives the same model on old and new versions alike.
# NOTE(review): the four *_data columns look like one-hot dataset indicators; if
# so, they are collinear with the intercept -- confirm this is intended.
invariantfit = smf.glm(
    formula='Accuracy~Deformation_data  + Rotation_data + Scale_data + Translation_data + Pretrained',
    data=invariantdf,
    family=sm.families.Binomial(),
).fit()
print(invariantfit.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:               Accuracy   No. Observations:                   42
Model:                            GLM   Df Residuals:                       37
Model Family:                Binomial   Df Model:                            4
Link Function:                  logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -11.790
Date:                Mon, 03 Jun 2019   Deviance:                     0.086250
Time:                        22:30:25   Pearson chi2:                   0.0930
No. Iterations:                     9   Covariance Type:             nonrobust
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.5830      1.724     -0.338      0.735      -3.962       2.796
Deformation_data     2.9682      1

In [21]:
# Build a table with the mean and standard deviation of Accuracy for every
# combination of invariance dataset and pretraining setting, print one line per
# combination, and finally emit the table as LaTeX.
df = invariantdf
datasets = ['Deformation_data', 'Rotation_data', 'Translation_data', 'Scale_data']
pretrained = [0, 1]
data = []

# Flattened settings in nested-loop order: dataset outermost, pretrained innermost.
settings = [(name, pre) for name in datasets for pre in pretrained]

row_template = "dataset: {} \t pretrained: {} \t mean: {:.03f} \t std_dev: {:.03f}"

for name, pre in settings:
    # Rows belonging to this dataset column with this pretraining flag.
    selected = (df[name] == 1) & (df['Pretrained'] == pre)
    accuracies = df.loc[selected, 'Accuracy']
    avg = np.mean(accuracies)
    spread = np.std(accuracies)
    print(row_template.format(name, pre, avg, spread))
    data.append([name, pre, np.around(avg, 3), np.around(spread, 3)])

columns = ['Dataset', 'Pretrained', 'Mean', 'Std_dev']
final_df = pd.DataFrame(data, columns=columns)
print(final_df.to_latex(index=False))

dataset: Deformation_data 	 pretrained: 0 	 mean: 0.928 	 std_dev: 0.003
dataset: Deformation_data 	 pretrained: 1 	 mean: 0.896 	 std_dev: 0.026
dataset: Rotation_data 	 pretrained: 0 	 mean: 0.635 	 std_dev: 0.010
dataset: Rotation_data 	 pretrained: 1 	 mean: 0.622 	 std_dev: 0.026
dataset: Translation_data 	 pretrained: 0 	 mean: 0.001 	 std_dev: 0.001
dataset: Translation_data 	 pretrained: 1 	 mean: 0.002 	 std_dev: 0.001
dataset: Scale_data 	 pretrained: 0 	 mean: 0.644 	 std_dev: 0.006
dataset: Scale_data 	 pretrained: 1 	 mean: 0.637 	 std_dev: 0.004
\begin{tabular}{lrrr}
\toprule
          Dataset &  Pretrained &   Mean &  Std\_dev \\
\midrule
 Deformation\_data &           0 &  0.928 &    0.003 \\
 Deformation\_data &           1 &  0.896 &    0.026 \\
    Rotation\_data &           0 &  0.635 &    0.010 \\
    Rotation\_data &           1 &  0.622 &    0.026 \\
 Translation\_data &           0 &  0.001 &    0.001 \\
 Translation\_data &           1 &  0.002 &    0.001 \\
  

# 3 sequential scattering

# 4 parallel scattering

# Timing experiments

In [28]:
# TIMINGS
# Recorded timing values for the parallel scattering variant, followed by a
# one-line summary (mean and standard deviation) of the series.
scattering_parallel = [
    1.4894, 1.4930, 1.4955, 1.4959, 1.4999, 1.4968, 1.5008, 1.4978, 1.4987, 1.4994,
    1.4981, 1.4968, 1.4971, 1.4998, 1.4970, 1.4986, 1.5014, 1.4976, 1.4985, 1.4984,
    1.4964, 1.4981, 1.4962, 1.5008, 1.4969, 1.5001, 1.4992, 1.4984, 1.4989, 1.5001,
    1.4980, 1.5013, 1.4993, 1.5012, 1.4980, 1.4980, 1.5017, 1.4977, 1.5017, 1.4978,
    1.4978, 1.5020, 1.4967, 1.4982, 1.4986, 1.4984, 1.4991, 1.4989, 1.4994, 1.5000,
    1.5002, 1.4986, 1.4983, 1.4988, 1.4998, 1.4976, 1.4969, 1.5012, 1.4995, 1.4972,
    1.4994, 1.5017, 1.4997, 1.5002, 1.5010, 1.4955, 1.4982, 1.4990, 1.5000, 1.5004,
    1.5002, 1.4976, 1.4981, 1.4997, 1.4970, 1.4990, 1.5003, 1.4987, 1.4981, 1.4988,
    1.4998, 1.4991, 1.5000, 1.4999, 1.4992, 1.4980, 1.4980, 1.4957, 1.4994, 1.5017,
    1.4999, 1.4958, 1.4982, 1.5011, 1.4983, 1.4989, 1.4981, 1.4966, 1.4990, 1.5009,
]
parallel_mean = np.mean(scattering_parallel)
parallel_std = np.std(scattering_parallel)
print("scattering parallel: mean={:.03f} \t std={:.03f}".format(parallel_mean, parallel_std))

# Recorded timing values for the sequential scattering variant, followed by a
# one-line summary (mean and standard deviation) of the series.
scattering_sequential = [
    0.1739, 0.1778, 0.1765, 0.2032, 0.1740, 0.1788, 0.1766, 0.1870, 0.1756, 0.1738,
    0.1757, 0.1747, 0.1854, 0.1746, 0.1746, 0.1824, 0.1770, 0.1766, 0.1772, 0.1769,
    0.1765, 0.1750, 0.1753, 0.1858, 0.1775, 0.1766, 0.1820, 0.1804, 0.1758, 0.1842,
    0.1750, 0.1772, 0.1749, 0.1756, 0.1757, 0.1789, 0.1791, 0.1748, 0.1822, 0.1766,
    0.1790, 0.1806, 0.1767, 0.1770, 0.1764, 0.1757, 0.1745, 0.1780, 0.1800, 0.1794,
    0.1757, 0.1748, 0.1850, 0.1775, 0.1771, 0.1754, 0.1769, 0.1765, 0.1781, 0.1810,
    0.1798, 0.1755, 0.1761, 0.1856, 0.1749, 0.1859, 0.1752, 0.1761, 0.1758, 0.1840,
    0.1844, 0.1824, 0.1775, 0.1762, 0.1877, 0.1763, 0.1754, 0.1876, 0.1834, 0.1761,
    0.1750, 0.1742, 0.1745, 0.1749, 0.1828, 0.1778, 0.1775, 0.1863, 0.1771, 0.1805,
    0.1762, 0.1782, 0.1857, 0.1789, 0.1769, 0.1760, 0.1788, 0.1764, 0.1788, 0.1803,
]

sequential_mean = np.mean(scattering_sequential)
sequential_std = np.std(scattering_sequential)
print("sequential scattering: mean={:.03f} \t std={:.03f}".format(sequential_mean, sequential_std))

# Recorded timing values for the `normal` series, followed by a one-line summary
# (mean and standard deviation).
normal = [0.2331, 0.2343, 0.2323, 0.2359, 0.2340, 0.2328, 0.2344, 0.2329, 0.2333, 0.2308,
          0.2332, 0.2343, 0.2343, 0.2346, 0.2313, 0.2637, 0.2352, 0.2360, 0.2340, 0.2356,
          0.2357, 0.2328, 0.2355, 0.2358, 0.2339, 0.2362, 0.2362, 0.2333, 0.2362, 0.2355,
          0.2475, 0.2381, 0.2346, 0.2347, 0.2335, 0.2362, 0.2342, 0.2338, 0.2360, 0.2340,
          0.2340, 0.2358, 0.2359, 0.2340, 0.2357, 0.2357, 0.2323, 0.2537, 0.2353, 0.2370,
          0.2348, 0.2362, 0.2355, 0.2340, 0.2357, 0.2363, 0.2338, 0.2346, 0.2362, 0.2338,
          0.2408, 0.2332, 0.2363, 0.2395, 0.2359, 0.2361, 0.2343, 0.2364, 0.2354, 0.2351,
          0.2359, 0.2356, 0.2350, 0.2318, 0.2368, 0.2339, 0.2321, 0.2356, 0.2359, 0.2398,
          0.2361, 0.2367, 0.2345, 0.2349, 0.2362, 0.2346, 0.2328, 0.2332, 0.2344, 0.2358,
          0.2347, 0.2341, 0.2329, 0.2366, 0.2337, 0.2445, 0.2360, 0.2363, 0.2344, 0.2452]

# The label names this series; it must not repeat the "scattering parallel" label,
# otherwise two different summary lines become indistinguishable in the output.
normal_summary = "normal: mean={:.03f} \t std={:.03f}".format(np.mean(normal), np.std(normal))
print(normal_summary)


scattering parallel: mean=1.499 	 std=0.002
sequential scattering: mean=0.178 	 std=0.004
scattering parallel: mean=0.236 	 std=0.004


# small data and short training experiments