In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import bootstrap

In [2]:
df_headers = ['activity', 'dataset']
for i in range(1, 7):
    df_headers.append('min-' + str(i))
    df_headers.append('max-' + str(i))
    df_headers.append('mean-' + str(i))
    df_headers.append('median-' + str(i))
    df_headers.append('std-' + str(i))
    df_headers.append('1quart-' + str(i))
    df_headers.append('3quart-' + str(i))

# print(df_headers)

# Question (1) (a) (b) (c) (i) (ii)

In [3]:
df = pd.DataFrame(columns=df_headers)

for root, dirs, files in os.walk('../data/ARem'):
    for filename in files:
        if 'csv' in filename:
            df_row = []
            activity = root.replace('../data/ARem/', '') # activity
            dataset = filename.replace('.csv', '') # dataset
            df_row.extend((activity, dataset))
            # print(df_row)
            
            seperator = ','
            if (activity == 'bending2') and (dataset == 'dataset4'): seperator = ' '
                
            path = os.path.join(root, filename)
            df_instance = pd.read_csv(path, skiprows=5, header=None, sep=seperator)
            for i in range(1, 7):
                df_row.append(df_instance.min()[i])
                df_row.append(df_instance.max()[i])
                df_row.append(df_instance.mean()[i])
                df_row.append(df_instance.median()[i])
                df_row.append(df_instance.std()[i])
                df_row.append(df_instance.quantile(0.25)[i])
                df_row.append(df_instance.quantile(0.75)[i])
            df.loc[len(df.index)] = df_row

In [4]:
df.head()

Unnamed: 0,activity,dataset,min-1,max-1,mean-1,median-1,std-1,1quart-1,3quart-1,min-2,...,std-5,1quart-5,3quart-5,min-6,max-6,mean-6,median-6,std-6,1quart-6,3quart-6
0,bending1,dataset7,36.25,48.0,43.969125,44.5,1.618364,43.31,44.67,0.0,...,3.318301,20.5,23.75,0.0,2.96,0.555313,0.49,0.487826,0.0,0.83
1,bending1,dataset6,37.0,48.0,43.454958,43.25,1.386098,42.5,45.0,0.0,...,2.488862,22.25,24.0,0.0,5.26,0.679646,0.5,0.622534,0.43,0.87
2,bending1,dataset4,33.0,47.75,42.179813,43.5,3.670666,39.15,45.0,0.0,...,3.849448,30.4575,36.33,0.0,2.18,0.613521,0.5,0.524317,0.0,1.0
3,bending1,dataset5,33.0,45.75,41.678063,41.75,2.24349,41.33,42.75,0.0,...,2.411026,28.4575,31.25,0.0,1.79,0.383292,0.43,0.389164,0.0,0.5
4,bending1,dataset1,37.25,45.0,40.624792,40.5,1.476967,39.25,42.0,0.0,...,2.188449,33.0,36.0,0.0,1.92,0.570583,0.43,0.582915,0.0,1.3


# Question (1) (c) (iii)

In [5]:
df_numeric = df.iloc[:, 2:] # remove activity and dataset columns
df_numeric.std() # standard deviation of each feature (column)

min-1       9.569975
max-1       4.394362
mean-1      5.335718
median-1    5.440054
std-1       1.772153
1quart-1    6.153590
3quart-1    5.138925
min-2       0.000000
max-2       5.062729
mean-2      1.574164
median-2    1.412244
std-2       0.884105
1quart-2    0.946386
3quart-2    2.125266
min-3       2.956462
max-3       4.875137
mean-3      4.008380
median-3    4.036396
std-3       0.946710
1quart-3    4.220658
3quart-3    4.171628
min-4       0.000000
max-4       2.183625
mean-4      1.166114
median-4    1.145586
std-4       0.458242
1quart-4    0.843620
3quart-4    1.552504
min-5       6.124001
max-5       5.741238
mean-5      5.675593
median-5    5.813782
std-5       1.024898
1quart-5    6.096465
3quart-5    5.531720
min-6       0.045838
max-6       2.518921
mean-6      1.154812
median-6    1.086474
std-6       0.517617
1quart-6    0.758584
3quart-6    1.523599
dtype: float64

In [6]:
confidence_interval_ranges = []
for column in df_numeric:
    res = bootstrap((df_numeric[column].values, ), np.std, method='basic', confidence_level=0.9)
    confidence_interval_ranges.append((column, res.confidence_interval[1] - res.confidence_interval[0]))
    print(column, 'CI: [' + str(res.confidence_interval[0]) + ', ' + str(res.confidence_interval[1]) + ']')

min-1 CI: [8.32080437461135, 10.828538992687845]
max-1 CI: [3.465093294793805, 5.41583078124123]
mean-1 CI: [4.757474773020469, 5.928429103448349]
median-1 CI: [4.853531741509988, 6.0516735476200125]
std-1 CI: [1.5882507399826733, 1.9645585197952813]
1quart-1 CI: [5.631942034341408, 6.707997213340449]
3quart-1 CI: [4.415704713155824, 5.927386338173102]
min-2 CI: [0.0, 0.0]
max-2 CI: [4.689749824076415, 5.472495421403169]
mean-2 CI: [1.4334233242220875, 1.741483047895644]
median-2 CI: [1.2725777216081733, 1.580189131315753]
std-2 CI: [0.8208418389999846, 0.9596512247595305]
1quart-2 CI: [0.8535484107481024, 1.0568049947928282]
3quart-2 CI: [1.9444636425634512, 2.3428690704491304]
min-3 CI: [2.785602924330212, 3.136936110602181]
max-3 CI: [4.2682841444978425, 5.546486584265417]
mean-3 CI: [3.49845845630956, 4.562511604439292]
median-3 CI: [3.5077294843224314, 4.605339778883589]
std-3 CI: [0.7663426658536505, 1.1229180870021145]
1quart-3 CI: [3.719243826703896, 4.790569126271538]
3quart-3

# Question (1) (c) (iv)

In [7]:
range_totals = [0, 0, 0, 0, 0, 0, 0]
for i in confidence_interval_ranges:
    if ('min' in i[0]): range_totals[0] += i[1]
    if ('max' in i[0]): range_totals[1] += i[1]
    if ('mean' in i[0]): range_totals[2] += i[1]
    if ('median' in i[0]): range_totals[3] += i[1]
    if ('std' in i[0]): range_totals[4] += i[1]
    if ('1quart' in i[0]): range_totals[5] += i[1]
    if ('3quart' in i[0]): range_totals[6] += i[1]
        
print('min confidence interval range total:', range_totals[0])
print('max confidence interval range total:', range_totals[1])
print('mean confidence interval range total:', range_totals[2])
print('median confidence interval range total:', range_totals[3])
print('std confidence interval range total:', range_totals[4])
print('1quart confidence interval range total:', range_totals[5])
print('3quart confidence interval range total:', range_totals[6])

min confidence interval range total: 6.02108466427155
max confidence interval range total: 6.729089610788531
mean confidence interval range total: 5.107975470208029
median confidence interval range total: 5.29211947698749
std confidence interval range total: 1.407760805581288
1quart confidence interval range total: 4.986117651808476
3quart confidence interval range total: 5.615573930741837


The three most important time-domain features are **mean, standard deviation, and first quartile**. I came to this conclusion by calculating the confidence interval range of each feature for all of the 6 series and choosings the lowest total ranges, giving the time-domain features with the narrowest confidence intervals.

# Question (2)

(a) The extra polynomial term in cubic regression will provide a closer fit to the training data as there are more degrees of freedom. Because of this, I would expect the training RSS for cubic regression to be lower than the training RSS for linear regression.

(b) Using similar reasoning to (a) the extra polynomical term in cubic regression will provide a closer fit to the training data, but also more likely overfit the training data. Given that the true relationship between X and Y is linear, I would expect the test RSS for linear regression to generalize better (and be lower) than the test RSS for cubic regression.

(c) The extra polynomial term in cubic regression will provide a closer fit to the training data as there are more degrees of freedom. Even more so than (a), because the true relationship between X and Y is non-linear. I would expect the training RSS for cubic regression to be lower than the training RSS for linear regression.

(d) There is not enough information to tell. This is because "we don't know how far it is from linear". The test RSS for linear and cubic regression depends on how far from linear the true relationship is. If the true relationship is more linear than cubic, test RSS of linear regression will be lower. If the true relationship is more cubic than linear, test RSS of cubic regression will be lower. We can not say definitively.