In [1]:
import glob
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
from pandas.plotting import scatter_matrix


In [10]:
# Names given to the CSV columns that are read; see data/README.md for
# longer names/details.
columns = ['x', 'y', 'z', 'label']

# Activity names, listed in the order of their numeric codes (1-7).
labels = [
    'Computer',
    'Moving',
    'Standing',
    'Walking',
    'Stairs',
    'Walking + talking',
    'Standing + talking',
]

# Numeric label code -> activity name, derived from the list above so the
# names are only written once.
labels_map = dict(enumerate(labels, start=1))


def load_person_data(path, names):
    """Read one CSV file and keep only the rows whose label is 1-7.

    Parameters
    ----------
    path : str or Path
        CSV file without a header row. The file name without its extension
        (e.g. ``'1'`` for ``data/1.csv``) is stored in the 'person' column.
    names : list of str
        Names for file columns 1-4; must contain 'label'.

    Returns
    -------
    DataFrame
        The named columns plus 'person', restricted to rows with
        1 <= label <= 7 (the codes defined in ``labels_map``).
    """
    # usecols is used to skip reading the 'id' column, which has errors
    person_data = pd.read_csv(path, sep=',', header=None, names=names,
                              usecols=[1, 2, 3, 4])
    # Path.stem works for any path separator, unlike splitting on 'data/'.
    person_data['person'] = Path(path).stem
    has_known_label = person_data['label'].between(1, 7)
    return person_data.loc[has_known_label, :]


# find all csv files; sorted so the row order of `ad` is reproducible
files = sorted(glob.glob('data/*.csv'))
# use only 1 file:
#files = ['data/1.csv']
# Concatenate all data files in one pass (growing a frame inside the loop
# copies every previously loaded row again for each new file).
# Row indices restart at 0 for every file, as they did before.
frames = [load_person_data(file, columns) for file in files]
ad = pd.concat(frames) if frames else DataFrame()

# TODO: put some reasonable values here
x_min_max = (1455, 2356)
y_min_max = (1697, 2713)
z_min_max = (1644, 2739)

# Print the observed range of each acceleration column and the number of
# rows per label.
# The label counts show that the classes are unbalanced: in the recorded run
# label 1 (Computer) has 608667 rows while label 6 (Walking + talking) has
# only 47770. When using knn, the class with most data points will be
# favoured.
# TODO detect anomalies
for column in columns:
    if column == 'label':
        print(ad[column].value_counts())
    elif column in ('x', 'y', 'z'):
        print(column, 'min max:', ad[column].min(), ad[column].max())

# The three acceleration columns.
accs = ['x', 'y', 'z']

# Plot colour used for each class (label).
label_color = {
    'Computer': 'red',
    'Moving': 'green',
    'Standing': 'black',
    'Walking': 'yellow',
    'Stairs': 'cyan',
    'Walking + talking': 'orange',
    'Standing + talking': 'purple',
}

# Replace the numeric label codes with their names, then add a color to
# each class (label). The second step reads the already-renamed labels.
ad = ad.assign(
    label=lambda frame: frame['label'].map(labels_map),
    color=lambda frame: frame['label'].map(label_color),
)


x min max: 282 3828
y min max: 2 4095
z min max: 1 4095
1    608667
7    593563
4    357064
3    216737
5     51498
2     47878
6     47770
Name: label, dtype: int64


In [None]:
# Pairwise scatter plots of the columns of `ad`, with a histogram of each
# column on the diagonal.
scatter_matrix(
    frame=ad,
    figsize=(16, 16),
    diagonal='hist',
    alpha=.2,
)
# To keep the figure: plt.savefig('scattermat.png')

In [13]:
# Rows whose z or y reading is above 3800 (presumably outlier candidates;
# the recorded maximum of both columns is 4095).
outs = ad[ad[['z', 'y']].gt(3800).any(axis=1)]

In [None]:
def explore_acc(feature='x'):
    """Summarise one column of the global ``ad`` frame.

    Prints the column's minimum and maximum and the share of rows falling in
    each of 10 bins, then draws a 100-bin histogram on a new figure.

    Parameters
    ----------
    feature : str, default 'x'
        Name of a column of ``ad``.
    """
    feat = ad[feature]
    print(f"{feature} min: {feat.min()}, max: {feat.max()}")
    # show statistics
    print(feat.value_counts(bins=10, normalize=True, sort=False))
    # plot distribution on its own figure; Series.plot would otherwise draw
    # onto whatever axes happen to be current. The Axes object is not
    # printed, it only added repr noise to the output.
    fig, ax = plt.subplots()
    feat.plot.hist(bins=100, ax=ax)
    ax.set_xlabel(feature)

explore_acc()


In [None]:
def explore_label():
    """Print the share of rows per class in the global ``ad`` and draw it as a pie chart."""
    # Show statistics and distribution; the shares are computed once and
    # used for both the printout and the chart.
    shares = ad['label'].value_counts(normalize=True, sort=True)
    print(shares)
    shares.plot.pie()

explore_label()

In [None]:
# Explore pair of attributes (columns)
# plot labels against each acceleration: one figure per acceleration column,
# with one box per class placed side by side.
for acc in accs:
    fig, ax = plt.subplots(1, 1)

    # A single boxplot call with one array per class; plotting each class
    # separately would stack every box on the same x position.
    samples = [ad.loc[ad['label'] == label, acc].to_numpy() for label in labels]
    ax.boxplot(samples)

    # boxplot places the boxes at x = 1..len(labels); name them by class.
    ax.set_xticks(range(1, len(labels) + 1))
    ax.set_xticklabels(labels, rotation=45, ha='right')

    ax.set_title(f"Distribution of {acc} for each class of movement")
    ax.set_xlabel('Class of movement')
    ax.set_ylabel('Acceleration')
    # plt.show() rather than fig.show(): the latter warns on non-GUI
    # (inline) backends.
    plt.show()

In [None]:
# One figure per class: box plots of the three acceleration columns.
for label in labels:
    label_acc = ad.loc[ad['label'] == label, accs]
    fig, ax = plt.subplots()
    label_acc.boxplot(grid=False, ax=ax)
    # Without a title the seven figures cannot be told apart.
    ax.set_title(f"Accelerations for class: {label}")
    ax.set_ylabel('Acceleration')
    plt.show()


In [None]:
# Hypothesis: I expect to see some relationship between accelerations, e.g.
# walking in stairs would have both x acc and y acc higher, while walking
# flat would show little change in y, but maybe more in z.
# Result: I can see some groupings/clusters forming, but I cannot see which
# classes are forming these clusters. Need to color code.
#
# One scatter plot per ordered pair of distinct acceleration columns.
for acc, acc2 in ((a, b) for a in accs for b in accs if a != b):
    ax = ad.plot.scatter(x=acc, y=acc2)
    ax.set_title(f"{acc} vs {acc2}")
    ax.set_xlabel(f"{acc}")
    ax.set_ylabel(f"{acc2}")
    plt.show()

In [None]:
# TASK: Explore the relationship between all pairs of attributes
# Compare class label, vs accelerations
# Hypothesis: This will show me how they are clustered, the task which I want to do
# Result:

# Colour per class for the plots below (Computer is grey in this cell).
label_color = {
    'Computer': 'grey',
    'Moving': 'green',
    'Standing': 'black',
    'Walking': 'yellow',
    'Stairs': 'cyan',
    'Walking + talking': 'orange',
    'Standing + talking': 'purple' ,
}
# observations: black (Standing) and purple (Standing + talking) totally
# overlap.
# Computer differs the most in all dimensions, generally lower values.
# NOTE(review): the original observation said "Red", which is Computer's
# colour in the earlier label_color definition; with this dict it is grey.

# add a color to each class (label)
ad['color'] = ad['label'].map(label_color)


# One scatter plot per ordered pair of distinct acceleration columns,
# coloured by class.
for acc in accs:
    for acc2 in accs:
        if acc == acc2: continue
        ax = ad.plot.scatter(x=acc, y=acc2, c=ad['color'], alpha=0.4)
        ax.set_title(f"{acc} vs {acc2} by labels")
        ax.set_xlabel(f"{acc}")
        ax.set_ylabel(f"{acc2}")
        # Legend: the points are drawn in a single call, so build one empty
        # proxy scatter per class to get a colour -> class key.
        handles = [ax.scatter([], [], color=color, label=name)
                   for name, color in label_color.items()]
        ax.legend(handles=handles, title='label')
        plt.show()

In [None]:
# 3D plot of the three acceleration columns, coloured by class
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.scatter(ad['x'], ad['y'], ad['z'], c=ad['color'], alpha=.2)
# Label the axes: in a 3D view they cannot be told apart otherwise.
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
ax.set_title('x, y and z by labels')
plt.show()
