# Imports

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import scipy.io as sio

# Load the data

In [2]:
file = 'hwkdataNEW.mat'
data_dict = sio.loadmat(file)
print(data_dict.keys())

dict_keys(['__header__', '__version__', '__globals__', 'x', 'y'])


Extract the parts we want and examine them

In [3]:
x = data_dict['x']
y = data_dict['y']

In [4]:
x.shape

(247, 14)

In [5]:
y.shape

(247, 1)

We want to put this all into the `pandas` data format

In [6]:
column_names = ['Age', 'Weight', 'Height', 'Adiposity Index', 
                'Neck Circumference', 'Chest Circumference', 
                'Abdomen Circumference', 'Hip Circumference',
                'Thigh Circumference', 'Knee Circumference',
                'Ankle Circumference', 'Extended Bicep Circumference',
                'Forearm Circumference', 'Wrist Circumference',
                'Body Fat Percentage']

In [7]:
data = pd.DataFrame(x, columns=column_names[:-1]) # Last column contains y

In [8]:
data[column_names[-1]] = y

In [9]:
data.head()

Unnamed: 0,Age,Weight,Height,Adiposity Index,Neck Circumference,Chest Circumference,Abdomen Circumference,Hip Circumference,Thigh Circumference,Knee Circumference,Ankle Circumference,Extended Bicep Circumference,Forearm Circumference,Wrist Circumference,Body Fat Percentage
0,23.0,154.25,67.75,23.7,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4,17.1,12.6
1,22.0,173.25,72.25,23.4,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,18.2,6.9
2,22.0,154.0,66.25,24.7,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,16.6,24.6
3,26.0,184.75,72.25,24.9,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,18.2,10.9
4,24.0,184.25,71.25,25.6,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7,17.7,27.8


Data is now a pandas DataFrame that contains the information we need

# Sort into Train, Test, and Validation

Since the rows of data should be indpendent and uncorrelated, I have chosen to simply shuffle them then tag them as belonging to one of the named data sets (`train`, `test`, or `validation`). I will use the `.sample` method for this, which randomly selects the fraction provided as an argument.

In [58]:
train = data.sample(frac=0.7)
valid = data.sample(frac=0.15)
test = data.sample(frac=0.15)

In [59]:
train.size / 15

173.0

In [60]:
valid.size / 15

37.0

In [61]:
data.size  / 15

247.0

In [62]:
test.size / 15

37.0

In [63]:
train.head()

Unnamed: 0,Age,Weight,Height,Adiposity Index,Neck Circumference,Chest Circumference,Abdomen Circumference,Hip Circumference,Thigh Circumference,Knee Circumference,Ankle Circumference,Extended Bicep Circumference,Forearm Circumference,Wrist Circumference,Body Fat Percentage
192,42.0,168.0,71.5,23.1,36.5,92.0,89.7,101.0,62.3,38.0,22.3,30.8,27.8,16.9,17.6
240,68.0,155.5,69.25,22.8,36.3,97.4,84.3,94.4,54.3,37.5,22.6,29.2,27.3,18.5,15.3
86,48.0,176.0,73.0,23.3,36.7,96.7,86.5,98.3,60.4,39.9,24.4,28.8,29.6,18.7,14.3
19,33.0,211.75,73.5,27.6,40.0,106.2,100.5,109.0,65.8,40.6,24.0,37.1,30.1,18.2,16.5
110,48.0,175.25,71.75,24.0,38.0,100.7,92.4,97.5,59.3,38.1,21.8,31.8,27.3,17.5,25.9


# Ensure that the training dataset contains the rows with the highest value of each column

In [64]:
rows_appended = 0
for col in column_names:
    try:
        train = train.append(data.loc[data[col].idxmax()], verify_integrity=True)
        rows_appended += 1
    except ValueError:
        print('This max-row was in the training data already')
    try:
        train = train.append(data.loc[data[col].idxmin()], verify_integrity=True)
        rows_appended += 1
    except ValueError:
        print('This min-row was in the training data already')
print(f'Appended {rows_appended} rows')

This min-row was in the training data already
This min-row was in the training data already
This min-row was in the training data already
This max-row was in the training data already
This max-row was in the training data already
This min-row was in the training data already
This max-row was in the training data already
This min-row was in the training data already
This max-row was in the training data already
This min-row was in the training data already
This max-row was in the training data already
This min-row was in the training data already
This max-row was in the training data already
This min-row was in the training data already
This max-row was in the training data already
This max-row was in the training data already
This min-row was in the training data already
This max-row was in the training data already
This min-row was in the training data already
This max-row was in the training data already
This min-row was in the training data already
This max-row was in the training d

In [65]:
train.size / 15

179.0