In [1]:
# Hex colour codes used for the plots in this notebook.
RED  = '#C82506'
BLUE = '#0365C0'
GREEN = '#00882B'

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

def clean(ax):
    """Hide the right-hand and top spines of an axes object.

    Parameters
    ----------
    ax : object with a ``spines`` mapping (e.g. a matplotlib Axes)
        The axes to modify in place.

    Returns
    -------
    None
    """
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)


## Classification

In [2]:
# Read the female and male ANSUR II tables (female first, then male) from
# CSV files located next to the notebook.
# NOTE(review): if pandas raises UnicodeDecodeError here, the files may not be
# UTF-8 encoded — confirm and pass an explicit `encoding=` to read_csv.
female, male = (
    pd.read_csv(csv_path)
    for csv_path in ('./ANSUR II FEMALE Public.csv', './ANSUR II MALE Public.csv')
)

In [3]:
# Show the names of all columns (features) of the female table
female.columns.tolist()

['SubjectId',
 'abdominalextensiondepthsitting',
 'acromialheight',
 'acromionradialelength',
 'anklecircumference',
 'axillaheight',
 'balloffootcircumference',
 'balloffootlength',
 'biacromialbreadth',
 'bicepscircumferenceflexed',
 'bicristalbreadth',
 'bideltoidbreadth',
 'bimalleolarbreadth',
 'bitragionchinarc',
 'bitragionsubmandibulararc',
 'bizygomaticbreadth',
 'buttockcircumference',
 'buttockdepth',
 'buttockheight',
 'buttockkneelength',
 'buttockpopliteallength',
 'calfcircumference',
 'cervicaleheight',
 'chestbreadth',
 'chestcircumference',
 'chestdepth',
 'chestheight',
 'crotchheight',
 'crotchlengthomphalion',
 'crotchlengthposterioromphalion',
 'earbreadth',
 'earlength',
 'earprotrusion',
 'elbowrestheight',
 'eyeheightsitting',
 'footbreadthhorizontal',
 'footlength',
 'forearmcenterofgriplength',
 'forearmcircumferenceflexed',
 'forearmforearmbreadth',
 'forearmhandlength',
 'functionalleglength',
 'handbreadth',
 'handcircumference',
 'handlength',
 'headbread

In [None]:
# Display the 'stature' column of the male table
male['stature']

In [None]:
# Keep only the 'Age', 'Heightin' and 'Gender' columns of each table
female_sel = female.loc[:, ['Age', 'Heightin', 'Gender']]
male_sel = male.loc[:, ['Age', 'Heightin', 'Gender']]

In [None]:
# Build a small, human-readable example table: the first 10 male and the
# first 10 female rows, restricted to three columns.

samplem = male[['stature', 'interscyeii', 'Gender']].head(10)
samplef = female[['stature', 'interscyeii', 'Gender']].head(10)

sample = pd.concat([samplem, samplef])
# Scale both measurements by 0.1 and round them to whole numbers for display.
sample['stature'] = (sample['stature'] * 0.1).round().astype(int)
sample['interscyeii'] = (sample['interscyeii'] * 0.1).round().astype(int)
# Lower-case the class label.
sample['Gender'] = sample['Gender'].str.lower()


# Shuffle the rows. The fixed random_state keeps the displayed table the same
# every time the notebook is re-run from a fresh kernel.
sample.sample(frac=1, random_state=0)

In [None]:
# Get a bigger dataset to build a classifier on
# (not too big, so it's visually understandable):
# the first 300 male and the first 300 female rows.

samplem = male[:300][['stature', 'interscyeii', 'Gender']]
samplef = female[:300][['stature', 'interscyeii', 'Gender']]

sample = pd.concat([samplem, samplef])
sample['stature'] = sample['stature'] * 0.1
sample['interscyeii'] = sample['interscyeii'] * 0.1
# Convert the string labels to dtype "category" and then to their integer
# codes (the plotting cell treats code 1 as male and code 0 as female).
sample['Gender'] = sample['Gender'].astype('category').cat.codes

# Shuffle the rows. The fixed random_state makes the train/test split, and
# therefore every figure built from it, reproducible across re-runs.
data = sample.sample(frac=1, random_state=0)

# DataFrame.as_matrix() was removed in pandas 1.0; `.values` yields the same
# 2-D array and also works on older pandas releases.
x = data[['stature', 'interscyeii']].values
x_train = x[:150, :]   # first 150 shuffled rows are used for training
x_test  = x[150:, :]   # the remaining rows are held out

y = data.Gender.values  # integer class codes created above
y_train = y[:150]
y_test = y[150:]

In [None]:
# Scatter plot of the first 150 rows of `data`, one colour per class.
fig, ax = plt.subplots(figsize=(6, 4.5))

tr = data[:150]
trm = tr[tr['Gender'] == 1]
trf = tr[tr['Gender'] == 0]

# marker size and opacity
s = 16
a = 1.0
ax.scatter(trm['stature'].values, trm['interscyeii'].values,
           label='male', s=s, alpha=a, color=BLUE)
ax.scatter(trf['stature'].values, trf['interscyeii'].values,
           label='female', s=s, alpha=a, color=RED)

ax.set_xlabel('height (cm)')
ax.set_ylabel('distance between shoulders (cm)')
ax.legend()

clean(ax)

fig.savefig('data.pdf')  # get a pdf of the image

In [None]:
import matplotlib
from mlxtend.plotting import plot_decision_regions

from sklearn.svm import SVC
import numpy as np
import mlxtend

# A support vector classifier with a linear kernel: despite the fancy name
# it is just a linear classifier.
cls = SVC(kernel='linear').fit(x_train, y_train)

# Draw the decision regions on a fresh figure.
# NOTE(review): `res` may be deprecated in newer mlxtend releases — confirm
# against the installed version.
plt.figure(figsize=(6, 4.5))
ax = plot_decision_regions(
    x_train, y_train,
    clf=cls,
    res=0.02,
    legend=None,
    markers='oo',
    colors=','.join((RED, BLUE)),
)

# Remove the outline of every scatter collection on the axes.
for scatter_layer in ax.findobj(matplotlib.collections.PathCollection):
    scatter_layer.set_linewidth(0)

plt.savefig('linear-boundary.pdf')

In [None]:
from sklearn.tree import DecisionTreeClassifier

# A decision tree classifier with default settings.
tree = DecisionTreeClassifier().fit(x_train, y_train)

# Draw the decision regions on a fresh figure.
# NOTE(review): `res` may be deprecated in newer mlxtend releases — confirm
# against the installed version.
plt.figure(figsize=(6, 4.5))
ax = plot_decision_regions(
    x_train, y_train,
    clf=tree,
    res=0.02,
    legend=None,
    markers='oo',
    colors=','.join((RED, BLUE)),
)

# Remove the outline of every scatter collection on the axes.
for scatter_layer in ax.findobj(matplotlib.collections.PathCollection):
    scatter_layer.set_linewidth(0)

plt.savefig('tree-boundary.pdf')

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# A k-nearest-neighbours classifier that votes over 7 neighbours.
knn = KNeighborsClassifier(n_neighbors=7).fit(x_train, y_train)

# Draw the decision regions on a fresh figure.
# NOTE(review): `res` may be deprecated in newer mlxtend releases — confirm
# against the installed version.
plt.figure(figsize=(6, 4.5))
ax = plot_decision_regions(
    x_train, y_train,
    clf=knn,
    res=0.02,
    legend=None,
    markers='oo',
    colors=','.join((RED, BLUE)),
)

# Remove the outline of every scatter collection on the axes.
for scatter_layer in ax.findobj(matplotlib.collections.PathCollection):
    scatter_layer.set_linewidth(0)

plt.savefig('knn-boundary.pdf')

## Regression

In [None]:
# Build a small example table from the first 20 male rows.

# .copy() makes `sample` an independent frame, so the column assignments
# below do not trigger pandas' SettingWithCopyWarning (chained indexing).
sample = male[:20][['stature', 'trochanterionheight']].copy()

# Scale both measurements by 0.1 and round them to whole numbers for display.
sample['stature'] = (sample['stature'] * 0.1).round().astype(int)
sample['trochanterionheight'] = (sample['trochanterionheight'] * 0.1).round().astype(int)

# Shuffle the rows. The fixed random_state keeps the displayed table the same
# every time the notebook is re-run from a fresh kernel.
sample.sample(frac=1, random_state=0)

In [None]:
# Bigger example data for the regression models: the first 300 male rows.

# .copy() makes `rdata` an independent frame, so the column assignments below
# do not trigger pandas' SettingWithCopyWarning (chained indexing).
rdata = male[:300][['stature', 'trochanterionheight']].copy()

rdata['stature'] = rdata['stature'] * 0.1
rdata['trochanterionheight'] = rdata['trochanterionheight'] * 0.1

# Shuffle the rows. The fixed random_state makes the train/test split, and
# therefore every figure built from it, reproducible across re-runs.
rdata = rdata.sample(frac=1, random_state=0)

# First 150 shuffled rows for training, the rest for testing.
# reshape(-1, 1) turns the 1-D stature values into a single-column 2-D array.
rx_train = rdata[:150].stature.values.reshape(-1, 1)
rx_test = rdata[150:].stature.values.reshape(-1, 1)

ry_train = rdata[:150].trochanterionheight.values
ry_test = rdata[150:].trochanterionheight.values

# Scatter plot of the training rows.
plt.figure(figsize=(6, 4.5))

# marker size and opacity
s = 16
a = 0.8
plt.scatter(rx_train, ry_train, s=s, alpha=a, color='k', linewidth=0)

plt.xlabel('height (cm)')
plt.ylabel('leg height (cm)')

clean(plt.gca())

plt.savefig('regression-data.pdf')

In [None]:
# Marker opacity used by the scatter plots of the regression model cells
alpha=0.5

In [None]:
# Fit a linear model
from sklearn.linear_model import LinearRegression

# Fit an ordinary linear regression on the training rows
linear = LinearRegression().fit(rx_train, ry_train)

# Draw the training points and the fitted model on one axes
fig, ax = plt.subplots(figsize=(6, 4.5))
ax.scatter(rx_train, ry_train, s=s, alpha=alpha, color='k', linewidth=0)

# 500 equally spaced inputs between 150 and 195, as a single-column array
xlin = np.linspace(150, 195, num=500).reshape(-1, 1)
ax.plot(xlin, linear.predict(xlin), color=BLUE)

clean(ax)

fig.savefig('linear-regression.pdf')

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree regressor with default settings on the training rows
tree = DecisionTreeRegressor().fit(rx_train, ry_train)

# Draw the training points and the fitted model on one axes
fig, ax = plt.subplots(figsize=(6, 4.5))
ax.scatter(rx_train, ry_train, s=s, alpha=alpha, color='k', linewidth=0)

# 500 equally spaced inputs between 150 and 195, as a single-column array
xlin = np.linspace(150, 195, num=500).reshape(-1, 1)
ax.plot(xlin, tree.predict(xlin), color=RED)

clean(ax)

fig.savefig('tree-regression.pdf')

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a k-nearest-neighbours regressor (7 neighbours) on the training rows
knn = KNeighborsRegressor(n_neighbors=7).fit(rx_train, ry_train)

# Draw the training points and the fitted model on one axes
fig, ax = plt.subplots(figsize=(6, 4.5))
ax.scatter(rx_train, ry_train, s=s, alpha=alpha, color='k', linewidth=0)

# 500 equally spaced inputs between 150 and 195, as a single-column array
xlin = np.linspace(150, 195, num=500).reshape(-1, 1)
ax.plot(xlin, knn.predict(xlin), color=GREEN)

clean(ax)

fig.savefig('knn-regression.pdf')
