In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import tree, ensemble
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from IPython.display import Image
from sklearn.decomposition import PCA

import graphviz
import pydotplus
import io
from scipy import misc

import time

%matplotlib inline

In [122]:
df = pd.read_csv('Abalone.txt',
                    header=0,
                    names=['gender', 'length', 'diameter', 'height', 'wholeWeight', 'shuckedWeight', 'visceraWeight', 'shellWeight', 'rings'],
                    )

In [123]:
df.head()

Unnamed: 0,gender,length,diameter,height,wholeWeight,shuckedWeight,visceraWeight,shellWeight,rings
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
3,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
4,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8


This dataset is used for predicting the age of abalone from physical measurements. The age of abalone is determined by cutting the shell through the cone, staining it, and counting the number of rings through a microscope -- a boring and time-consuming task.  Other measurements, which are easier to obtain, are used to predict the age.  Further information, such as weather patterns and location (hence food availability) may be required to solve the problem.

From the original data examples with missing values were removed (the majority having the predicted value missing), and the ranges of the continuous values have been scaled for use with an ANN (by dividing by 200).

Data comes from an original (non-machine-learning) study:

Warwick J Nash, Tracy L Sellers, Simon R Talbot, Andrew J Cawthorn and Wes B Ford (1994) "The Population Biology of Abalone (_Haliotis_	species) in Tasmania. I. Blacklip Abalone (_H. rubra_) from the North	Coast and Islands of Bass Strait", Sea Fisheries Division, Technical Report No. 48 (ISSN 1034-3288)

## Columns

	Name		Data Type	Meas.	Description
	----		---------	-----	-----------
	Sex		nominal			M, F, and I (infant)
	Length		continuous	mm	Longest shell measurement
	Diameter	continuous	mm	perpendicular to length
	Height		continuous	mm	with meat in shell
	Whole weight	continuous	grams	whole abalone
	Shucked weight	continuous	grams	weight of meat
	Viscera weight	continuous	grams	gut weight (after bleeding)
	Shell weight	continuous	grams	after being dried
	Rings		integer			+1.5 gives the age in years

In [124]:
df.describe()

Unnamed: 0,length,diameter,height,wholeWeight,shuckedWeight,visceraWeight,shellWeight,rings
count,4176.0,4176.0,4176.0,4176.0,4176.0,4176.0,4176.0,4176.0
mean,0.524009,0.407892,0.139527,0.828818,0.3594,0.180613,0.238852,9.932471
std,0.120103,0.09925,0.041826,0.490424,0.22198,0.10962,0.139213,3.223601
min,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,0.45,0.35,0.115,0.4415,0.186,0.093375,0.13,8.0
50%,0.545,0.425,0.14,0.79975,0.336,0.171,0.234,9.0
75%,0.615,0.48,0.165,1.15325,0.502,0.253,0.329,11.0
max,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [125]:
df.gender.value_counts()

M    1527
I    1342
F    1307
Name: gender, dtype: int64

In [126]:
dfmf = df[df.gender != 'I']

In [127]:
dfmf.head()

Unnamed: 0,gender,length,diameter,height,wholeWeight,shuckedWeight,visceraWeight,shellWeight,rings
0,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
1,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
2,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
5,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
6,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16


In [128]:
X = dfmf.drop('gender', 1)
y = dfmf['gender']

In [129]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25, random_state=100)
print("Training size: {}: Test Size: {}".format(len(train), len(test)))

Training size: 3549: Test Size: 627


In [130]:
print(time.asctime())

decision_tree = tree.DecisionTreeClassifier(
    criterion='entropy',
    max_features=4,
    max_depth=6,
)
decision_tree.fit(X, y)

# Render our tree.
dot_data = tree.export_graphviz(
    decision_tree, out_file=None,
    feature_names=X.columns,
#     class_names=['Not Returning', 'Returning'],
    filled=True
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

print(time.asctime())

Fri Jun  1 18:36:33 2018
Fri Jun  1 18:36:34 2018


In [131]:
decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=6,
            max_features=4, max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [132]:
y_pred = decision_tree.predict(X_test)

In [133]:
accuracy_score(y_test, y_pred)

0.5331452750352609

I ran several combinations

|features|depth|score|
|----------|-------|-------|
|1|4|54.72%|
|1|5|53.31%|
|1|6|53.32%|
|2|4|54.44%|
|2|5|53.03%|
|2|6|53.03%|
|3|4|52.32%|
|3|5|53.03%|
|3|6|54.02%|
|4|4|52.61%|
|4|5|53.46%|
|4|6|52.61%|

Clearly, there is no discernible difference in predicting the gender of the mollusk depending on the number of features used (from 1 to 4) or depth of the tree (from 4 to 6). With only two options, male and female, these results are clearly no better than a coin toss! Let's try a random forest.

Oh, and the tree took 1 second to run. 

In [134]:
print(time.asctime())

rfc = ensemble.RandomForestClassifier()
# X = y2015.drop('loan_status', 1)
# Y = y2015['loan_status']
# X = pd.get_dummies(X)
# X = X.dropna(axis=1)

cross_val_score(rfc, X, y, cv=10)

Fri Jun  1 18:36:34 2018


array([0.45422535, 0.5528169 , 0.52112676, 0.47887324, 0.49295775,
       0.58802817, 0.52464789, 0.4822695 , 0.52836879, 0.53900709])

In [135]:
print(time.asctime())

Fri Jun  1 18:36:35 2018


So let's take a moment to recognize the impressive worthlessness of this dataset. The Random Forest produced results that are no better than the several iterations of the decision tree. We are functionally no better at predicting abalone gender than a coin toss. And here's why. The dataset was originally intended to determine the AGE of the abalones using the gender, size, and shape with the number of rings grown as the indicator of age. However, the "rings" values are integers, not categorical. And random forest and decision trees are meant to predict categorical data. Sure, I could create a threshold to say "older than X" and "younger than X" and format the dataset to predict that. But it's really no more informative a model. So here we have a case of a dataset that was never meant to do what I tried to make it do. 

Also, the random forest was processed in 1 second as well. I'm sure that a much larger dataset would take well longer.

But the code works and it proves the point. Still I think it's necessary to find another dataset, one that allows me to actually predict something. I'm submitting this Notebook. But I DO NOT consider this assignment complete. 