In [1]:
from sklearn import datasets

# load the iris dataset and print out the keys from the 
# corresponding dictionary
iris = datasets.load_iris()

for key in iris.keys():
    print(key)
    
#print(iris['data'])
#print(iris['target'])

data
target
target_names
DESCR
feature_names
filename


In [2]:
from sklearn.tree import DecisionTreeClassifier

# Create a decision tree classifier and train it on all data
model = DecisionTreeClassifier()
model.fit(iris['data'], iris['target'])
print(model)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')


In [3]:
from sklearn import metrics

actual = iris['target']
predicted = model.predict(iris['data'])

# Create a classification report and a confusion matrix

print(metrics.classification_report(actual, predicted))
print(metrics.confusion_matrix(actual, predicted))

# Why is our output perfect?  Because we trained on everything! It
# knows all of the answers already

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        50
           2       1.00      1.00      1.00        50

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150

[[50  0  0]
 [ 0 50  0]
 [ 0  0 50]]


In [4]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Throwing the input into a DataFrame for fun
irisDF = pd.DataFrame(iris['data'], columns=iris['feature_names'])
#print(irisDF)

# train_test_split divides our data into training and testing groups
(x_train, x_test, y_train, y_test) = train_test_split(irisDF, iris['target'], test_size=0.2)

print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(120, 4) (30, 4)
(120,) (30,)


In [5]:
model.fit(x_train, y_train)
actual = y_test
predicted = model.predict(x_test)

# Now we have other model results based off of real predictions, but
# there's still a risk that there is bias in the data
print(metrics.classification_report(actual, predicted))
print(metrics.confusion_matrix(actual, predicted))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       0.80      0.89      0.84         9
           2       0.89      0.80      0.84        10

    accuracy                           0.90        30
   macro avg       0.90      0.90      0.89        30
weighted avg       0.90      0.90      0.90        30

[[11  0  0]
 [ 0  8  1]
 [ 0  2  8]]


In [6]:
from sklearn.model_selection import cross_val_score, cross_val_predict

# cross-validation lets us use every item as a testing item by
# running multiple passes through the data
scores = cross_val_score(model, irisDF, iris['target'], cv=6)
print(scores)

sum = 0
for score in scores:
    sum += score
print(sum/6)

# we can also get the corresponding predictions from the model output
predicted = cross_val_predict(model, irisDF, iris['target'], cv=5)
actual = iris['target']

print(metrics.classification_report(actual, predicted))
print(metrics.confusion_matrix(actual, predicted))

[0.96 1.   0.92 0.92 0.88 1.  ]
0.9466666666666667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.94      0.94      0.94        50
           2       0.94      0.94      0.94        50

    accuracy                           0.96       150
   macro avg       0.96      0.96      0.96       150
weighted avg       0.96      0.96      0.96       150

[[50  0  0]
 [ 0 47  3]
 [ 0  3 47]]


In [7]:
from sklearn.cluster import KMeans

# loading and fitting k-means
model = KMeans(n_clusters=3)
model.fit(irisDF)

# dictionary counting pattern to figure out how many flowers are
# in each cluster
freq = {}
for label in model.labels_:
    if label in freq.keys():
        freq[label] += 1
    else:
        freq[label] = 1
        
for key in freq.keys():
    print(key, freq[key])

1 50
0 62
2 38


In [8]:
import tensorflow as tf

# Load and prepare the MNIST dataset.  Convert the samples from
# integers to floating-point [0,1] values
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

x_train, x_test = x_train / 255.0, x_test / 255.0

(60000, 28, 28) (10000, 28, 28)
(60000,) (10000,)


In [9]:
# Build a Sequential model by stacking layers

model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28,28)),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10)
])


# Run the first row on the untrained(!) model.  The output is a
# "logit" or "log-odd" score, one for each class.  The softmax
# function commented out converts that into a probability
predictions = model(x_train[:1]).numpy()
#predictions = tf.nn.softmax(predictions)
predictions



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



array([[-0.5671199 ,  0.18190444,  0.23910058, -0.43286905,  0.22094427,
        -0.01396617, -0.00165793,  0.14976072,  0.79003084,  0.5319262 ]],
      dtype=float32)

In [10]:
# We need to create a loss function to penalize the neural network
# training when it gets things wrong

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss_fn(y_train[:1], predictions).numpy()

2.497569

In [11]:
# And now we're actually training our neural network!

model.compile(optimizer="adam", loss=loss_fn, metrics=['accuracy'])
model.fit(x_train, y_train, epochs=5)

Train on 60000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1dec44dcc08>

In [12]:
# Here's the evaluation on the 10000 testing rows

model.evaluate(x_test, y_test, verbose=2)

10000/10000 - 3s - loss: 0.0735 - accuracy: 0.9776


[0.0734720635201782, 0.9776]

In [13]:
# Here, we wrap a Softmax layer into the Sequential model and 
# display the probabilities associated with each of the 10 digit
# possibilities for the first 5 rows

probability_model = tf.keras.Sequential([
    model,
    tf.keras.layers.Softmax()
])

probability_model(x_test[:5])

<tf.Tensor: shape=(5, 10), dtype=float32, numpy=
array([[1.4860373e-07, 4.2159733e-09, 1.9513641e-06, 1.5949721e-04,
        4.7938536e-13, 1.4558890e-07, 1.4161849e-13, 9.9983633e-01,
        4.0216290e-07, 1.5880281e-06],
       [1.0570799e-09, 2.6996585e-03, 9.9721485e-01, 7.8138182e-05,
        7.4671535e-13, 4.5263660e-06, 3.4815821e-07, 7.0389698e-14,
        2.4514325e-06, 6.5205398e-12],
       [2.4273345e-07, 9.9876130e-01, 1.8059496e-04, 1.5558811e-05,
        5.9933529e-05, 2.2573875e-06, 6.9919970e-06, 8.9649140e-04,
        7.5418488e-05, 1.1398979e-06],
       [9.9991524e-01, 1.6450050e-10, 2.5753299e-05, 8.7456719e-08,
        1.4571569e-07, 3.2265541e-07, 2.1557994e-06, 4.6026148e-06,
        1.7943182e-06, 4.9858842e-05],
       [1.3199647e-06, 1.0714654e-09, 2.1574242e-06, 1.4878186e-08,
        9.9883062e-01, 9.2542173e-08, 5.6267220e-07, 1.2629027e-04,
        4.5192987e-08, 1.0389428e-03]], dtype=float32)>