In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

**1. Load the Covertype Data Set**

In [1]:
from sklearn.datasets import fetch_covtype
covtype = fetch_covtype()

In [3]:
df = pd.DataFrame(data=covtype.data, 
                  columns=covtype.feature_names)

In [None]:
df.head()

Unnamed: 0,Elevation,Aspect,Slope,Horizontal_Distance_To_Hydrology,Vertical_Distance_To_Hydrology,Horizontal_Distance_To_Roadways,Hillshade_9am,Hillshade_Noon,Hillshade_3pm,Horizontal_Distance_To_Fire_Points,...,Soil_Type_30,Soil_Type_31,Soil_Type_32,Soil_Type_33,Soil_Type_34,Soil_Type_35,Soil_Type_36,Soil_Type_37,Soil_Type_38,Soil_Type_39
0,2596.0,51.0,3.0,258.0,0.0,510.0,221.0,232.0,148.0,6279.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2590.0,56.0,2.0,212.0,-6.0,390.0,220.0,235.0,151.0,6225.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2804.0,139.0,9.0,268.0,65.0,3180.0,234.0,238.0,135.0,6121.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2785.0,155.0,18.0,242.0,118.0,3090.0,238.0,238.0,122.0,6211.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2595.0,45.0,2.0,153.0,-1.0,391.0,220.0,234.0,150.0,6172.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


**2. Implement a very simple heuristic that will classify the data**

In [12]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(covtype.data, covtype.target, test_size=0.3, random_state=42)

In [13]:
# Train a decision tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train, y_train)

# Evaluate the classifier on the test set
accuracy = decision_tree.score(X_test, y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 93.41%


In [None]:
from dtreeviz.trees import dtreeviz
import matplotlib.pyplot as plt

viz = dtreeviz(decision_tree, X_train, y_train, target_name='class', feature_names=covtype.feature_names, class_names=['1', '2', '3', '4', '5', '6', '7'], orientation='TD')
plt.show()

**3. Use Scikit-learn library to train two simple Machine Learning models**

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest classifier
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
# Evaluate the random forest classifier on the testing set
y_pred_rfc = rfc.predict(X_test)
acc_rfc = accuracy_score(y_test, y_pred_rfc)
print(f"Accuracy of random forest classifier: {acc_rfc:.4f}")

Accuracy of random forest classifier: 0.9516


In [None]:
# Visualize the decision trees in the random forest
for i, tree in enumerate(random_forest.estimators_):
    viz = dtreeviz(tree, X_train, y_train, target_name='class', feature_names=covtype.feature_names, class_names=['1', '2', '3', '4', '5', '6', '7'], orientation='TD')
    viz.save(f'random_forest_tree_{i}.svg')

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
knn_score = knn.score(X_test, y_test)
print(f"k-Nearest Neighbors Accuracy: {knn_score:.2f}")

k-Nearest Neighbors Accuracy: 0.97


In [None]:
#data visualization
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import fetch_covtype
import matplotlib.pyplot as plt
import numpy as np

# Select two features for visualization
feature_1 = 0
feature_2 = 1

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(covtype.data[:, [feature_1, feature_2]], covtype.target, test_size=0.3, random_state=42)

# Train a k-nearest neighbors classifier with k=3
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Plot the decision boundary
x_min, x_max = X_train[:, 0].min() - 1, X_train[:, 0].max() + 1
y_min, y_max = X_train[:, 1].min() - 1, X_train[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1), np.arange(y_min, y_max, 0.1))
Z = knn.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.contourf(xx, yy, Z, alpha=0.4)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, s=20, edgecolor='k')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.xlabel(covtype.feature_names[feature_1])
plt.ylabel(covtype.feature_names[feature_2])
plt.title('KNN decision boundary')
plt.show()

**4. Use TensorFlow library to train a neural network that will classify the data** </p>
○ Create a function that will find a good set of hyperparameters for the NN </p>
○ Plot training curves for the best hyperparameters 

In [None]:
import tensorflow as tf
from sklearn.datasets import fetch_covtype
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

X = covtype.data
y = covtype.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
y_train = tf.one_hot(y_train, depth=7)
y_test = tf.one_hot(y_test, depth=7)


# Scale the data


# Define the neural network model
model = tf.keras.Sequential([
    tf.keras.layers.Dense(256, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(128, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(7, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adagrad',
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model on the testing set
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.6978304982185364


In [None]:
import matplotlib.pyplot as plt

# Train the model and keep track of the training history
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Plot the training and validation accuracy over time
plt.plot(history.history['accuracy'], label='Training accuracy')
plt.plot(history.history['val_accuracy'], label='Validation accuracy')
plt.title('Model accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Plot the training and validation loss over time
plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()