## Forest Cover Type Prediction
### W207 Spring 2020

In [1]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

In [16]:
# Data source: https://www.kaggle.com/c/forest-cover-type-prediction/overview
# Load data

train_path = '../data/train.csv'
unlabeled_path = '../data/test.csv'
train_csv = np.genfromtxt(train_path, delimiter=',', names=True)
unlabeled_csv = np.genfromtxt(unlabeled_path, delimiter=',', names=True)

# Extract feature and label strings
feature_name = unlabeled_csv.dtype.names
label_name = set(train_csv['Cover_Type'])

# Convert from structured array to 2D
unlabeled_data = unlabeled_csv.view((float, len(unlabeled_csv.dtype.names)))
train_csv = train_csv.view((float, len(train_csv.dtype.names)))

In [22]:
# Train, dev, test split (60/20/20)
split1 = int(len(train_csv)* 0.60)
split2 = int(split1 + (len(train_csv) - split1) / 2)

train_data, train_labels = train_csv[:split1,:-1], train_csv[:split1,-1]
dev_data, dev_labels     = train_csv[split1:split2,:-1], train_csv[split1:split2,-1]
test_data, test_labels   = train_csv[split2:,:-1], train_csv[split2:,-1]


print('training label shape:', train_labels.shape)
print('dev label shape:',      dev_labels.shape)
print('test label shape:',     test_labels.shape)
print('labels names:',         label_name)
print('number of features:',   len(feature_name))
print('feature names:',        feature_name)

training label shape: (9072,)
dev label shape: (3024,)
test label shape: (3024,)
labels names: {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}
number of features: 55
feature names: ('Id', 'Elevation', 'Aspect', 'Slope', 'Horizontal_Distance_To_Hydrology', 'Vertical_Distance_To_Hydrology', 'Horizontal_Distance_To_Roadways', 'Hillshade_9am', 'Hillshade_Noon', 'Hillshade_3pm', 'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area1', 'Wilderness_Area2', 'Wilderness_Area3', 'Wilderness_Area4', 'Soil_Type1', 'Soil_Type2', 'Soil_Type3', 'Soil_Type4', 'Soil_Type5', 'Soil_Type6', 'Soil_Type7', 'Soil_Type8', 'Soil_Type9', 'Soil_Type10', 'Soil_Type11', 'Soil_Type12', 'Soil_Type13', 'Soil_Type14', 'Soil_Type15', 'Soil_Type16', 'Soil_Type17', 'Soil_Type18', 'Soil_Type19', 'Soil_Type20', 'Soil_Type21', 'Soil_Type22', 'Soil_Type23', 'Soil_Type24', 'Soil_Type25', 'Soil_Type26', 'Soil_Type27', 'Soil_Type28', 'Soil_Type29', 'Soil_Type30', 'Soil_Type31', 'Soil_Type32', 'Soil_Type33', 'Soil_Type34', 'Soil_Type35', '

In [24]:
# k-NN

# search for an optimal value of K for KNN
k_range = [1,4,7]

# list of scores from k_range
k_scores = []

def KNN(k):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(train_data, train_labels)
    model_pred = model.predict(dev_data)
    return model_pred

for k in k_range:
    score = metrics.f1_score(dev_labels, KNN(k), average="weighted")
    k_scores.append(score)
    print("The f1 score for {}-NN is {}".format(k, score))

The f1 score for 1-NN is 0.25072455691163237
The f1 score for 4-NN is 0.2402678559200141
The f1 score for 7-NN is 0.24583741730652214
