# Classify plant species using SVM with PCA dimensionality reduction

In [3]:
#Script to read neon plant data and classify plant species using an SVM 

from sklearn.svm import SVC
import pandas as pd
import re
import numpy as np
from sklearn import decomposition
# read file
fileName = "../derived-data/plants_merged.csv"
df = pd.read_csv(fileName, skiprows = 0)

In [4]:
#Extract taxonid column and assign an integer to each unique species this becomes y vector
def getTarget(df):
	species = df['taxonid']
	labels, levels = pd.factorize(species)
	y = labels
	return y

#Extract all columns to be used as features in the svm
def getFeatures(df):
	ident = 'nm_'
	names = ['chm_height']
	for column in df:
		if re.match(ident, column):
			names.append(column)
	X = df[names]
	X = X.as_matrix()
	return X

y = getTarget(df)
X = getFeatures(df)

#Normalize features
def featureNorm(X):
	X_shape = X.shape
	X_norm = X
	mu = np.mean(X, axis=1)
	sigma = np.std(X, axis = 1)
	for i in range(X_shape[1]):
		X_norm[:,i] = (X[:,i] - mu[i])/sigma[i]
	return X_norm

def reduceDims(X):
	variance = 0
	n_components = 2
	while(variance <= .99):
		pca = decomposition.PCA(n_components=n_components)
		pca.fit(X)
		variance = pca.explained_variance_ratio_.cumsum()[-1]
		print('Variance retained at', n_components, 'dimensions: ', variance)
		n_components = n_components + 1
	X = pca.transform(X)
	return X

#Randomize row order
def randomizeVals(X, y):
	X_shape = X.shape
	arr = np.column_stack((X, y))
	np.random.shuffle(arr)
	X = arr[:,0:X_shape[1]]
	y = arr[:,-1]
	return(X, y)

#Seperate data into a training set, cross validation set and test set
def getSets(X, y):
	train_stop = round(.6 * y.size)
	cv_stop = y.size - round(.2 * y.size)
	X_train = X[0:train_stop, :]
	y_train = y[0:train_stop]
	X_cv = X[train_stop + 1:cv_stop, :]
	y_cv = y[train_stop + 1:cv_stop]
	X_test = X[cv_stop + 1:-1, :]
	y_test = y[cv_stop + 1:-1]
	return(X_train, X_test, X_cv, y_train, y_test, y_cv)

X = featureNorm(X)
X = reduceDims(X)
(X, y) = randomizeVals(X, y)
(X_train, X_test, X_cv, y_train, y_test, y_cv) = getSets(X, y)

Variance retained at 2 dimensions:  0.898208384178
Variance retained at 3 dimensions:  0.952364005642
Variance retained at 4 dimensions:  0.982838937363
Variance retained at 5 dimensions:  0.992810579491


In [5]:
#optemize constant parameters of cost function on the cross validation set
def findParams(X_train, y_train, X_cv, y_cv):
	accuracy = 0
	params = np.array([.01, .03, .1, .3, 1, 3, 10, 30])
	for i in range(params.size):
		for j in range(params.size):
			C = params[i]
			gamma = params[j]
			clf = SVC(C=C, gamma=gamma, decision_function_shape='ovr')
			clf.fit(X_train, y_train)
			temp_acc = clf.score(X_test, y_test)
			if temp_acc > accuracy:
				clf_ideal = clf
				C_ideal = C
				gamma_ideal = gamma
				accuracy = temp_acc
	print("C:", C_ideal, "gamma:", gamma_ideal)
	return clf_ideal

#check accuracy
clf_ideal = findParams(X_train, y_train, X_cv, y_cv)
print("Test Accuracy:", clf_ideal.score(X_test, y_test))

C: 10.0 gamma: 0.01
Test Accuracy: 0.642857142857
