# Select the Best Classifier for Place Names labeling

Import the required modules.

In [1]:
from pandas import read_csv
import pydotplus
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn import linear_model

from _search import ParamSearch

Helper functions needed for the classifiers

In [2]:
# Read feature vectors from a CSV file into plain Python lists
def read_vectors(filename):
	"""Load a CSV table of feature vectors and split it into features and labels.

	Every column except ``is_target`` is treated as a feature; ``is_target``
	holds the label of each row.

	Args:
		filename: Path (or any other source accepted by ``pandas.read_csv``)
			of the CSV table. The table must contain an ``is_target`` column.

	Returns:
		A dict with three entries:
			'data'    -- list of rows, each a list of feature values ordered
			             like 'feature' (plain Python scalars; when the feature
			             columns mix int and float dtypes the values come back
			             as floats),
			'target'  -- list with the ``is_target`` value of every row,
			'feature' -- list of the feature column names.

	Raises:
		ValueError: if the table has no ``is_target`` column.
	"""
	# Malformed rows are skipped instead of aborting the load. The keyword for
	# this changed: ``error_bad_lines`` was removed in pandas 2.0 and its
	# replacement ``on_bad_lines`` only exists since pandas 1.3, so try the
	# current spelling first and fall back to the legacy one.
	try:
		df = read_csv(filename, on_bad_lines='skip')
	except TypeError:
		df = read_csv(filename, error_bad_lines=False)

	feature_keys = list(df.keys())
	# test soundex
	# feature_keys.remove('soundex')
	feature_keys.remove('is_target')

	# One positional, vectorised conversion instead of a per-cell df[key][i]
	# lookup (which was slow and relied on the index labels being 0..n-1).
	data = df[feature_keys].values.tolist()
	target = list(df['is_target'].values)
	return {'data':data, 'target':target, 'feature':feature_keys}


In [3]:
# Performs model training
def train_model(dataset):
	"""Fit a default-parameter ``svm.SVC`` on a dataset and return it.

	Args:
		dataset: Mapping with a 'data' entry (feature rows) and a 'target'
			entry (one label per row), e.g. the dict built by ``read_vectors``.

	Returns:
		The fitted ``svm.SVC`` instance. (The classifier used to be dropped
		when the function returned, which made the training unusable.)
	"""
	clf = svm.SVC()
	clf.fit(dataset['data'], dataset['target'])
	return clf

In [4]:
# perform model selection (new version)
def model_select(param_grid, X, y, numFolds):
	"""Run a parameter search for every classifier listed in ``param_grid``.

	Args:
		param_grid: Dict mixing search options and classifier grids.
			The option keys 'verbose', 'warnings', 'processes' and
			'batch_size' are read with the defaults False, True, True and
			True respectively and forwarded to ``ParamSearch``; their exact
			meaning is defined by the project-local ``_search`` module.
			Every remaining key is an expression naming a classifier class
			(e.g. 'svm.SVC') and maps to the grid searched for that class.
		X: Feature rows.
		y: Labels, one per row of ``X``.
		numFolds: Number of folds of the shuffled ``StratifiedKFold`` used
			for cross validation.

	Returns:
		Dict mapping each classifier key to the ``best_param`` attribute of
		its finished ``ParamSearch``.
	"""
	# Work on a shallow copy: popping the option keys from the caller's dict
	# made a second call with the same grid silently fall back to the defaults.
	param_grid = dict(param_grid)
	verbose = param_grid.pop('verbose', False)
	warnings = param_grid.pop('warnings', True)
	processes = param_grid.pop('processes', True)
	batch_size = param_grid.pop('batch_size', True)

	mycv = StratifiedKFold(n_splits=numFolds, shuffle=True)
	best_params = {}
	for key in sorted(param_grid):
		print('-------------------------------------------------')
		print('{} classifier parameter searching'.format(key))
		print()
		# SECURITY: eval() executes the key as Python code to obtain the
		# classifier class. Only pass grids whose keys are trusted, hard-coded
		# names (such as those produced by param_grid_gen).
		clf = ParamSearch(eval(key)(), param_grid[key], cv=mycv,
							warnings=warnings, verbose=verbose, batch_size=batch_size)
		clf.grid_search(X, y, processes=processes)
		best_params[key] = clf.best_param
	return best_params

In [5]:
def param_grid_gen(tune_one_clf=False):
	"""Build the option/grid dictionary consumed by ``model_select``.

	Args:
		tune_one_clf: When False (default) the result holds grids for the SVM,
			random forest, decision tree and logistic regression classifiers.
			When True only the random forest grid is included.

	Returns:
		Dict with the search options ('verbose', 'warnings', 'processes',
		'batch_size') followed by one entry per classifier. Classifier keys
		are the names of the classes as importable in this notebook.
	"""
	# NOTE: 'min_impurity_split' was deprecated in scikit-learn 0.19 and removed
	# in 1.0; the grids using it only run on older releases.

	# Search options first, so the key order matches the classifier entries added below.
	param_grid = {
		'verbose': True,
		'warnings': False,
		'processes': 8,
		'batch_size': 30,
	}

	# Random Forest Classifier -- shared by both modes.
	forest_grid = [
		{'n_estimators': [10, 30, 50],
		 'criterion': ['gini', 'entropy'],
		 'max_features': [0.2, 0.5, 'sqrt', 'log2', None]},
		{'max_depth': [5, 10, None],
		 'min_samples_split': [2, 5, 10, 20],
		 'min_samples_leaf': [2, 5, 10, 20]},
		{'n_estimators': [10, 30, 50],
		 'min_weight_fraction_leaf': [0, 0.01],
		 'min_impurity_split': [1e-7, 1e-5, 1e-3]},
		{'n_estimators': [10, 30, 50],
		 'bootstrap': [True, False],
		 'class_weight': ['balanced', None]},
	]

	if tune_one_clf:
		param_grid['RandomForestClassifier'] = forest_grid
		return param_grid

	# SVM Classifier
	svc_grid = {
		'kernel': ['rbf'],
		'gamma': [1e-5, 1e-3, 1],
		'C': [0.01, 0.1, 1, 10],
	}

	# Decision Tree Classifier
	tree_grid = [
		{'criterion': ['gini', 'entropy'],
		 'splitter': ['best', 'random'],
		 'max_features': [0.2, 0.5, 'sqrt', 'log2', None]},
		{'max_depth': [5, 10, None],
		 'min_samples_split': [2, 5, 10, 20],
		 'min_samples_leaf': [2, 5, 10, 20]},
		{'min_weight_fraction_leaf': [0, 0.01],
		 'class_weight': ['balanced', None],
		 'min_impurity_split': [1e-7, 1e-5, 1e-3]},
	]

	# Logistic Regression Classifier
	logistic_grid = [
		{'penalty': ['l1', 'l2'],
		 'C': [1e-3, 1e-2, 1, 10],
		 'solver': ['liblinear'],
		 'class_weight': ['balanced', None]},
		{'C': [1e-3, 1e-2, 1, 10],
		 'class_weight': ['balanced', None],
		 'solver': ['newton-cg', 'lbfgs', 'sag']},
	]

	param_grid['svm.SVC'] = svc_grid
	param_grid['RandomForestClassifier'] = forest_grid
	param_grid['tree.DecisionTreeClassifier'] = tree_grid
	param_grid['linear_model.LogisticRegression'] = logistic_grid
	return param_grid

In [6]:
def print_stat(clf, X, y, split_ratio=0.2, feature_names=None, printTree=False):
	"""Fit ``clf`` on a stratified hold-out split and print its performance.

	Args:
		clf: Unfitted classifier exposing ``fit`` and ``predict``.
		X: Feature rows.
		y: Labels, one per row of ``X``.
		split_ratio: Fraction of the samples held out for testing.
		feature_names: Feature names passed to the graphviz export; only used
			when ``printTree`` is true.
		printTree: When true, export the fitted model with
			``tree.export_graphviz`` and write it to "tree.pdf" (only
			meaningful for a decision tree).
	"""
	# The seed is fixed so every call evaluates on the same split.
	train_X, test_X, train_y, test_y = train_test_split(
		X, y, test_size=split_ratio, random_state=10, stratify=y)
	clf.fit(train_X, train_y)

	# How well the model reproduces the data it was trained on.
	train_pred = clf.predict(train_X)
	print('Training Set Performance')
	print(confusion_matrix(train_y, train_pred))

	# Performance on the held-out samples.
	test_pred = clf.predict(test_X)
	print("Testing Set Performance")
	print(confusion_matrix(test_y, test_pred))
	print('Classification report on testing set')
	print('precision score: {}'.format(precision_score(test_y, test_pred)))
	print('Scores:\n{}'.format(classification_report(test_y, test_pred)))

	if not printTree:
		return
	## only for decision tree
	dot_source = tree.export_graphviz(clf, out_file=None, feature_names=feature_names)
	pydotplus.graph_from_dot_data(dot_source).write_pdf("tree.pdf")

In [7]:
def linear_regression(X, y, folds=5, threshold=0.5, random_state=None):
	"""Cross-validate a thresholded linear regression used as a binary classifier.

	For every fold a ``LinearRegression`` is fitted on the training part, its
	predictions on the test part are turned into 0/1 labels with ``threshold``,
	and the confusion matrix is printed. Mean and standard deviation of
	precision, recall and F1 over the folds are printed at the end.

	Args:
		X: Feature rows (converted with ``np.array``).
		y: Binary labels, one per row of ``X``.
		folds: Number of stratified folds.
		threshold: Predictions >= threshold become class 1, all others class 0.
		random_state: Seed for the fold shuffling; leave None for a different
			split on every call (the previous behaviour).
	"""
	print("linear regression:")
	reg = linear_model.LinearRegression()
	skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)
	precision = []
	recall = []
	f1 = []
	X = np.array(X)
	y = np.array(y)
	for train, test in skf.split(X, y):
		X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]
		reg.fit(X_train,y_train)
		# Binarise in one step against the raw predictions. The old two-step
		# in-place version first wrote 1 and then re-tested those 1s against
		# the threshold, so any threshold > 1 turned every prediction into 0.
		y_pred = np.where(reg.predict(X_test) >= threshold, 1.0, 0.0)
		precision.append(precision_score(y_test, y_pred))
		recall.append(recall_score(y_test,y_pred))
		f1.append(f1_score(y_test,y_pred))
		print(confusion_matrix(y_test, y_pred))
	print("{} fold cross validation score:".format(folds))
	print("precision:{:6.3f} +/-{:6.3f} recall:{:6.3f} +/-{:6.3f}".format(np.mean(precision), np.std(precision), np.mean(recall), np.std(recall)))
	print("f1:{:6.3f} +/-{:6.3f}".format(np.mean(f1), np.std(f1)))

In [8]:
def test_model(X_test, y_test, clf):
	"""Print the confusion matrix and classification report of a fitted model.

	Args:
		X_test: Feature rows to predict.
		y_test: True labels, one per row of ``X_test``.
		clf: Already fitted classifier exposing ``predict``.
	"""
	print('Testing trained model of testing set:')
	predicted = clf.predict(X_test)
	print("Testing Set Performance")
	print(confusion_matrix(y_test, predicted))
	print('Classification report on testing set')
	report = classification_report(y_test, predicted)
	print('Scores:\n{}'.format(report))

## Import Development Set Data

In [9]:
# Load the development set; X holds the feature rows and y the matching labels.
filename = '../data/devl_set_feature_vectors.txt'
dataSet = read_vectors(filename)
X = dataSet['data']
y = dataSet['target']

### Print the decision tree out

In [10]:
# Fit an untuned decision tree on the default 80/20 split of the development set,
# print its scores and export the fitted tree to tree.pdf.
print_stat(tree.DecisionTreeClassifier(), X, y, feature_names=dataSet['feature'],printTree=True)

Training Set Performance
[[2593    0]
 [   0 1297]]
Testing Set Performance
[[603  46]
 [ 28 296]]
Classification report on testing set
precision score: 0.865497076023
Scores:
             precision    recall  f1-score   support

        0.0       0.96      0.93      0.94       649
        1.0       0.87      0.91      0.89       324

avg / total       0.93      0.92      0.92       973



### Linear Regression

In [11]:
# Baseline: 5-fold cross-validated linear regression, cut at 0.5 to obtain class labels.
linear_regression(X, y, folds=5, threshold=0.5)

linear regression:
[[593  56]
 [ 13 312]]
[[583  66]
 [ 13 311]]
[[588  60]
 [  7 317]]
[[570  78]
 [ 13 311]]
[[590  58]
 [ 16 308]]
5 fold cross validation score:
precision: 0.831 +/- 0.017 recall: 0.962 +/- 0.009
f1: 0.891 +/- 0.011




## Model Selection
* Four classifiers are tested with a series of parameters
* The parameters are generated using param_grid_gen

In [12]:
# tune_one_clf=True restricts the search to the random forest grid; 5-fold CV.
params = param_grid_gen(tune_one_clf=True)
best_params = model_select(params, X, y, numFolds=5)
# Report the winning parameter set of every searched classifier, in name order.
for key, best in sorted(best_params.items()):
    print('-------------------------------------------------')
    print('{} classifier'.format(key))
    print()
    print(best)

-------------------------------------------------
RandomForestClassifier classifier parameter searching
()
Precision  0.907 +/- 0.024  Recall  0.946 +/- 0.022 for 
{'max_features': 0.2, 'n_estimators': 10, 'criterion': 'gini'}
Precision  0.900 +/- 0.029  Recall  0.947 +/- 0.028 for 
{'min_samples_split': 2, 'max_depth': 5, 'min_samples_leaf': 2}

Precision  0.904 +/- 0.024  Recall  0.922 +/- 0.029 for 
{'min_samples_split': 5, 'max_depth': 5, 'min_samples_leaf': 2}
Precision  0.903 +/- 0.012  Recall  0.961 +/- 0.013 for 
{'max_features': 0.2, 'n_estimators': 30, 'criterion': 'gini'}Precision  0.903 +/- 0.018  Recall  0.936 +/- 0.022 for 
{'min_samples_split': 10, 'max_depth': 5, 'min_samples_leaf': 2}

Precision  0.916 +/- 0.012  Recall  0.891 +/- 0.071 for 
{'min_samples_split': 20, 'max_depth': 5, 'min_samples_leaf': 2}
Precision  0.908 +/- 0.027  Recall  0.931 +/- 0.030 for 
{'min_samples_split': 10, 'max_depth': 10, 'min_samples_leaf': 20}
Precision  0.920 +/- 0.009  Recall  0.864 

## Here we test different parameters of Random Forest
We split development set into P set and Q set: P set for training and Q set for testing

In [13]:
# Stratified 50/50 split of the development set into P (train) and Q (test);
# the fixed random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=3, stratify=y)

Because Random Forest gives different results on every run, the scores are averaged over 20 runs.

In [220]:
# Train the tuned random forest 20 times on P and score each model on Q.
precision = []
recall = []
f1 = []
clf_matrix=[]
for i in range(20):
    # Seed every forest with its run index: the 20 runs still differ from each
    # other, but re-running the cell now reproduces the same averages.
    clf=RandomForestClassifier(n_estimators=30,min_samples_split=5,max_depth=5,max_features='log2',
                              min_samples_leaf=5,random_state=i)
    clf.fit(X_train, y_train)
    y_true, y_predict = y_test, clf.predict(X_test)
    precision.append(precision_score(y_true, y_predict))
    recall.append(recall_score(y_true, y_predict))
    f1.append(f1_score(y_true,y_predict))
    clf_matrix.append(confusion_matrix(y_true,y_predict))
# Mean and spread of the scores over the 20 runs.
print('precision: {:6.3f} (+/-{:6.3f})'.format(np.mean(precision),np.std(precision)))
print('recall: {:6.3f} (+/-{:6.3f})'.format(np.mean(recall),np.std(recall)))
print('f1: {:6.3f} (+/-{:6.3f})'.format(np.mean(f1),np.std(f1)))

precision:  0.920 (+/- 0.013)
recall:  0.908 (+/- 0.041)
f1:  0.913 (+/- 0.016)


In [224]:
# Element-wise mean and standard deviation of the confusion matrices collected in clf_matrix.
print(np.mean(clf_matrix,axis=0))
print(np.std(clf_matrix,axis=0))

[[ 1556.25    64.75]
 [   74.45   736.55]]
[[ 13.63405662  13.63405662]
 [ 33.0673177   33.0673177 ]]


After parameter tuning, we need to re-verify the parameters by cross validation

In [240]:
print('Reverify the parameters by CV')
# Seed both the splitter and the forest. An unseeded shuffling splitter yields
# new folds on every cross_val_score call, so precision, recall and f1 were
# measured on three different partitions (and three different forests) and
# could not be compared with each other.
my_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
clf=RandomForestClassifier(n_estimators=30,min_samples_split=5,max_depth=5,max_features='log2',
                              min_samples_leaf=5,random_state=0)
precision_scores = cross_val_score(clf,X, y, cv=my_cv, scoring='precision' )
recall_scores = cross_val_score(clf, X, y, cv=my_cv, scoring='recall' )
f1_scores = cross_val_score(clf, X, y, cv=my_cv, scoring='f1' )
print('cross_validaiton score')
print('precision: {} (+/-{})'.format(np.mean(precision_scores),np.std(precision_scores)))
print('recall: {} (+/-{})'.format(np.mean(recall_scores),np.std(recall_scores)))
print('f1: {} (+/-{})'.format(np.mean(f1_scores),np.std(f1_scores)))

Reverify the parameters by CV
cross_validaiton score
precision: 0.9172239240675054 (+/-0.021584022423820425)
recall: 0.9296410256410257 (+/-0.04675534923054424)
f1: 0.9080189466431943 (+/-0.009671176748588209)


## Test the trained model on the test dataset

In [225]:
# Load the held-out test set. This rebinds X_test / y_test, which the P/Q split
# cell also assigns, so from here on they refer to the real test set.
filename = '../data/test_set_feature_vectors.txt'
dataSet = read_vectors(filename)
X_test = dataSet['data']
y_test = dataSet['target']

# Train on the whole development set, then score on the test set.
# NOTE(review): these hyperparameters differ from the tuned forest used in the
# 20-run cells (n_estimators=30, max_depth=5, ...) -- confirm this is intended.
clf = RandomForestClassifier(max_depth=4, max_features=0.2)
clf.fit(X, y)
test_model(X_test, y_test, clf)

Testing trained model of testing set:
Testing Set Performance
[[1804   78]
 [  51  890]]
Classification report on testing set
Scores:
             precision    recall  f1-score   support

        0.0       0.97      0.96      0.97      1882
        1.0       0.92      0.95      0.93       941

avg / total       0.95      0.95      0.95      2823



In [234]:
# Refit the classifier currently bound to `clf` on the full development set and
# evaluate it on the test set.
# NOTE(review): this cell does not create `clf` itself, so its result depends on
# which cell assigned `clf` last -- confirm the intended hyperparameters.
clf.fit(X,y)
test_model(X_test,y_test,clf)

Testing trained model of testing set:
Testing Set Performance
[[1824   58]
 [  91  850]]
Classification report on testing set
Scores:
             precision    recall  f1-score   support

        0.0       0.95      0.97      0.96      1882
        1.0       0.94      0.90      0.92       941

avg / total       0.95      0.95      0.95      2823



The average score of 20 runs

In [226]:
# Train the tuned random forest 20 times on the full development set and score
# each model on X_test / y_test.
precision = []
recall = []
f1 = []
clf_matrix=[]
for i in range(20):
    # Seed every forest with its run index: the 20 runs still differ from each
    # other, but re-running the cell now reproduces the same averages.
    clf=RandomForestClassifier(n_estimators=30,min_samples_split=5,max_depth=5,max_features='log2',
                              min_samples_leaf=5,random_state=i)
    clf.fit(X, y)
    y_true, y_predict = y_test, clf.predict(X_test)
    precision.append(precision_score(y_true, y_predict))
    recall.append(recall_score(y_true, y_predict))
    f1.append(f1_score(y_true,y_predict))
    clf_matrix.append(confusion_matrix(y_true,y_predict))
# Mean and spread of the scores, then of the confusion matrices, over the 20 runs.
print('precision: {:6.3f} (+/-{:6.3f})'.format(np.mean(precision),np.std(precision)))
print('recall: {:6.3f} (+/-{:6.3f})'.format(np.mean(recall),np.std(recall)))
print('f1: {:6.3f} (+/-{:6.3f})'.format(np.mean(f1),np.std(f1)))
print(np.mean(clf_matrix,axis=0))
print(np.std(clf_matrix,axis=0))

precision:  0.935 (+/- 0.011)
recall:  0.902 (+/- 0.052)
f1:  0.917 (+/- 0.023)
[[ 1822.65    59.35]
 [   91.85   849.15]]
[[ 13.82850317  13.82850317]
 [ 48.9512768   48.9512768 ]]
