In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing,cross_validation,svm 
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline

# Import the data

In [78]:
data = np.genfromtxt('DeerHunter.csv', delimiter=',')

In [79]:
df = pd.read_csv('DeerHunter.csv')
df.head()

Unnamed: 0,wtdeer,state,urban,race,retire,employ,educ,married,income,gender,...,huntexp,agehunt,trips,bagdeer,numbag,bagbuck,avgcost,totcost,a,yes
0,1.0,1,1,1,1,1,0,1,0,1,...,0,0,0,1,1,1,0.0,0,0,1
1,0.60193,26,3,1,0,1,12,1,15000,1,...,11,7,75,1,1,1,15.0,1125,139,1
2,0.920266,48,3,1,2,0,11,0,27500,1,...,8,10,3,1,1,1,10.0,30,27,1
3,0.339394,13,3,1,0,1,20,1,15000,1,...,6,12,5,1,1,1,10.0,50,45,1
4,0.808089,43,1,1,2,0,20,0,5000,1,...,10,8,15,0,0,0,5.0,75,491,1


# Separating the data into label and features

In [80]:
x = data[1:, 0:20]
y = data[1:, 20:]
y = y.ravel()

In [81]:
x.shape

(6060, 20)

In [82]:
y.shape

(6060,)

In [83]:
scaler = MinMaxScaler().fit(x)

# Train Test Split

Split the data into a training set and a testing set.


In [84]:
#Split your data into a training set and a testing set.
from sklearn.model_selection import train_test_split

In [112]:
X_train, X_test,y_train,y_test = train_test_split(x, y, test_size=0.30,random_state=42)

# Train a Model

Now its time to train a Support Vector Machine Classifier.

In [113]:
from sklearn.svm import SVC

In [114]:
model = SVC().fit(X_train, y_train)

# Model Evaluation

Now get predictions from the model and create a confusion matrix and a classification report

In [115]:
prediction = model.predict(X_test)

In [116]:
from sklearn.metrics import classification_report,confusion_matrix

In [117]:
print(confusion_matrix(y_test,prediction))

[[1030    1]
 [ 784    3]]


In [118]:
print(classification_report(y_test,prediction))

             precision    recall  f1-score   support

        1.0       0.57      1.00      0.72      1031
        2.0       0.75      0.00      0.01       787

avg / total       0.65      0.57      0.41      1818



**We see that the svc model classified the data with an accuracy of 65%, we will use we will use grid search to find the optimum values of the parameters C and gamma if there is any improvement in the performance of our svc model** 

In [119]:
from sklearn.grid_search import GridSearchCV
param_grid = { 'C':[0.1,1,10,100,1000],'gamma':[1,0.1,0.01,0.001,0.0001]}
clf = GridSearchCV(SVC(),param_grid,verbose=3)

In [120]:
clf.fit(X_train, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ......................... C=0.1, gamma=1, score=0.573145 -   1.0s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV] ......................... C=0.1, gamma=1, score=0.572843 -   1.1s
[CV] C=0.1, gamma=1 ..................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.2s remaining:    0.0s


[CV] ......................... C=0.1, gamma=1, score=0.573248 -   1.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....................... C=0.1, gamma=0.1, score=0.573145 -   1.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....................... C=0.1, gamma=0.1, score=0.572843 -   1.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....................... C=0.1, gamma=0.1, score=0.573248 -   1.0s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...................... C=0.1, gamma=0.01, score=0.573145 -   1.1s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...................... C=0.1, gamma=0.01, score=0.572843 -   1.0s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...................... C=0.1, gamma=0.01, score=0.573248 -   1.0s
[CV] C=0.1, gamma=0.001 ..............................................
[CV] .

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:  1.5min finished


GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=3)

**Let print the best parameters our grid search has found **


In [121]:
clf.best_params_

{'C': 1, 'gamma': 0.0001}

In [122]:
clf.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0001, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

**Lets predict with the new found optimised parameters**

In [123]:
clf_prediction = clf.predict(X_test)

In [124]:
print(classification_report(y_test,clf_prediction))

             precision    recall  f1-score   support

        1.0       0.73      0.75      0.74      1031
        2.0       0.66      0.63      0.64       787

avg / total       0.70      0.70      0.70      1818

