In [1]:
# Selected the data from repository link: http://archive.ics.uci.edu/ml/datasets/Vertebral+Column
# Prepare the data for ML models

import pandas as pd
spine = pd.read_csv("spinal_b.csv", header=None, names=["Predictor1", "Predictor2","Predictor3",
                                                        "Predictor4","Predictor5","Predictor6","Abnormal"])
d = {'AB': 1, 'NO': 0}
spine['Abnormal'] = spine['Abnormal'].map(d)

In [2]:
# Instantiate the model
# m = Model () used for  logistic regression
# Choose predictors and split the data

import sklearn.cross_validation as cv
import numpy as np
from sklearn import linear_model
logit = linear_model.LogisticRegression()
import sklearn.grid_search as gs

X = np.array(spine.iloc[:, 0:6])
Y = np.ravel(spine.iloc[:, -1])

x_train, x_test, y_train, y_test = cv.train_test_split(X, 
                                                       Y, 
                                                       test_size=1.0/3, 
                                                       random_state=0)

In [3]:
# Use Grid Search function to select the best parameters

para_grid = [{'penalty': ['l1', 'l2'], 'fit_intercept': [False, True], 
              'C':np.logspace(-5, 5, 100)}]
para_search = gs.GridSearchCV(logit, para_grid, cv = 5, scoring='accuracy')
para_search.fit(x_train, y_train) 

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'penalty': ['l1', 'l2'], 'C': array([  1.00000e-05,   1.26186e-05, ...,   7.92483e+04,   1.00000e+05]), 'fit_intercept': [False, True]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [4]:
#Use the best parameters and Evaluate it
#m.score(train_data)

logit_best = para_search.best_estimator_
logit_best.fit(x_train, y_train)

print "Training accuracy: %.5f" %logit_best.score(x_train, y_train)
print "Testing accuracy: %.5f" %logit_best.score(x_test, y_test)

Training accuracy: 0.87379
Testing accuracy: 0.80769


In [5]:
# Trying to improve accuracy with an ensemble model
# Instantiate the model
# m = Model () used for  Random Forest
# Use Grid Search function to select the best parameters

from sklearn import ensemble
randomForest = ensemble.RandomForestClassifier()

grid_para_forest = [{"n_estimators": [10, 50, 100], "criterion": ["gini", "entropy"], \
                    "min_samples_leaf": range(1, 10), "min_samples_split": np.linspace(2, 30, 15)}]

grid_search_forest = gs.GridSearchCV(randomForest, grid_para_forest, scoring='accuracy', cv=5)
grid_search_forest.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'n_estimators': [10, 50, 100], 'min_samples_split': array([  2.,   4.,   6.,   8.,  10.,  12.,  14.,  16.,  18.,  20.,  22.,
        24.,  26.,  28.,  30.]), 'criterion': ['gini', 'entropy'], 'min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9]}],
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [6]:
#Use the best parameters and Evaluate it
#m.score(train_data)
forest_best=grid_search_forest.best_estimator_
forest_best.fit(x_train,y_train)
print "Training accuracy: %.5f" %forest_best.score(x_train, y_train)
print "Testing accuracy: %.5f" %forest_best.score(x_test, y_test)

Training accuracy: 0.91748
Testing accuracy: 0.82692


In [7]:
# Trying to improve test accuracy with Support Vector Machines (SVM)
# Instantiate the model
# m = Model () used for  SVM
# Use Grid Search function to select the best parameters for degree of 1

from sklearn import svm
svm_model = svm.SVC()
grid_para_svm = [{'C': [1, 10, 100, 1000,10000,100000], 'kernel': ['poly'], 'degree': [1]}]
grid_search_svm = gs.GridSearchCV(svm_model, grid_para_svm)

grid_search_svm.fit(x_train,y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid=[{'kernel': ['poly'], 'C': [1, 10, 100, 1000, 10000, 100000], 'degree': [1]}],
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [8]:
#Use the best parameters and Evaluate it
#m.score(train_data)

svm_best=grid_search_svm.best_estimator_
svm_best.fit(x_train,y_train)
print "Training accuracy: %.5f" %svm_best.score(x_train, y_train)
print "Testing accuracy: %.5f" %svm_best.score(x_test, y_test)

Training accuracy: 0.87864
Testing accuracy: 0.82692
