## Imports

In [5]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import numpy as np
import pandas as pd
import graphviz
from sklearn import *

## Read in data

In [6]:
# Load the dataset; the file has no header row, so pandas labels the columns
# with integers starting at 0.
df = pd.read_csv('letter-recognition.data', header=None)
# Column 0 holds a single-character label; encode it as its offset from 'A'
# and store it as the last column, then drop the original character column.
df['target'] = df[0].apply(lambda x: ord(x) - ord('A'))
del df[0]
# print(...) with one argument behaves identically on Python 2 and Python 3.
print(df.dtypes)
print(df.head())

1         int64
2         int64
3         int64
4         int64
5         int64
6         int64
7         int64
8         int64
9         int64
10        int64
11        int64
12        int64
13        int64
14        int64
15        int64
16        int64
target    int64
dtype: object
   1   2  3  4  5   6   7  8  9  10  11  12  13  14  15  16  target
0  2   8  3  5  1   8  13  0  6   6  10   8   0   8   0   8      19
1  5  12  3  7  2  10   5  5  4  13   3   9   2   8   4  10       8
2  4  11  6  8  6  10   6  2  6  10   3   7   3   7   3   9       3
3  7  11  6  6  3   5   9  4  6   4   4  10   6  10   2   8      13
4  2   1  3  1  1   8   6  6  6   6   5   9   1   7   5  10       6


In [22]:
# Work on an explicit copy: `df.values` may hand back a view of the frame's
# internal buffer (or a read-only array), and np.random.shuffle reorders its
# argument in place.
data = df.values.copy()
np.random.shuffle(data)

def split_data(data, pct_train=None, x_cols=[0], y_cols=[1], shuffle_data=False):
    """Separate feature and label columns and optionally split rows into train/test.

    Parameters
    ----------
    data : numpy.ndarray
        Array indexed as ``data[rows, columns]``.
    pct_train : float or None
        Fraction of the rows, counted from the top, that go to the training
        set (``int(pct_train * n_rows)`` rows). When None, no split is made
        and the four train/test return values are None.
    x_cols, y_cols : sequence of int
        Column indices selected for the features and the labels. The default
        lists are only read, never modified.
    shuffle_data : bool
        When True, the rows of ``data`` are shuffled IN PLACE with
        ``np.random.shuffle`` before the columns are selected, so the
        caller's array is reordered as a side effect.

    Returns
    -------
    tuple
        ``(x, y, train_x, train_y, test_x, test_y)``.
    """
    if shuffle_data:
        np.random.shuffle(data)

    x = data[:, x_cols]
    y = data[:, y_cols]

    # Identity test: `pct_train != None` would dispatch to the operand's
    # `__ne__`, which is not what is meant here.
    if pct_train is None:
        return x, y, None, None, None, None

    # Truncate toward zero so the training set never exceeds the requested share.
    train_rows = int(pct_train * data.shape[0])
    train_x = x[:train_rows, :]
    train_y = y[:train_rows, :]
    test_x = x[train_rows:, :]
    test_y = y[train_rows:, :]

    return x, y, train_x, train_y, test_x, test_y

# Use 70% of the rows for training. After the label column was moved to the
# end of the frame, columns 0-15 are the features and column 16 is the target.
training_pct = 0.7
x, y, train_x, train_y, test_x, test_y = split_data(
    data,
    pct_train=training_pct,
    x_cols=list(range(16)),  # explicit list: same index type on Python 2 and 3
    y_cols=[16],
    shuffle_data=True,
)
# print(...) with one argument behaves identically on Python 2 and Python 3.
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

(14000L, 16L)
(14000L, 1L)
(6000L, 16L)


## Decision Trees

In [25]:
# Baseline decision tree that requires at least 3 samples in every leaf.
clf = tree.DecisionTreeClassifier(min_samples_leaf=3)
clf = clf.fit(train_x, train_y)
# Flatten the (n, 1) label column to 1-D; later cells reuse `y2`.
y2 = np.reshape(y, -1)
# NOTE: the scores come from 5-fold cross-validation over the full x / y2,
# not from the train/test split made earlier.
scores = model_selection.cross_val_score(clf, x, y2, cv=5)
# print(...) with one argument behaves identically on Python 2 and Python 3.
print(scores)

[ 0.85290451  0.86744883  0.86031984  0.85725019  0.84653962]


In [34]:
# Grid-search the split criterion and the minimum leaf size of a decision
# tree, using 5-fold cross-validation over the full x / y2.
dt = tree.DecisionTreeClassifier()
parameters = {
    'criterion': ('gini', 'entropy'),
    'min_samples_leaf': [1, 3, 5],
}
clf = model_selection.GridSearchCV(estimator=dt, param_grid=parameters, cv=5)
clf.fit(x, y2)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'criterion': ('gini', 'entropy'), 'min_samples_leaf': [1, 3, 5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [35]:
# Tabulate the per-candidate grid-search results.
# print(...) with one argument behaves identically on Python 2 and Python 3.
print(pd.DataFrame(clf.cv_results_))

   mean_fit_time  mean_score_time  mean_test_score  mean_train_score  \
0         0.0882           0.0016          0.87250          1.000000   
1         0.0872           0.0014          0.85645          0.939862   
2         0.0838           0.0020          0.84005          0.907512   
3         0.0964           0.0014          0.88090          1.000000   
4         0.0910           0.0018          0.86115          0.945126   
5         0.0786           0.0012          0.84800          0.912525   

  param_criterion param_min_samples_leaf  \
0            gini                      1   
1            gini                      3   
2            gini                      5   
3         entropy                      1   
4         entropy                      3   
5         entropy                      5   

                                              params  rank_test_score  \
0    {u'criterion': u'gini', u'min_samples_leaf': 1}                2   
1    {u'criterion': u'gini', u'min_sampl

## kNN

In [38]:
# Grid-search the neighbour count and the vote weighting of a kNN classifier,
# using 5-fold cross-validation over the full x / y2.
nbrs = neighbors.KNeighborsClassifier()
parameters = {'n_neighbors': [1, 3, 5, 10], 'weights': ('uniform', 'distance')}
clf = model_selection.GridSearchCV(nbrs, parameters, cv=5)
clf.fit(x, y2)
# print(...) with one argument behaves identically on Python 2 and Python 3.
print(pd.DataFrame(clf.cv_results_))

   mean_fit_time  mean_score_time  mean_test_score  mean_train_score  \
0         0.0560           0.5444          0.95700          1.000000   
1         0.0492           0.5142          0.95700          1.000000   
2         0.0500           0.7322          0.95370          0.981600   
3         0.0490           0.7056          0.95855          1.000000   
4         0.0482           0.8616          0.95125          0.975087   
5         0.0504           0.9112          0.95670          1.000000   
6         0.0484           1.0814          0.94350          0.963350   
7         0.0506           1.0838          0.95280          1.000000   

  param_n_neighbors param_weights  \
0                 1       uniform   
1                 1      distance   
2                 3       uniform   
3                 3      distance   
4                 5       uniform   
5                 5      distance   
6                10       uniform   
7                10      distance   

                 