# Classify Images of CIFAR-10 dataset

In [1]:
import pandas as pd

In [2]:
# load a pickled CIFAR-10 file (image batch or metadata)
def unpickle(file):
    """Load and return the object stored in the pickle file at path ``file``.

    Parameters
    ----------
    file : str
        Path to a pickle file; it is opened in binary mode.

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.

    Notes
    -----
    SECURITY: unpickling can execute arbitrary code. Only call this on files
    obtained from a trusted source.
    """
    try:
        import cPickle as pickle  # Python 2: C-accelerated pickle
        load_kwargs = {}
    except ImportError:
        import pickle  # Python 3: cPickle was merged into pickle
        # Pickles written by Python 2 contain 8-bit strings; 'latin1' lets
        # Python 3 decode them and keeps dictionary keys as ``str``.
        load_kwargs = {'encoding': 'latin1'}

    # The context manager closes the file even if loading raises.
    with open(file, 'rb') as fo:
        return pickle.load(fo, **load_kwargs)

In [3]:
# Load the five training batches, in order, by applying unpickle to each path.
batch_1, batch_2, batch_3, batch_4, batch_5 = map(unpickle, (
    "cifar-10-batches-py/data_batch_1",
    "cifar-10-batches-py/data_batch_2",
    "cifar-10-batches-py/data_batch_3",
    "cifar-10-batches-py/data_batch_4",
    "cifar-10-batches-py/data_batch_5",
))


In [4]:
def _batch_to_frame(batch):
    """Build a DataFrame from one batch dict: its 'data' plus a 'label' column.

    ``batch['data']`` supplies the columns and ``batch['labels']`` is attached
    as an extra column named 'label'.
    """
    frame = pd.DataFrame(batch['data'])
    frame['label'] = batch['labels']
    return frame


# One frame per batch; the df1..df5 names are kept for any later cell that uses them.
df1, df2, df3, df4, df5 = [
    _batch_to_frame(batch)
    for batch in (batch_1, batch_2, batch_3, batch_4, batch_5)
]

# ignore_index=True gives the combined frame a fresh, unique 0..N-1 index.
# Without it each batch keeps its own index starting at 0, so every index
# label would appear five times and label-based lookups would be ambiguous.
df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3063,3064,3065,3066,3067,3068,3069,3070,3071,label
0,59,43,50,68,98,119,139,145,149,149,...,58,65,59,46,57,104,140,84,72,6
1,154,126,105,102,125,155,172,180,142,111,...,42,67,101,122,133,136,139,142,144,9
2,255,253,253,253,253,253,253,253,253,253,...,83,80,69,66,72,79,83,83,84,9
3,28,37,38,42,44,40,40,24,32,43,...,39,59,42,44,48,38,28,37,46,4
4,170,168,177,183,181,177,181,184,189,189,...,88,85,82,83,79,78,82,78,80,1


__data__ -- a 10000x3072 numpy array of uint8s. Each row of the array stores a 32x32 colour image. The first 1024 entries contain the red channel values, the next 1024 the green, and the final 1024 the blue. The image is stored in row-major order, so that the first 32 entries of the array are the red channel values of the first row of the image.

__labels__ -- a list of 10000 numbers in the range 0-9. The number at index i indicates the label of the ith image in the array data.

The dataset contains another file, called `batches.meta`. It too contains a Python dictionary object. It has the following entries:

__label_names__ -- a 10-element list which gives meaningful names to the numeric labels in the labels array described above. For example, label_names[0] == "airplane", label_names[1] == "automobile", etc.

In [5]:
# Read the metadata file and keep only its list of class names.
label_map = unpickle("cifar-10-batches-py/batches.meta")["label_names"]
label_map

['airplane',
 'automobile',
 'bird',
 'cat',
 'deer',
 'dog',
 'frog',
 'horse',
 'ship',
 'truck']

In [6]:
# Turn the list of class names into a {numeric label: class name} lookup.
label_map = {index: name for index, name in zip(range(10), label_map)}

In [7]:
# Sanity check: numeric label 0 should map to 'airplane'.
label_map[0]

'airplane'

In [8]:
# Human-readable class name for every row, looked up from its numeric label.
# A list comprehension is used instead of map(lambda(x): ...): the
# parenthesised-parameter lambda is a SyntaxError on Python 3, and map() is
# lazy there, whereas this always yields a concrete list.
df_label_names = [label_map[x] for x in df.label]

In [9]:
# Split the label column off so that `df` holds only the pixel columns.
# pop() returns the column and removes it from the frame in one step.
# The membership guard keeps the cell safe to re-run: once the column is
# gone, a second execution leaves `df` and `labels` untouched instead of
# raising KeyError.
if 'label' in df.columns:
    labels = df.pop('label')

In [10]:
# Summary statistics over the first 1000 rows only.
df.iloc[:1000].describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3062,3063,3064,3065,3066,3067,3068,3069,3070,3071
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,133.467,132.764,133.703,134.171,135.394,136.677,136.762,136.179,136.104,136.528,...,113.163,113.195,113.293,113.769,113.87,114.006,114.771,114.869,114.781,115.33
std,72.523076,71.017743,71.069978,70.31519,69.95642,69.987734,70.097325,70.593728,70.644376,70.771405,...,63.91588,63.979359,63.507325,63.657175,63.394921,63.21283,63.526122,64.014652,64.050242,64.946294
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,73.75,74.0,75.0,76.0,81.0,78.75,80.75,80.0,79.75,79.0,...,64.0,64.0,64.0,64.0,64.0,65.0,66.0,66.0,68.0,67.0
50%,131.0,128.0,131.5,132.0,132.0,136.0,135.0,133.5,135.0,137.0,...,105.0,106.0,105.0,106.0,108.0,108.0,108.0,106.5,107.0,107.5
75%,191.0,188.0,190.0,188.0,190.0,192.0,193.0,192.0,193.25,194.0,...,156.0,155.0,157.0,157.0,156.25,157.25,157.0,160.0,161.0,160.0
max,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,...,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


In [None]:
# Peek at the labels as a raw array.
labels.values

In [11]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; its contents live in sklearn.model_selection. Try the current location
# first and fall back for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [12]:
# Hold out 20% of the first 1000 samples for testing; the fixed random_state
# makes the split reproducible.
Xtr, Xte, Ytr, Yte = train_test_split(
    df.iloc[:1000].values,
    labels.iloc[:1000].values,
    test_size=0.20,
    random_state=1,
)

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; its contents live in sklearn.model_selection. Try the current location
# first and fall back for old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

In [15]:
# k-nearest-neighbours classifier with default settings.
knn = KNeighborsClassifier()
# Mean accuracy over 10-fold cross-validation on the training split.
cross_val_score(
    estimator=knn,
    X=Xtr,
    y=Ytr,
    cv=10,
).mean()

0.21486661620315384

In [16]:
# Fit on the training split, then report accuracy on the held-out test split.
# fit() returns the estimator itself, so the two calls can be chained.
knn.fit(Xtr, Ytr).score(Xte, Yte)

0.19500000000000001