## Подгружаем все клевые библиотеки

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
# Apply the 'bmh' matplotlib style sheet to every figure drawn in this notebook.
plt.style.use('bmh')

А также кидаем seed=0 в обе ГСЧ-машины

In [2]:
import random
# Pin both random number generators (NumPy's and the standard library's) to
# seed 0 so that repeated runs draw the same random sequences.
np.random.seed(0)
random.seed(0)

## Загружаем данные 
(это просто 784 колонки со значениями пикселов)

In [3]:
# Read the three input tables in one go: training pixels, test pixels and the
# labels that belong to the training rows.
train, test, target = map(
    pd.read_csv,
    ('../cleandata/train.csv', '../cleandata/test.csv', '../cleandata/target.csv'),
)

In [4]:
# Show the first three rows of the training pixels.
train.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Не пугаемся, что тут нули, просто колонки плохие. Посмотрим в хороших колонках

In [5]:
# First two rows, restricted to the columns at positions 200..299, where the
# pixel values are not all zero.
train.iloc[:2, 200:300]

Unnamed: 0,200,201,202,203,204,205,206,207,208,209,...,290,291,292,293,294,295,296,297,298,299
0,0,0,0,0,92,253,255,253,253,253,...,253,27,19,28,47,84,37,84,84,177
1,0,0,0,0,0,0,0,2,128,253,...,144,252,253,167,9,0,0,0,118,253


## Объединяем train и test в один датасет. 
Просто так это сделать нельзя, потому что в обеих таблицах есть нулевая строчка. Поэтому перенумеруем строчки в test, чтобы их номера не пересекались с номерами строчек в train:

In [6]:
# Number of training rows. Test row labels are shifted by this amount so that
# they continue right after the last training row.
TESTOFFSET = len(train)
TESTOFFSET

33600

In [7]:
# Relabel the test rows as TESTOFFSET, TESTOFFSET + 1, ... so they continue
# after the last training row. Assigning absolute labels (instead of `+=`)
# keeps the cell safe to re-run: a second execution does not shift the index
# a second time.
test.index = np.arange(TESTOFFSET, TESTOFFSET + len(test))
test.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
33600,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33601,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
33602,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Теперь индексы(==номера строчек) в test начинаются там, где закончились индексы из train. Можно смело объединять

In [8]:
# Stack the training rows on top of the test rows into a single frame.
data = pd.concat([train, test], axis=0)

In [9]:
# (rows, columns) of the combined frame.
data.shape

(42000, 784)

In [10]:
# Sanity check: the combined frame must contain every training row plus every
# test row. The count is computed from the frames instead of being hard-coded.
len(train) + len(test)

42000

Ура, у нас есть один большой массив данных

## Уменьшаем dimensionality.
Заменяем каждую пару пикселей на их среднее: то есть уменьшаем число пикселей вдвое (например, среднее значение pixel6 и pixel7 записывается в pixel6, а pixel7 удаляется). Разрешение как бы становится 14x28. В два раза меньше столбцов в датасете, а информации не сильно меньше.

In [11]:
# Halve the number of pixel columns: every even-numbered column becomes the
# mean of itself and the odd-numbered column that follows it, and the odd
# columns disappear. This is done as a single array operation instead of 392
# per-column assignments followed by in-place drops.
even_cols = [str(i) for i in range(0, 784, 2)]      # '0', '2', ..., '782'
odd_cols = [str(i + 1) for i in range(0, 784, 2)]   # '1', '3', ..., '783'
data = pd.DataFrame(
    (data[even_cols].values + data[odd_cols].values) / 2,
    index=data.index,      # keep the train/test row labels
    columns=even_cols,     # surviving columns keep their original names
)

NB: Это работает долго, потому что **pandas.DataFrame** сам по себе очень медленный. Аналогичная операция с **numpy.array** прошла бы намного быстрее.

In [12]:
# Show the first two rows after the pairwise averaging.
data.head(2)

Unnamed: 0,0,2,4,6,8,10,12,14,16,18,...,764,766,768,770,772,774,776,778,780,782
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Теперь удалим еще те пиксели, которые всегда черные. Такие столбцы в себе никакой информации не несут, одни нули независимо от образца. Найдем их:

In [13]:
# Collect the names of the columns whose values sum to zero over all rows.
# A single column-wise sum replaces the per-column Python loop; the names stay
# in their original column order.
column_totals = data.sum(axis=0)
nullcols = column_totals[column_totals == 0].index.tolist()
print(nullcols)

['0', '2', '4', '6', '8', '10', '16', '18', '20', '22', '24', '26', '28', '30', '52', '54', '56', '82', '84', '140', '420', '644', '672', '700', '728', '730', '754', '756', '758', '780', '782']


In [14]:
# How many all-zero columns were found.
len(nullcols)

31

Целых 31! Удаляем:

In [15]:
# Remove the all-zero columns by rebinding `data` to the reduced frame.
data = data.drop(nullcols, axis='columns')

In [16]:
# (rows, columns) left after dropping the all-zero columns.
data.shape

(42000, 361)

Осталась 361 переменная.

## Скейлим
Обычно в таких случаях я пользуюсь StandardScaler, который центрирует вокруг нуля и масштабирует, пока дисперсия не станет 1. Но в данном случае приятней будет смотреться MinMaxScaler, который укладывает все значения в промежуток от 0 до 1

In [17]:
from sklearn.preprocessing import MinMaxScaler

In [18]:
# Rescale every column to the [0, 1] range. fit_transform returns a bare array,
# so the row labels are passed back explicitly: the later train/test split
# selects rows by index label and should not rely on a freshly generated
# default index happening to match. Column labels are left as default integers.
# NOTE(review): the scaler is fitted on train and test rows together.
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data), index=data.index)

## Снова разделяем на train и test

In [19]:
# Rows labelled TESTOFFSET and above are the test rows; the rest are training rows.
is_test_row = data.index >= TESTOFFSET
train = data[~is_test_row]
test = data[is_test_row]
# Shift the test labels back so that they start from 0 again.
test.index -= TESTOFFSET

In [20]:
# Show the first three rows of the scaled test set.
test.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,351,352,353,354,355,356,357,358,359,360
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# Show the first three rows of the scaled training set.
train.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,351,352,353,354,355,356,357,358,359,360
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Там на самом деле не одни нули. Выведем хорошие колонки

In [22]:
# Keep only the columns in which the first training row is non-zero, then show
# the first three rows of that selection.
train.loc[:, train.iloc[0] > 0].head(3)

Unnamed: 0,82,83,84,85,86,95,96,97,98,99,...,317,318,319,320,330,331,332,342,343,344
0,0.676471,0.996078,0.992157,0.719608,0.394118,0.037255,0.878431,0.990196,0.988235,0.990196,...,0.062745,0.929412,0.97451,0.170588,0.27451,0.990196,0.533333,0.27451,0.896078,0.04902
1,0.0,0.003922,0.747059,0.562745,0.0,0.0,0.0,0.078431,0.992157,0.207843,...,0.0,0.24902,0.015686,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.75098,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.37451,0.87451,1.0,0.87451,0.0,0.0,0.0,0.0,0.0,0.0


## Выделяем из train кусок в 20% для валидации

In [23]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20. Prefer its replacement and fall back only on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [24]:
# Hold out 20% of the training rows for validation. The split is stratified on
# the digit label and uses a fixed random_state so it is repeatable.
labels = target['digits_values']
Xtr, Xval, Ytr, Yval = train_test_split(
    train,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=0,
)

In [25]:
# Convert the feature frames to plain NumPy arrays. np.asarray replaces
# DataFrame.as_matrix(), which newer pandas releases no longer provide, and it
# passes an existing array through unchanged, so re-running this cell is
# harmless.
Xtr, Xval = np.asarray(Xtr), np.asarray(Xval)

## Здесь начинаются нейронки
Импортируем прекрасную вещь **keras** (верней, только то из нее, что нам нужно)

In [26]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers.advanced_activations import PReLU

Using Theano backend.
ERROR (theano.sandbox.gpuarray): pygpu was configured but could not be imported
NoneType


Он пишет ошибку, что у меня проблемы с GPU и придется считать на CPU. :(

Сейчас наши Ytr и Yval - это столбцы с цифрами:

In [27]:
# Show the first few training labels (one digit per row).
Ytr.head()

5940     6
29140    0
29953    6
2786     0
11557    1
Name: digits_values, dtype: int64

Нам надо сделать из них 10 столбцов с классами (то, что называется one-hot encoding)

In [28]:
from keras.utils.np_utils import to_categorical

# One-hot encode the digit labels into 10 columns. `.values` replaces
# Series.as_matrix(), which newer pandas releases no longer provide. The class
# count is passed positionally because its keyword is spelled `nb_classes` in
# Keras 1 and `num_classes` in Keras 2.
Ytr_m = to_categorical(Ytr.values, 10)
Yval_m = to_categorical(Yval.values, 10)

In [29]:
# Show the first five one-hot encoded label rows.
Ytr_m[:5,:]

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]])

Посмотрим на этот вывод и на предыдущий и поймем, что это одно и то же 

(например, в первой строчке единица стоит в столбце с индексом 6, то есть в седьмом по счёту, - это цифра 6)

## Создаем нейронку

In [30]:
# A plain feed-forward network: layers are stacked one after another.
model = Sequential()

layer_stack = [
    # First hidden layer: 256 units, with as many inputs as Xtr has columns;
    # initial weights are drawn from a normal distribution.
    Dense(256, input_dim=Xtr.shape[1], init='normal'),
    # PReLU activation (a ReLU variant).
    PReLU(),
    # Dropout 0.3: on each batch 30% of this layer's units are left out.
    Dropout(0.3),
    # Second hidden layer: 128 units.
    Dense(128, init='normal'),
    # PReLU again.
    PReLU(),
    # A lighter dropout of 0.1.
    Dropout(0.1),
    # Output layer: 10 units with softmax, so the outputs can be read as
    # class probabilities.
    Dense(10, activation='softmax', init='normal'),
]
for layer in layer_stack:
    model.add(layer)

# Compile the network. The loss being minimised is categorical cross-entropy
# (multiclass log loss); accuracy is only reported as an extra metric. The
# optimizer is adadelta.
model.compile(optimizer='adadelta', loss='categorical_crossentropy', metrics=['accuracy'])

## Обучаем нейронку
20 эпох и валидация для нашего удобства - будем видеть, как нейронка показывает себя на валидационном датасете

In [31]:
# Train for 20 epochs. The validation pair is passed in so that the model's
# performance on held-out rows is reported during training.
model.fit(Xtr, Ytr_m, nb_epoch=20, validation_data=(Xval, Yval_m))

Train on 26881 samples, validate on 6719 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f8bd3c596d8>

## Ееее рок!
Сразу две хорошие новости

**1**. Мы сделали настоящую нейронку.
&nbsp;

**2**. Точность на валидации 0.9768, поэтому ожидаем неплохой точности и на лидерборде

In [32]:
# Predict a digit class for every test row. np.asarray replaces
# DataFrame.as_matrix(), which newer pandas releases no longer provide.
preds = model.predict_classes(np.asarray(test))



In [33]:
# Build the submission table: one row per test sample with its row id and the
# predicted digit, with the columns in the order id, digit.
submission = pd.DataFrame(
    {'id': list(test.index), 'digit': preds},
    columns=['id', 'digit'],
)
submission.head(4)

Unnamed: 0,id,digit
0,0,1
1,1,0
2,2,0
3,3,5


In [34]:
# Write the submission to disk without the DataFrame index column.
submission.to_csv('../subs/sub01-neuro.csv', index=False)