# Projekt ML - Michał Kowalik

In [252]:
# Baseline dla zbioru CIFAR-10 - regresja logistyczna w wersji multiclass
# Przy submitowaniu predykcji proszę używać funkcji save_labels

import os
import pickle

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold

import tqdm
os.environ['KERAS_BACKEND'] = 'theano'
import keras
from keras.datasets import cifar10
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten, Input, UpSampling2D
from keras.optimizers import SGD
from keras.models import Model

from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras.constraints import maxnorm

Ładujemy dane, przy okazji przekształcając je do postaci lubianej przez keras.

In [242]:
def save_labels(arr, filename):
    """Write predictions to a CSV file.

    The output has an index column named ``Id`` and a single data column
    named ``Prediction``.

    Args:
        arr: Data accepted by ``pandas.DataFrame`` that yields exactly one
            column (assigning the single column name fails otherwise).
        filename: Destination passed straight to ``DataFrame.to_csv``.
    """
    frame = pd.DataFrame(arr)
    frame.index.name = "Id"
    frame.columns = ["Prediction"]
    frame.to_csv(filename)

def load_labels(filename):
    """Read a label CSV and return its values as a flat 1-D array.

    The first CSV column is used as the index; all remaining values are
    flattened with ``ravel()``.

    Args:
        filename: Path (or buffer) passed to ``pandas.read_csv``.

    Returns:
        A one-dimensional numpy array of the non-index values.
    """
    frame = pd.read_csv(filename, index_col=0)
    flat_values = frame.values.ravel()
    return flat_values

# Full training features and labels, plus the test features (no labels are loaded for them).
X_train, X_test = np.load("X_train.npy"), np.load("X_test.npy")
y_train = load_labels("y_train.csv")

# Smaller training subset, used for quick hyper-parameter exploration.
X_train_small = np.load("X_train_small.npy")
y_train_small = load_labels("y_train_small.csv")

# One-hot encoded copies of both label vectors for the Keras models.
y_train_one_hot, y_train_small_one_hot = map(keras.utils.to_categorical, (y_train, y_train_small))


Przekształcamy na float i skalujemy

In [243]:
def _scale_pixels(raw):
    """Cast an array to float32 and divide every value by 255.0."""
    return raw.astype('float32') / 255.0

X_train = _scale_pixels(X_train)
X_test = _scale_pixels(X_test)
X_train_small = _scale_pixels(X_train_small)

# Report the shape of each scaled array, in the same order as above.
for scaled in (X_train, X_test, X_train_small):
    print(scaled.shape)

(50000L, 3072L)
(10000L, 3072L)
(5000L, 3072L)


In [244]:
# Largest and smallest label value present in the small training set.
label_max = y_train_small.max()
label_min = y_train_small.min()
print(label_max)
print(label_min)

# Number of classes, taken as the size of the inclusive label range.
classes_number = label_max - label_min + 1
print(classes_number)

9
0
10


Następnie dzielimy dane na zbiór trenujący i testujący. Moglibyśmy użyć walidacji krzyżowej (cross-validation), ale bardzo wydłużyłoby to cały proces doboru parametrów. Chcemy tylko mniej więcej wybrać parametry, a następnie i tak będziemy je testować na całym zbiorze trenującym, już przy użyciu walidacji krzyżowej.

In [245]:
# Hold out 25% of the small set for validation. The fixed seed makes the split, and every
# score computed on it below, reproducible across kernel restarts.
X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(
    X_train_small, y_train_small_one_hot, test_size=0.25, random_state=42)

In [48]:
# Najpierw zobaczymy jak sobie radzi na prostej sieci 3-warstwowej na malych danych

for lr in [0.01, 0.05, 0.1, 0.2]:
    for activation_first in ['relu', 'softmax']:
        for activation_second in ['softmax', 'sigmoid']:
            for hidden_size in [10, 50, 100, 250, 500, 1000]:

                batch = 1000

                model = Sequential()
                model.add(Dense(hidden_size, input_shape=(X_train_small.shape[1],)))
                model.add(Activation(activation_first))
                model.add(Dense(classes_number, input_shape=(hidden_size, )))
                model.add(Activation(activation_second))

                model.compile(optimizer=SGD(lr=lr), loss='categorical_crossentropy', metrics=['accuracy'])

                print "Testing lr:", lr, "activations:", activation_first, ",", activation_second, \
                    "hidden layer size:", hidden_size

                model.fit(X_tr_s, y_tr_s, epochs=50, batch_size=batch, verbose=0)

                y_pred = model.predict(X_te_s, batch_size=batch)

                print "Score: ", accuracy_score(y_te_s.argmax(axis=1), y_pred.argmax(axis=1))

    

Testing lr: 0.01 activations: relu , softmax hidden layer size: 10
Score:  0.2928
Testing lr: 0.01 activations: relu , softmax hidden layer size: 50
Score:  0.3472
Testing lr: 0.01 activations: relu , softmax hidden layer size: 100
Score:  0.38
Testing lr: 0.01 activations: relu , softmax hidden layer size: 250
Score:  0.3848
Testing lr: 0.01 activations: relu , softmax hidden layer size: 500
Score:  0.3848
Testing lr: 0.01 activations: relu , softmax hidden layer size: 1000
Score:  0.3992
Testing lr: 0.01 activations: relu , sigmoid hidden layer size: 10


INFO (theano.gof.compilelock): Refreshing lock C:\Users\Kowalik\AppData\Local\Theano\compiledir_Windows-10-10.0.15063-Intel64_Family_6_Model_69_Stepping_1_GenuineIntel-2.7.13-64\lock_dir\lock
INFO:theano.gof.compilelock:Refreshing lock C:\Users\Kowalik\AppData\Local\Theano\compiledir_Windows-10-10.0.15063-Intel64_Family_6_Model_69_Stepping_1_GenuineIntel-2.7.13-64\lock_dir\lock


Score:  0.2592
Testing lr: 0.01 activations: relu , sigmoid hidden layer size: 50
Score:  0.3216
Testing lr: 0.01 activations: relu , sigmoid hidden layer size: 100
Score:  0.32
Testing lr: 0.01 activations: relu , sigmoid hidden layer size: 250
Score:  0.348
Testing lr: 0.01 activations: relu , sigmoid hidden layer size: 500
Score:  0.3496
Testing lr: 0.01 activations: relu , sigmoid hidden layer size: 1000
Score:  0.3952
Testing lr: 0.01 activations: softmax , softmax hidden layer size: 10
Score:  0.216
Testing lr: 0.01 activations: softmax , softmax hidden layer size: 50
Score:  0.16
Testing lr: 0.01 activations: softmax , softmax hidden layer size: 100
Score:  0.1552
Testing lr: 0.01 activations: softmax , softmax hidden layer size: 250
Score:  0.0896
Testing lr: 0.01 activations: softmax , softmax hidden layer size: 500
Score:  0.0944
Testing lr: 0.01 activations: softmax , softmax hidden layer size: 1000
Score:  0.0952
Testing lr: 0.01 activations: softmax , sigmoid hidden layer 

Widzimy, że sieć osiąga najlepsze wyniki dla lr=0.05, funkcji aktywacji relu i sigmoid oraz wielkości warstwy ukrytej na poziomie 500-1000. Zatem spróbujmy wytrenować sieć na całych danych.

In [173]:
lr = 0.05
activation_first = 'relu'
activation_second = 'sigmoid'
hidden_sizes = [500, 1000, 1500, 2000]

for hidden_size in hidden_sizes:
    
    n_folds = 3
    skf = StratifiedKFold(n_splits=3, shuffle=True)
    
    print "Testing lr:", lr, "activations:", activation_first, ",", activation_second, \
        "hidden layer size:", hidden_size
        
    scores = []

    for i, (train, test) in enumerate(skf.split(X_tr, y_train)):
        print "Running Fold", i+1, "/", n_folds

        batch = 1000

        model = Sequential()
        model.add(Dense(hidden_size, input_shape=(X_tr.shape[1],)))
        model.add(Activation(activation_first))
        model.add(Dense(classes_number, input_shape=(hidden_size, )))
        model.add(Activation(activation_second))

        model.compile(optimizer=SGD(lr=lr), loss='categorical_crossentropy', metrics=['accuracy'])
    
        model.fit(X_tr[train], y_train_one_hot[train], epochs=50, batch_size=batch, verbose=0)
        
        y_pred = model.predict(X_tr[test], batch_size=batch)
        
        score = accuracy_score(y_train_one_hot[test].argmax(axis=1), y_pred.argmax(axis=1))
        print "Score: ", score
        scores.append(score)

    print "Score mean: ", np.mean(scores)
    print 



Testing lr: 0.05 activations: relu , sigmoid hidden layer size: 500
Running Fold 1 / 3
Score:  0.50575884823
Running Fold 2 / 3
Score:  0.456328734253
Running Fold 3 / 3
Score:  0.467647058824
Score mean:  0.476578213769

Testing lr: 0.05 activations: relu , sigmoid hidden layer size: 1000
Running Fold 1 / 3
Score:  0.472765446911
Running Fold 2 / 3
Score:  0.453929214157
Running Fold 3 / 3
Score:  0.476770708283
Score mean:  0.467821789784

Testing lr: 0.05 activations: relu , sigmoid hidden layer size: 1500
Running Fold 1 / 3


INFO (theano.gof.compilelock): Refreshing lock C:\Users\Kowalik\AppData\Local\Theano\compiledir_Windows-10-10.0.15063-Intel64_Family_6_Model_69_Stepping_1_GenuineIntel-2.7.13-64\lock_dir\lock
INFO:theano.gof.compilelock:Refreshing lock C:\Users\Kowalik\AppData\Local\Theano\compiledir_Windows-10-10.0.15063-Intel64_Family_6_Model_69_Stepping_1_GenuineIntel-2.7.13-64\lock_dir\lock


Score:  0.511937612478
Running Fold 2 / 3
Score:  0.495440911818
Running Fold 3 / 3
Score:  0.502040816327
Score mean:  0.503139780207

Testing lr: 0.05 activations: relu , sigmoid hidden layer size: 2000
Running Fold 1 / 3


INFO (theano.gof.compilelock): Refreshing lock C:\Users\Kowalik\AppData\Local\Theano\compiledir_Windows-10-10.0.15063-Intel64_Family_6_Model_69_Stepping_1_GenuineIntel-2.7.13-64\lock_dir\lock
INFO:theano.gof.compilelock:Refreshing lock C:\Users\Kowalik\AppData\Local\Theano\compiledir_Windows-10-10.0.15063-Intel64_Family_6_Model_69_Stepping_1_GenuineIntel-2.7.13-64\lock_dir\lock


Score:  0.490161967606
Running Fold 2 / 3
Score:  0.478764247151
Running Fold 3 / 3
Score:  0.506842737095
Score mean:  0.491922983951



Widzimy zatem, że prosta sieć neuronowa z jedną warstwą ukrytą osiąga w 3-krotnej walidacji krzyżowej (3 foldy) na zbiorze Train wynik na poziomie 50% accuracy, przy learning rate 0.05, funkcji aktywacji pierwszej warstwy relu, funkcji aktywacji drugiej warstwy sigmoid i rozmiarze warstwy ukrytej 1500.

Moglibyśmy jeszcze dobrać optymalną liczbę epok uczenia, jednak zapewne wynik uda nam się podbić nieznacznie, zatem spróbujemy użyć innej metody, jaką jest prosta sieć konwolucyjna.

In [None]:
# Layout sanity check: load the CIFAR-10 copy that ships with Keras and print it next to
# our own data packed into (N, 32, 32, 3) form.
# NOTE(review): cifar10.load_data() presumably downloads the dataset on first use - confirm
# before running offline.
(X_train_cifar, y_train_cifar), (X_test_cifar, y_test_cifar) = cifar10.load_data()

X_train_cifar = X_train_cifar.astype('float32')
X_test_cifar = X_test_cifar.astype('float32')
X_train_cifar = X_train_cifar / 255.0
X_test_cifar = X_test_cifar / 255.0

# Same one-hot helper that is used for y_train earlier in the notebook.
y_train_cifar = keras.utils.to_categorical(y_train_cifar)
y_test_cifar = keras.utils.to_categorical(y_test_cifar)
num_classes = y_test_cifar.shape[1]

print(X_train_cifar)

# Separator between the two array dumps.
print("BLABLABLABLABLABLABLABLABLABLABLABLABLABLABLABLABLABLABLABLABLA")

# Packed inline (three consecutive 1024-column slices stacked on a new last axis) because
# pack_color_image is only defined in a later cell.
X_train_packed = np.dstack((X_train[:, :1024], X_train[:, 1024:2048], X_train[:, 2048:3072]))
print(X_train_packed.reshape(-1, 32, 32, 3))

In [246]:
def pack_color_image(tab):
    """Regroup flat rows into per-position colour triples.

    Each row of ``tab`` is read as three consecutive 1024-column slices
    (treated as the red, green and blue planes, in that order). The slices
    are stacked along a new last axis with ``np.dstack``.

    Args:
        tab: 2-D array whose columns 0-3071 hold the three planes.

    Returns:
        Array of shape ``(rows, 1024, 3)`` for a 2-D input.
    """
    plane = 1024  # number of columns taken for each colour plane
    channels = [tab[:, k * plane:(k + 1) * plane] for k in range(3)]
    return np.dstack(channels)

In [247]:
# Pack the colour planes of each flat row and reshape to (samples, 32, 32, 3),
# the input shape expected by the convolutional model below.
image_shape = (-1, 32, 32, 3)
X_tr_s_reshaped = pack_color_image(X_tr_s).reshape(image_shape)
X_te_s_reshaped = pack_color_image(X_te_s).reshape(image_shape)

# Shapes of the reshaped inputs followed by the matching one-hot targets.
for shaped in (X_tr_s_reshaped, X_te_s_reshaped, y_tr_s, y_te_s):
    print(shaped.shape)

# Width of the one-hot targets, i.e. the number of output units needed.
num_classes = y_te_s.shape[1]
print(num_classes)

(3750L, 32L, 32L, 3L)
(1250L, 32L, 32L, 3L)
(3750L, 10L)
(1250L, 10L)
10


In [257]:
# Small convolutional network: two convolution layers with dropout in between, max pooling,
# then a dense layer with dropout and a softmax output with one unit per class.
model = Sequential()
model.add(Conv2D(32, (5, 5), input_shape=(32, 32, 3), padding='same', activation='relu', kernel_constraint=maxnorm(3)))
model.add(Dropout(0.2))
model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
# Compile model
epochs = 10
lrate = 0.01
# Learning-rate decay tied to the planned number of epochs.
decay = lrate/epochs
sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

INFO (theano.gof.compilelock): Refreshing lock C:\Users\Kowalik\AppData\Local\Theano\compiledir_Windows-10-10.0.15063-Intel64_Family_6_Model_69_Stepping_1_GenuineIntel-2.7.13-64\lock_dir\lock
INFO:theano.gof.compilelock:Refreshing lock C:\Users\Kowalik\AppData\Local\Theano\compiledir_Windows-10-10.0.15063-Intel64_Family_6_Model_69_Stepping_1_GenuineIntel-2.7.13-64\lock_dir\lock


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_74 (Conv2D)           (None, 32, 32, 32)        2432      
_________________________________________________________________
dropout_32 (Dropout)         (None, 32, 32, 32)        0         
_________________________________________________________________
conv2d_75 (Conv2D)           (None, 32, 32, 32)        9248      
_________________________________________________________________
max_pooling2d_44 (MaxPooling (None, 16, 16, 32)        0         
_________________________________________________________________
flatten_13 (Flatten)         (None, 8192)              0         
_________________________________________________________________
dense_346 (Dense)            (None, 512)               4194816   
_________________________________________________________________
dropout_33 (Dropout)         (None, 512)               0         
__________

In [258]:
# Train on the reshaped small training split, validating on the held-out split after each epoch.
model.fit(X_tr_s_reshaped, y_tr_s, validation_data=(X_te_s_reshaped, y_te_s), epochs=epochs, batch_size=32)
# Evaluate on the labelled held-out split. X_test has no labels in this notebook and is
# still in flat (N, 3072) form, so it cannot be passed to evaluate().
scores = model.evaluate(X_te_s_reshaped, y_te_s)

Train on 3750 samples, validate on 1250 samples
Epoch 1/10
Epoch 2/10
 416/3750 [==>...........................] - ETA: 25s - loss: 2.1769 - acc: 0.12 - ETA: 25s - loss: 2.2460 - acc: 0.14 - ETA: 24s - loss: 2.2041 - acc: 0.17 - ETA: 24s - loss: 2.1866 - acc: 0.17 - ETA: 24s - loss: 2.2125 - acc: 0.17 - ETA: 23s - loss: 2.1861 - acc: 0.18 - ETA: 23s - loss: 2.1547 - acc: 0.21 - ETA: 23s - loss: 2.1531 - acc: 0.21 - ETA: 22s - loss: 2.1534 - acc: 0.21 - ETA: 22s - loss: 2.1491 - acc: 0.20 - ETA: 22s - loss: 2.1378 - acc: 0.21 - ETA: 22s - loss: 2.1366 - acc: 0.20 - ETA: 22s - loss: 2.1191 - acc: 0.2139

KeyboardInterrupt: 

In [198]:
# Show the current value of `scores`, as assigned by an earlier cell.
print scores

[0.49016196760647868, 0.47876424715056987, 0.50684273709483796]


In [203]:
# Model outputs for the reshaped held-out images; a later cell reduces them to class indices with argmax.
y_pred = model.predict(X_te_s_reshaped)

In [205]:
score = accuracy_score(y_te_s.argmax(axis=1), y_pred.argmax(axis=1))
print "Score: ", score

Score:  0.5088


Spróbujmy przetestować jeszcze jeden model, którym będzie RandomForest. Najpierw, tak samo, zrobimy ewaluację na małych danych, a później nauczymy model na pełnych danych trenujących.

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
n_estimators

rf = R