In [11]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
import random
from tensorflow.keras.datasets import fashion_mnist
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [439]:
# Load Fashion-MNIST through Keras; the result unpacks into
# (train images, train labels) and (test images, test labels).
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

In [166]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class preparation(BaseEstimator, TransformerMixin):
    """Stateless transformer that flattens each sample and divides it by 255.

    Every sample (for example a 28x28 image) is reshaped into a single feature
    row, so input that is already two-dimensional passes through with only the
    rescaling applied. The transformer has no parameters and learns nothing.
    """

    def fit(self, X, y = None):
        """Do nothing and return ``self``; present so the class fits in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` as an ``(n_samples, n_features)`` array divided by 255.

        Parameters
        ----------
        X : array-like
            Samples along the first axis; all remaining axes are flattened.
        """
        X = np.asarray(X)
        # -1 lets NumPy infer the feature count, so any number of trailing
        # axes is accepted instead of exactly two.
        return X.reshape(X.shape[0], -1) / 255

<h2>Random Search</h2>

In [148]:
# Wrap the features X and labels y in a LightGBM Dataset; objective() reads this
# module-level train_set on every cross-validation run.
# NOTE(review): X and y are not assigned in any cell shown above this one, so the
# line depends on leftover kernel state — presumably the flattened training images
# and their labels; define them explicitly here and confirm.
train_set = lgb.Dataset(data = X, label = y)

def objective(hyperparameters, iterations):
    """Cross-validate one hyperparameter set on the module-level ``train_set``.

    Parameters
    ----------
    hyperparameters : dict
        LightGBM parameters for this trial. The dict is not modified; any
        ``'n_estimators'`` entry is ignored because the number of rounds is
        taken from the cross-validation history instead.
    iterations : int
        Trial index, passed through unchanged to the result.

    Returns
    -------
    list
        ``[score, params, iterations]`` where ``score`` is the last recorded
        mean AUC and ``params`` is a copy of ``hyperparameters`` with
        ``'n_estimators'`` set to the number of recorded boosting rounds.
    """
    # Work on a copy so the caller's dict is never mutated.
    params = dict(hyperparameters)
    params.pop('n_estimators', None)

    # NOTE(review): params carries no 'objective', so LightGBM uses its default.
    # The training log captured in this notebook ("Start training from score
    # 4.500000") suggests a regression fit on the class labels, for which 'auc'
    # is not a meaningful multiclass score — confirm and consider setting
    # objective / num_class explicitly.
    cross_validation = lgb.cv(params, train_set, num_boost_round = 10000,
                              nfold = 3, early_stopping_rounds = 80, metrics = 'auc', seed = 3)
    auc_history = cross_validation['auc-mean']
    score = auc_history[-1]
    params['n_estimators'] = len(auc_history)

    return [score, params, iterations]

# Search space for the random search: each key maps to the list of candidate
# values from which one value is drawn per trial.
parameters = dict(
    boosting_type = ['goss', 'gbdt'],
    num_leaves = list(np.arange(40, 200, 10)),
    # 1000 log-spaced candidates between 0.005 and 0.5
    learning_rate = list(np.logspace(np.log10(0.005), np.log10(0.5), base = 10, num = 1000)),
    min_child_weight = list(np.logspace(np.log10(0.005), np.log10(0.5), base = 10, num = 1000)),
    # np.linspace defaults to 50 evenly spaced points
    reg_alpha = list(np.linspace(0, 1)),
    reg_lambda = list(np.linspace(0, 1)),
    subsample = list(np.linspace(0.5, 1, 100)),
    colsample_bytree = list(np.linspace(0.6, 1, 10)),
    is_unbalance = [True, False],
)


random.seed(3)
def random_search(hyperparameters, out_file, iteration):
    """Evaluate randomly drawn hyperparameter sets and log every trial to a CSV file.

    Parameters
    ----------
    hyperparameters : dict
        Maps each parameter name to a list of candidate values. Must contain
        the keys ``'boosting_type'`` and ``'subsample'``.
    out_file : str or path-like
        CSV file that receives one appended row per trial (no header row).
    iteration : int
        Number of trials to run.

    Returns
    -------
    pandas.DataFrame
        Columns ``index``, ``Score``, ``Hyperparameters`` and ``Iterations``,
        sorted by ``Score`` in descending order.
    """
    # Imported locally because the notebook's import cell does not provide csv.
    import csv

    results = pd.DataFrame(columns = ['Score', 'Hyperparameters', 'Iterations'],
                           index = list(range(iteration)))

    for i in range(iteration):
        # Draw one candidate per parameter from the global `random` stream.
        param = {k: random.sample(v, 1)[0] for k, v in hyperparameters.items()}

        # Trials that use GOSS boosting always run with subsample fixed at 1.0.
        if param['boosting_type'] == 'goss':
            param['subsample'] = 1.0

        evaluations = objective(param, i)
        results.loc[i, :] = evaluations

        # Append after every trial so finished trials survive an interruption;
        # the context manager closes the file even if the write fails.
        with open(out_file, 'a', newline = '') as of_connection:
            csv.writer(of_connection).writerow(evaluations)

    return results.sort_values('Score', ascending = False).reset_index()

In [149]:
#results = random_search(parameters, out_file, 15)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170881
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 783
[LightGBM] [Info] Using GOSS
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170881
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 783
[LightGBM] [Info] Using GOSS
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170881
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 783
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 4.500000
[LightGBM] [Info] Start training from score 4.500000
[LightGBM] [Info] Start training from score 4.500000
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 170881
[LightGBM] [Info] Number of data points in the train set: 40000, number of used features: 783
[L

KeyboardInterrupt: 

In [183]:
# Load previously saved random-search trials from disk.
# NOTE(review): the cells below read lowercase 'score' / 'hyperparameters' columns,
# whereas random_search() in this notebook writes rows without a header — confirm
# which run produced this file.
grid = pd.read_csv('Randomized_Parameters.csv')

In [187]:
# Rank the saved trials best-first. Reassigning with drop=True keeps the cell
# idempotent: with inplace=True, a second run failed because reset_index tried
# to insert an 'index' column that already existed.
grid = grid.sort_values(by = 'score', ascending = False).reset_index(drop = True)

In [157]:
import pprint

In [194]:
# Show the score and the hyperparameter string of the two top-ranked trials.
for rank in (0, 1):
    print(grid.at[rank, 'score'])
    pprint.pprint(grid.at[rank, 'hyperparameters'])

0.9089334537037036
("{'boosting_type': 'goss', 'num_leaves': 80, 'learning_rate': "
 "0.028557932390632156, 'min_child_samples': 60, 'reg_alpha': "
 "0.6122448979591836, 'reg_lambda': 0.8163265306122448, 'subsample': 1.0, "
 "'colsample_bytree': 0.6444444444444444, 'is_unbalance': True, "
 "'n_estimators': 1267}")
0.9082152962962964
("{'boosting_type': 'gbdt', 'num_leaves': 43, 'learning_rate': "
 "0.007432624224989285, 'subsample_for_bin': 120000, 'min_child_samples': 125, "
 "'reg_alpha': 0.9591836734693877, 'reg_lambda': 0.8571428571428571, "
 "'colsample_bytree': 0.7777777777777778, 'subsample': 0.6616161616161617, "
 "'is_unbalance': True, 'n_estimators': 6943}")


<h3>Sample 1</h3>

In [232]:
# Candidate 1: flatten/scale -> PCA (300 components) -> LightGBM (gbdt).
# The LightGBM values match the second trial printed from `grid` above.
sample_1 = Pipeline(steps = [
    ('prep', preparation()),
    ('pca', PCA(n_components = 300)),
    ('lgb', lgb.LGBMClassifier(
        boosting_type = 'gbdt',
        num_leaves = 43,
        learning_rate = 0.007432624224989285,
        subsample_for_bin = 120000,
        min_child_samples = 125,
        reg_alpha = 0.9591836734693877,
        reg_lambda = 0.8571428571428571,
        colsample_bytree = 0.7777777777777778,
        subsample = 0.6616161616161617,
        is_unbalance = True,
        n_estimators = 6943,
    )),
])

In [233]:
# Fit candidate 1 on the training split and report its accuracy on the test split.
sample_1.fit(X_train, y_train)
prediction = sample_1.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); the value is the same either
# way, but the documented order keeps the call correct if the metric is swapped.
print(accuracy_score(y_test, prediction))

0.8866


<h3>Sample 2</h3>

In [437]:
# Candidate 2: flatten/scale -> LightGBM (gbdt). Unlike candidate 1, this
# pipeline has no PCA step.
sample_2 = Pipeline(steps = [
    ('prep', preparation()),
    ('lgb', lgb.LGBMClassifier(
        boosting_type = 'gbdt',
        num_leaves = 70,
        learning_rate = 0.049884888211816,
        min_child_samples = 20,
        reg_alpha = 0.4897959183673469,
        reg_lambda = 0.5510204081632653,
        colsample_bytree = 0.5,
        subsample = 0.8888888888888,
        is_unbalance = False,
        n_estimators = 1477,
    )),
])

In [440]:
# Fit candidate 2 on the training split and report its accuracy on the test split.
sample_2.fit(X_train, y_train)
prediction = sample_2.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); the value is the same either
# way, but the documented order keeps the call correct if the metric is swapped.
print(accuracy_score(y_test, prediction))

0.9064


<h3>Sample 3</h3>

In [441]:
# Candidate 3: flatten/scale -> LightGBM with DART boosting, no PCA step.
sample_3 = Pipeline(steps = [
    ('prep', preparation()),
    ('lgb', lgb.LGBMClassifier(
        boosting_type = 'dart',
        num_leaves = 140,
        learning_rate = 0.342614579764203,
        min_child_samples = 60,
        reg_alpha = 0.9183673469387754,
        reg_lambda = 0.6530612244897959,
        colsample_bytree = 0.955555555,
        subsample = 0.7727272727272727,
        is_unbalance = True,
        n_estimators = 3500,
    )),
])

In [442]:
# Fit candidate 3 on the training split and report its accuracy on the test split.
sample_3.fit(X_train, y_train)
prediction = sample_3.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); the value is the same either
# way, but the documented order keeps the call correct if the metric is swapped.
print(accuracy_score(y_test, prediction))

0.9025


<h2> Self Add Noise </h2>

<h3>Preparation</h3>

In [401]:
from scipy import ndimage, misc
# Reload Fashion-MNIST so the augmentation below starts from the untouched
# train/test arrays and labels.
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

In [402]:
# Work on copies so the loaded arrays themselves are never modified.
X, X_val = X_train.copy(), X_test.copy()

In [403]:
# Flatten every training image into a single row of pixel values.
X_transform = X.reshape(len(X), -1)

<h4>For Train Set</h4>

In [404]:
# Put the flattened pixels and their labels side by side: one row per image,
# pixel columns first, then a trailing 'Label' column.
train_set = pd.DataFrame(data = X_transform)
train_label = pd.Series(y_train, name = 'Label').to_frame()
train_full = pd.concat([train_set, train_label], axis = 'columns')

In [405]:
# Draw 750 training images from each of the 10 classes to feed the rotation step.
# A fixed random_state makes the draw (and the accuracy reported further down)
# reproducible between runs.
# NOTE: sample_1, sample_2 and sample_3 rebind the names of the candidate
# pipelines defined earlier in the notebook; those pipelines are not used again
# after this point.
(sample_0, sample_1, sample_2, sample_3, sample_4,
 sample_5, sample_6, sample_7, sample_8, sample_9) = (
    train_full[train_full['Label'] == label].sample(750, random_state = 3)
    for label in range(10)
)

In [406]:
def rotate(x, per_angle = 250, angles = (90, 180, 270)):
    """Return rotated copies of the leading rows of ``x`` as a new DataFrame.

    The first ``per_angle`` rows are rotated by ``angles[0]`` degrees, the next
    ``per_angle`` rows by ``angles[1]``, and so on. With the defaults this is
    250 images each at 90, 180 and 270 degrees (750 rows in total).

    Parameters
    ----------
    x : pandas.DataFrame
        One flattened 28x28 image per row in the first 784 columns; any
        further columns (such as a label) are ignored.
    per_angle : int, default 250
        Number of consecutive rows rotated by each angle.
    angles : sequence of numbers, default (90, 180, 270)
        Rotation angles in degrees, passed to ``scipy.ndimage.rotate``. Use
        multiples of 90 so every rotated image is still 28x28.

    Returns
    -------
    pandas.DataFrame
        ``per_angle * len(angles)`` rows by 784 columns, indexed from 0.

    Raises
    ------
    IndexError
        If ``x`` has fewer than ``per_angle * len(angles)`` rows.
    """
    needed = per_angle * len(angles)
    if len(x) < needed:
        raise IndexError(f"rotate needs at least {needed} rows, got {len(x)}")

    pixels = x.iloc[:needed, :784].to_numpy()
    rotated = [
        ndimage.rotate(image.reshape(28, 28), angle).reshape(-1)
        for position, angle in enumerate(angles)
        for image in pixels[position * per_angle:(position + 1) * per_angle]
    ]
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied every accumulated row on each call.
    return pd.DataFrame(np.asarray(rotated).reshape(-1, 784))

In [407]:
# Rotate each class sample and tag the rotated rows with that class's label.
(t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9) = (
    rotate(class_sample).assign(Label = label)
    for label, class_sample in enumerate((
        sample_0, sample_1, sample_2, sample_3, sample_4,
        sample_5, sample_6, sample_7, sample_8, sample_9,
    ))
)

In [408]:
# Stack the ten per-class frames of rotated images into one frame. The row index
# is not reset here, so index values repeat across classes.
rotate_train = pd.concat([t_0, t_1, t_2, t_3, t_4, t_5, t_6, t_7, t_8, t_9], axis = 0)

In [409]:
# Append the rotated images and their labels after the original training data.
# np.concatenate replaces np.insert(..., -1, ...), which placed the new rows
# *before* the last original sample rather than at the end; images and labels
# stay aligned either way. The explicit casts keep the original array dtypes,
# which np.insert also preserved.
rotated_train_images = rotate_train.iloc[:, :784].to_numpy().reshape(-1, 28, 28)
train_set = np.concatenate([X, rotated_train_images.astype(X.dtype)], axis = 0)
y = np.concatenate([y_train, rotate_train['Label'].to_numpy().astype(y_train.dtype)], axis = 0)
train_set.shape, y.shape

((67500, 28, 28), (67500,))

<h4>For Test Set</h4>

In [410]:
# Flatten every test image into a single row of pixel values and show the shape.
X_val_transform = X_val.reshape(len(X_val), -1)
X_val_transform.shape

(10000, 784)

In [411]:
# Put the flattened test pixels and their labels side by side: one row per
# image, pixel columns first, then a trailing 'Label' column.
test_set = pd.DataFrame(data = X_val_transform)
test_label = pd.Series(y_test, name = 'Label').to_frame()
test_full = pd.concat([test_set, test_label], axis = 'columns')

In [412]:
# Draw 150 test images from each of the 10 classes to feed the rotation step.
# A fixed random_state makes the draw (and the accuracy reported further down)
# reproducible between runs.
(test_0, test_1, test_2, test_3, test_4,
 test_5, test_6, test_7, test_8, test_9) = (
    test_full[test_full['Label'] == label].sample(150, random_state = 3)
    for label in range(10)
)

In [413]:
def rotate_test(x, per_angle = 50, angles = (90, 180, 270)):
    """Return rotated copies of the leading rows of ``x`` as a new DataFrame.

    The first ``per_angle`` rows are rotated by ``angles[0]`` degrees, the next
    ``per_angle`` rows by ``angles[1]``, and so on. With the defaults this is
    50 images each at 90, 180 and 270 degrees (150 rows in total).

    Parameters
    ----------
    x : pandas.DataFrame
        One flattened 28x28 image per row in the first 784 columns; any
        further columns (such as a label) are ignored.
    per_angle : int, default 50
        Number of consecutive rows rotated by each angle.
    angles : sequence of numbers, default (90, 180, 270)
        Rotation angles in degrees, passed to ``scipy.ndimage.rotate``. Use
        multiples of 90 so every rotated image is still 28x28.

    Returns
    -------
    pandas.DataFrame
        ``per_angle * len(angles)`` rows by 784 columns, indexed from 0.

    Raises
    ------
    IndexError
        If ``x`` has fewer than ``per_angle * len(angles)`` rows.
    """
    needed = per_angle * len(angles)
    if len(x) < needed:
        raise IndexError(f"rotate_test needs at least {needed} rows, got {len(x)}")

    pixels = x.iloc[:needed, :784].to_numpy()
    rotated = [
        ndimage.rotate(image.reshape(28, 28), angle).reshape(-1)
        for position, angle in enumerate(angles)
        for image in pixels[position * per_angle:(position + 1) * per_angle]
    ]
    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # re-copied every accumulated row on each call.
    return pd.DataFrame(np.asarray(rotated).reshape(-1, 784))

In [414]:
# Rotate each class sample and tag the rotated rows with that class's label.
(te_0, te_1, te_2, te_3, te_4, te_5, te_6, te_7, te_8, te_9) = (
    rotate_test(class_sample).assign(Label = label)
    for label, class_sample in enumerate((
        test_0, test_1, test_2, test_3, test_4,
        test_5, test_6, test_7, test_8, test_9,
    ))
)

In [415]:
# Stack the ten per-class frames of rotated test images into one frame.
# NOTE(review): this rebinds the name `rotate_test`, replacing the function of the
# same name with a DataFrame, so the cell that calls rotate_test(...) cannot be
# re-run afterwards without redefining the function — consider a different name.
rotate_test = pd.concat([te_0, te_1, te_2, te_3, te_4, te_5, te_6, te_7, te_8, te_9], axis = 0)

In [416]:
# Build the augmented evaluation set: the original test images followed by the
# rotated ones. np.concatenate replaces np.insert(..., -1, ...), which placed the
# new rows before the last original sample rather than at the end.
# Labels are rebuilt from test_full['Label'] (the labels captured when test_full
# was created) instead of from y_test itself, so re-running this cell no longer
# grows y_test on every run. The casts keep the original array dtypes.
rotated_test_images = rotate_test.iloc[:, :784].to_numpy().reshape(-1, 28, 28)
original_test_labels = test_full['Label'].to_numpy()
test_set = np.concatenate([X_val, rotated_test_images.astype(X_val.dtype)], axis = 0)
y_test = np.concatenate([original_test_labels,
                         rotate_test['Label'].to_numpy().astype(original_test_labels.dtype)], axis = 0)
test_set.shape, y_test.shape

((11500, 28, 28), (11500,))

<h3>For real</h3>

In [424]:
# Model for the augmented data: flatten/scale -> PCA (350 components) -> LightGBM,
# using the same LightGBM settings as candidate 2 above.
pipeline = Pipeline(steps = [
    ('prep', preparation()),
    ('pca', PCA(n_components = 350)),
    ('lgb', lgb.LGBMClassifier(
        boosting_type = 'gbdt',
        num_leaves = 70,
        learning_rate = 0.049884888211816,
        min_child_samples = 20,
        reg_alpha = 0.4897959183673469,
        reg_lambda = 0.5510204081632653,
        colsample_bytree = 0.5,
        subsample = 0.8888888888888,
        is_unbalance = False,
        n_estimators = 1477,
    )),
])

In [425]:
# Fit on the augmented training data and report accuracy on the augmented test set.
pipeline.fit(train_set, y)
prediction = pipeline.predict(test_set)
# accuracy_score is documented as (y_true, y_pred); the value is the same either
# way, but the documented order keeps the call correct if the metric is swapped.
print(accuracy_score(y_test, prediction))

0.8695652173913043


<h2>Putting everything together</h2>

In [24]:
# Reload the original Fashion-MNIST splits; the augmentation section above
# reassigned y_test, so this restores the unmodified arrays and labels.
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

class preparation(BaseEstimator, TransformerMixin):
    """Stateless transformer that flattens each sample and divides it by 255.

    Every sample (for example a 28x28 image) is reshaped into a single feature
    row, so input that is already two-dimensional passes through with only the
    rescaling applied. The transformer has no parameters and learns nothing.

    NOTE: a class with this name is also defined near the top of the notebook;
    this later definition is the one in effect for the final pipeline.
    """

    def fit(self, X, y = None):
        """Do nothing and return ``self``; present so the class fits in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``X`` as an ``(n_samples, n_features)`` array divided by 255.

        Parameters
        ----------
        X : array-like
            Samples along the first axis; all remaining axes are flattened.
        """
        X = np.asarray(X)
        # -1 lets NumPy infer the feature count, so any number of trailing
        # axes is accepted instead of exactly two.
        return X.reshape(X.shape[0], -1) / 255

In [30]:
# Final model: flatten/scale -> LightGBM, with no PCA step.
# NOTE(review): the fit output recorded below still lists PCA(n_components=700),
# so it was produced by an earlier definition of this pipeline — re-run the fit
# to refresh it.
pipeline = Pipeline(steps = [
    ('prep', preparation()),
    #('pca', PCA(n_components = 700)),
    ('lgb', lgb.LGBMClassifier(
        boosting_type = 'gbdt',
        num_leaves = 70,
        learning_rate = 0.049884888211816,
        min_child_samples = 20,
        reg_alpha = 0.4897959183673469,
        reg_lambda = 0.5510204081632653,
        colsample_bytree = 0.5,
        subsample = 0.8888888888888,
        is_unbalance = False,
        n_estimators = 1477,
    )),
])

In [27]:
# Merge the train and test splits so the model below is fitted on every labelled
# image; no held-out data remains after this step.
X = np.concatenate([X_train, X_test])
y = np.concatenate([y_train, y_test])

In [28]:
# Fit the pipeline on the merged train + test data; as the last expression of the
# cell, the fitted pipeline is also displayed.
pipeline.fit(X, y)

Pipeline(steps=[('prep', preparation()), ('pca', PCA(n_components=700)),
                ('lgb',
                 LGBMClassifier(colsample_bytree=0.5, is_unbalance=False,
                                learning_rate=0.049884888211816,
                                n_estimators=1477, num_leaves=70,
                                reg_alpha=0.4897959183673469,
                                reg_lambda=0.5510204081632653,
                                subsample=0.8888888888888))])

In [29]:
import joblib
# Persist the fitted pipeline to disk.
# NOTE(review): the pipeline contains the notebook-defined `preparation` class, so
# whoever loads this file presumably needs that class defined/importable under the
# same name first — confirm in the loading code.
joblib.dump(pipeline, "Nick.pkl")

['Nick.pkl']