### Домашнее задание "Методы оптимизации"
Преподаватель: Алексей Кузьмин
1. Прочитать про методы оптимизации для нейронных сетей https://habr.com/post/318970/
2. Реализовать самостоятельно логистическую регрессию
3. Обучить ее методом:
    - градиентного спуска
    - nesterov momentum
    - rmsprop
4. В качестве dataset’а взять Iris, оставив 2 класса:
    - Iris Versicolor
    - Iris Virginica

In [1]:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

Загружаем датасет

In [2]:
iris = load_iris()

In [3]:
# load data
X = iris.data[iris.target != 0]
y = iris.target[iris.target != 0]

Шкалируем X, переводим классы целевой переменной в бинарные.

In [4]:
# prepare our data
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)
y[y==2] = 0

### Напишем простую лог-регрессию, обучающуюся методом градиентного спуска.

Обернем все это в класс с привычными методами.

In [5]:
class LogReg_Gradient(object):
    
    def __init__(self, epochs=100, learning_rate=0.0001):
        self.epochs = epochs
        self.learning_rate = learning_rate
    
    def fit(self, X, y, **kwargs):
        # fitting new params
        if len(kwargs) !=0:
            try:
                if len(kwargs['epochs']) !=0:
                    self.epochs = kwargs['epochs']
            except:
                pass
            
            try:
                if len(kwargs['learning_rate']) !=0:
                    self.learning_rate = kwargs['learning_rate']
            except:
                pass

        # set random params
        params = np.random.normal(size=len(X[0])+1)
        for _ in range(self.epochs):
            # make predictions
            pred = []
            for each in X:
                pred.append(1* params[0] +
                            each[0] * params[1] +
                            each[1] * params[2] + 
                            each[2] * params[3] + 
                            each[3] * params[4])

            # y predicted
            y_pred = []  
            for each in pred:
                y_pred.append(np.argmax([(1 - 1/(1+np.e**-each)), (1/(1+np.e**-each))]))

            # updating params
            for param in range(len(params)):
                new_param = []
                for i in range(len(X)):
                    if param == 0:
                        new_param.append(1 * (y[i] - y_pred[i]))
                    else:
                        new_param.append(X[i][param-1] * (y[i] - y_pred[i]))
                new_param = np.sum(new_param)
                params[param] = params[param] + self.learning_rate * new_param
            self.params = params
    
    def predict(self, X):
        pred = []
        for each in X:
            pred.append(1 * self.params[0] +
                        each[0] * self.params[1] +
                        each[1] * self.params[2] + 
                        each[2] * self.params[3] + 
                        each[3] * self.params[4])

        # y predicted
        y_pred = []  
        for each in pred:
            y_pred.append(np.argmax([(1 - 1/(1+np.e**-each)), (1/(1+np.e**-each))]))
        return y_pred
        
    def predict_proba(self, X):
        pred = []
        for each in X:
            pred.append(1 * self.params[0] +
                        each[0] * self.params[1] +
                        each[1] * self.params[2] + 
                        each[2] * self.params[3] + 
                        each[3] * self.params[4])

        # y predicted
        y_pred_proba = []  
        for each in pred:
            y_pred_proba.append([(1 - 1/(1+np.e**-each)), (1/(1+np.e**-each))])
        return y_pred_proba
    
    def get_params(self, deep=False):
        return {'epochs': self.epochs,
                'learning_rate': self.learning_rate}

Посмотрим, что получилось...

In [6]:
logreg_gradient = LogReg_Gradient(epochs=1500, learning_rate=0.0001)
logreg_gradient.fit(X, y)
accuracy_score(y, logreg_gradient.predict(X))

0.95

Ничего себе, получилось:)

Проверим качество на кросс валидации

In [7]:
cv_scores = cross_val_score(logreg_gradient, X, y, fit_params={'epochs':1500, 'learning_rate': 0.0001}, scoring='accuracy', cv=4)
print('CV scores: \t', cv_scores)
print('Mean accuracy: \t', round(cv_scores.mean(), 2))

CV scores: 	 [0.92 0.96 1.   1.  ]
Mean accuracy: 	 0.97


Неплохие результаты, учитывая размера датасета.

### Напишем еще один класс, но уже для Nesterov Accelerated Gradient

In [8]:
class LogReg_Nesterov(object):
    
    def __init__(self, epochs=100, learning_rate=0.0001, momentum=0.9):
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.momentum = momentum
    
    def fit(self, X, y, **kwargs):
        # fitting new params
        if len(kwargs) !=0:
            try:
                if len(kwargs['epochs']) !=0:
                    self.epochs = kwargs['epochs']
            except:
                pass
            
            try:
                if len(kwargs['learning_rate']) !=0:
                    self.learning_rate = kwargs['learning_rate']
            except:
                pass
            
            try:
                if len(kwargs['momentum']) !=0:
                    self.momentum = kwargs['momentum']
            except:
                pass

        # set random params
        params = np.random.normal(size=len(X[0])+1)
        for _ in range(self.epochs):
            # make predictions
            pred = []
            for each in X:
                pred.append(1* params[0] +
                            each[0] * params[1] +
                            each[1] * params[2] + 
                            each[2] * params[3] + 
                            each[3] * params[4])

            # y predicted
            y_pred = []  
            for each in pred:
                y_pred.append(np.argmax([(1 - 1/(1+np.e**-each)), (1/(1+np.e**-each))]))

            # updating params
            for param in range(len(params)):
                new_param = []
                old_param = 0
                for i in range(len(X)):
                    if param == 0:
                        new_param.append(1 * (y[i] - y_pred[i]))
                    else:
                        new_param.append(X[i][param-1] * (y[i] - y_pred[i]))
                new_param = np.sum(new_param)
                params[param] = params[param] - self.momentum * old_param + self.learning_rate * new_param
                old_param = new_param
            self.params = params
    
    def predict(self, X):
        pred = []
        for each in X:
            pred.append(1 * self.params[0] +
                        each[0] * self.params[1] +
                        each[1] * self.params[2] + 
                        each[2] * self.params[3] + 
                        each[3] * self.params[4])

        # y predicted
        y_pred = []  
        for each in pred:
            y_pred.append(np.argmax([(1 - 1/(1+np.e**-each)), (1/(1+np.e**-each))]))
        return y_pred
        
    def predict_proba(self, X):
        pred = []
        for each in X:
            pred.append(1 * self.params[0] +
                        each[0] * self.params[1] +
                        each[1] * self.params[2] + 
                        each[2] * self.params[3] + 
                        each[3] * self.params[4])

        # y predicted
        y_pred_proba = []  
        for each in pred:
            y_pred_proba.append([(1 - 1/(1+np.e**-each)), (1/(1+np.e**-each))])
        return y_pred_proba
    
    def get_params(self, deep=False):
        return {'epochs': self.epochs,
                'learning_rate': self.learning_rate}

In [9]:
logreg_nesterov = LogReg_Nesterov(epochs=1500, momentum=0.9)
logreg_nesterov.fit(X, y)
accuracy_score(y, logreg_nesterov.predict(X))

0.96

In [10]:
cv_scores = cross_val_score(logreg_nesterov, X, y, fit_params={'epochs':2000, 'learning_rate': 0.00005, 'momentum': 0.9}, scoring='accuracy', cv=4)
print('CV scores: \t', cv_scores)
print('Mean accuracy: \t', round(cv_scores.mean(), 2))

CV scores: 	 [0.96 0.96 0.88 0.72]
Mean accuracy: 	 0.88


Тоже неплохо, результат лучше, чем с простым градиентом. Пришлось снижать параметр скорости обучения, т.к. инерция спуска иногда вызывала "дребезжание" в точке минимума.

### Настало время опробовать rmsprop

In [11]:
class LogReg_rmsprop(object):
    
    def __init__(self, epochs=100, learning_rate=0.0001, momentum=0.9):
        self.epochs = epochs
        self.learning_rate = learning_rate
        self.momentum = momentum   
    
    def fit(self, X, y, **kwargs):
        # fitting new params
        if len(kwargs) !=0:
            try:
                if len(kwargs['epochs']) !=0:
                    self.epochs = kwargs['epochs']
            except:
                pass
            
            try:
                if len(kwargs['learning_rate']) !=0:
                    self.learning_rate = kwargs['learning_rate']
            except:
                pass
            
            try:
                if len(kwargs['momentum']) !=0:
                    self.momentum = kwargs['momentum']
            except:
                pass
       
        # set random params
        params = np.random.normal(size=len(X[0])+1)
        for _ in range(self.epochs):
            # make predictions
            pred = []
            for each in X:
                pred.append(1* params[0] +
                            each[0] * params[1] +
                            each[1] * params[2] + 
                            each[2] * params[3] + 
                            each[3] * params[4])

            # y predicted
            y_pred = []  
            for each in pred:
                y_pred.append(np.argmax([(1 - 1/(1+np.e**-each)), (1/(1+np.e**-each))]))

            # updating params
            for param in range(len(params)):
                new_param = []
                old_param = 0
                for i in range(len(X)):
                    if param == 0:
                        new_param.append(1 * (y[i] - y_pred[i]))
                    else:
                        new_param.append(X[i][param-1] * (y[i] - y_pred[i]))
                new_param = np.sum(new_param)
                new_param = self.momentum * old_param + (1 - self.momentum) * new_param**2
                params[param] = params[param] - self.learning_rate / (new_param**0.5 + 0.1) * new_param
                old_param = new_param
            self.params = params
    
    def predict(self, X):
        pred = []
        for each in X:
            pred.append(1 * self.params[0] +
                        each[0] * self.params[1] +
                        each[1] * self.params[2] + 
                        each[2] * self.params[3] + 
                        each[3] * self.params[4])

        # y predicted
        y_pred = []  
        for each in pred:
            y_pred.append(np.argmax([(1 - 1/(1+np.e**-each)), (1/(1+np.e**-each))]))
        return y_pred
        
    def predict_proba(self, X):
        pred = []
        for each in X:
            pred.append(1 * self.params[0] +
                        each[0] * self.params[1] +
                        each[1] * self.params[2] + 
                        each[2] * self.params[3] + 
                        each[3] * self.params[4])

        # y predicted
        y_pred_proba = []  
        for each in pred:
            y_pred_proba.append([(1 - 1/(1+np.e**-each)), (1/(1+np.e**-each))])
        return y_pred_proba
    
    def get_params(self, deep=False):
        return {'epochs': self.epochs,
                'learning_rate': self.learning_rate}

In [12]:
logreg_rmsprop = LogReg_rmsprop(epochs=1000, momentum=0.5, learning_rate=0.001)
logreg_rmsprop.fit(X, y)
accuracy_score(y, logreg_rmsprop.predict(X))

0.77

In [17]:
cv_scores = cross_val_score(logreg_rmsprop, X, y, fit_params={'epochs':2000, 'learning_rate': 0.001, 'momentum': 0.7}, scoring='accuracy', cv=4)
print('CV scores: \t', cv_scores)
print('Mean accuracy: \t', round(cv_scores.mean(), 2))

CV scores: 	 [1.   0.84 0.92 0.96]
Mean accuracy: 	 0.93


Интересные результаты:
- создается впечатление, что этот алгоритм намного быстрее предыдущих, т.к. за меньшее число эпох мне удавалось получать схожие с предыдущими результаты.
- также можно задавать больший шаг