# Продвинутое программирование на языке Python
## Семинар 4. Принципы ООП

На этом занятии постараемся разобрать логику `.__init__()` -> `.fit()` -> `.transform()` (`.predict()`). Это пригодится нам при работе с `sklearn`.

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression

In [4]:
# Synthetic feature matrix: 100 rows x 2 columns, sampled uniformly from [-10, 10)
X = np.random.uniform(-10, 10, (100, 2))
# Target vector: 100 samples from a normal distribution (mean 0, std 1), drawn independently of X
y = np.random.normal(0, 1, 100)

# A second feature matrix with the same 2 columns but 50 rows
X_new = np.random.uniform(-10, 10, (50, 2))

In [4]:
lr = LinearRegression()  # .__init__(): only creates the estimator, nothing is fitted yet
# Model form: Y = b0*1 + b1*X1 + b2*X2 + ... + bn*Xn

In [5]:
dir(lr)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_decision_function',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_set_intercept',
 '_validate_data',
 '_validate_params',
 'copy_X',
 'fit',
 'fit_intercept',
 'get_metadat

In [8]:
lr.fit(X, y)

In [9]:
dir(lr)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_decision_function',
 '_doc_link_module',
 '_doc_link_template',
 '_doc_link_url_param_generator',
 '_estimator_type',
 '_get_default_requests',
 '_get_doc_link',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_set_intercept',
 '_validate_data',
 '_validate_params',
 'coef_',
 'copy_X',
 'fit',
 'fit_intercept',
 'g

In [10]:
lr.coef_

array([-0.0186808 , -0.00875918])

В процессе анализа данных нам часто нужно будет трансформировать их каким-либо образом. Рассмотрим две главные трансформации: стандартизацию и нормализацию.

![st&norm](https://i.stack.imgur.com/XmyWR.png)

Наша задача - реализовать класс `Transformer`.

При этом:
- для инициализации класса необходим вид трансформации;
- на вход может подаваться последовательность из целых / нецелых чисел (возможные виды: `list`, `tuple`, `np.ndarray`, `pd.DataFrame`). Последовательности (если это не `np.ndarray` или `pd.DataFrame`) должны иметь степень вложенности 2 (список списков);
- параметры для трансформации должны сохраняться в качестве атрибутов объекта (для каждой переменной);
- должна поддерживаться логика `fit`-`transform`.

In [33]:
class Transformer:
    """Column-wise scaler with a fit/transform interface.

    Two transformation types are supported:

    * ``'std'``  - standardization: ``(x - mean) / std`` per column;
    * ``'norm'`` - min-max normalization: ``(x - min) / (max - min)`` per column.

    The parameters computed by ``fit`` are stored in ``self.params`` as a dict
    of 1-D arrays holding one value per column.
    """

    # class attributes (shared by all instances)
    legal_ttypes = ['std', 'norm']
    legal_structures = [list, tuple, np.ndarray, pd.DataFrame]

    def __init__(self, ttype):
        """Store the transformation type.

        Raises:
            AttributeError: if ``ttype`` is not one of ``legal_ttypes``.
        """
        if ttype not in self.legal_ttypes:
            raise AttributeError(f'ttype {ttype} not in legal ttypes: {self.legal_ttypes}')

        self.ttype = ttype  # instance attribute

    def _validate_data(self, data):
        """Check ``data`` and return it as a 2-D numeric ``np.ndarray``.

        Raises:
            ValueError: if the container type is not allowed, the element
                dtype is not integer/floating, or the array is not 2-D.
        """
        # isinstance (not an exact type match) so subclasses of the allowed
        # containers are accepted as well
        if not isinstance(data, tuple(self.legal_structures)):
            raise ValueError(f'data structure {type(data)} not in legal structures: {self.legal_structures}')

        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()
        elif not isinstance(data, np.ndarray):
            data = np.array(data)

        # accept every integer / floating dtype (float32, int32, ...),
        # not only the ones that compare equal to Python's float / int
        is_numeric = (
            np.issubdtype(data.dtype, np.integer)
            or np.issubdtype(data.dtype, np.floating)
        )
        if not is_numeric:
            raise ValueError(f'data type must be int or float, not {data.dtype}')

        # check if dimensionality is valid
        if data.ndim != 2:
            raise ValueError(f'dimensions number must be 2, not {data.ndim}')

        # data now is valid and has type np.ndarray
        return data

    def fit(self, data):
        """Compute per-column parameters from ``data`` and store them in ``self.params``.

        Returns:
            self, so that calls can be chained (``Transformer('std').fit(X).transform(X)``).

        Raises:
            ValueError: if ``data`` is invalid or has no rows.
        """
        data_valid = self._validate_data(data)

        # with zero rows min/max fail and mean/std would silently become NaN
        if data_valid.shape[0] == 0:
            raise ValueError('data must contain at least one row')

        if self.ttype == 'norm':
            params = {
                'min': data_valid.min(axis=0),
                'max': data_valid.max(axis=0),
            }
        else:
            params = {
                'mean': data_valid.mean(axis=0),
                'std': data_valid.std(axis=0),
            }

        self.params = params

        return self

    def transform(self, data):
        """Apply the parameters stored by ``fit`` to ``data``.

        Returns:
            np.ndarray with the same shape as the validated input.

        Raises:
            AttributeError: if called before ``fit``.
            ValueError: if ``data`` is invalid.
        """
        if not hasattr(self, 'params'):
            raise AttributeError('you cannot perform transform before fit!')

        data_valid = self._validate_data(data)

        if self.ttype == 'norm':
            shift = self.params['min']
            scale = self.params['max'] - self.params['min']
        else:
            shift = self.params['mean']
            scale = self.params['std']

        # a constant column has zero scale; divide by 1 instead so the column
        # becomes all zeros rather than NaN / inf
        scale = np.where(scale == 0, 1, scale)

        return (data_valid - shift) / scale

In [34]:
tr = Transformer('norm')  # .__init__(): choose min-max normalization

tr.fit(X)  # .fit(): compute the per-column min / max of X and store them in tr.params

X_transformed = tr.transform(X)  # .transform(): apply the stored params to X

In [35]:
X_transformed

array([[0.18714158, 0.14893271],
       [0.60680481, 0.83715122],
       [0.83070881, 0.92072828],
       [0.38397983, 0.74590534],
       [0.61380683, 0.95455356],
       [0.79764314, 0.9742809 ],
       [0.06377219, 0.52619375],
       [0.85193922, 0.67887248],
       [0.56193989, 0.87302428],
       [0.75954943, 0.99711976],
       [0.61029222, 0.66461224],
       [0.47600446, 0.85477688],
       [0.50189538, 0.33474738],
       [0.15572021, 0.23844606],
       [0.98989083, 0.20838733],
       [0.37830994, 0.48413494],
       [0.53680398, 0.82434837],
       [0.        , 0.21453369],
       [0.53152489, 0.77509339],
       [0.67173897, 0.80603934],
       [0.01771542, 0.53845087],
       [0.97063704, 0.5478539 ],
       [0.08377275, 0.69806814],
       [0.63256329, 0.7809404 ],
       [0.27144544, 0.01069787],
       [0.20489527, 0.68048083],
       [0.58632755, 0.75249931],
       [0.58670033, 0.06074734],
       [0.70460358, 0.41200471],
       [0.15407757, 0.49955641],
       [0.

In [36]:
# The same __init__ -> fit -> transform sequence with sklearn's built-in scaler
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X)

X_std = sc.transform(X)

In [37]:
X_std

array([[-1.0370919 , -1.12216475],
       [ 0.40802488,  1.13415462],
       [ 1.17904175,  1.40816141],
       [-0.35927637,  0.83500568],
       [ 0.43213647,  1.51905734],
       [ 1.06517961,  1.58373329],
       [-1.4619163 ,  0.11468282],
       [ 1.25214899,  0.61523893],
       [ 0.25353186,  1.25176422],
       [ 0.93400334,  1.65861034],
       [ 0.42003385,  0.56848683],
       [-0.04238812,  1.19194023],
       [ 0.04676767, -0.51297271],
       [-1.14529188, -0.82869588],
       [ 1.7271875 , -0.92724321],
       [-0.37880073, -0.02320667],
       [ 0.16697596,  1.0921806 ],
       [-1.68151683, -0.90709243],
       [ 0.14879731,  0.9306985 ],
       [ 0.63162667,  1.03215456],
       [-1.62051352,  0.1548677 ],
       [ 1.6608868 ,  0.18569546],
       [-1.39304409,  0.67817178],
       [ 0.49672462,  0.94986787],
       [-0.74679028, -1.57536669],
       [-0.97595679,  0.6205119 ],
       [ 0.33751114,  0.85662398],
       [ 0.3387948 , -1.41127986],
       [ 0.74479648,

In [28]:
tr.params

{'min': array([-9.84716524, -9.98738626]),
 'max': array([9.93219991, 9.82099992])}

In [62]:
dir(tr)

['_Transformer__secret_method',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_validate_data',
 'fit',
 'legal_ttypes',
 'ttype']

In [31]:
Transformer.legal_ttypes

['std', 'norm']

In [32]:
dir(tr)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'legal_ttypes',
 'ttype']

In [26]:
print(tr)

I am TRANSFORMER


In [23]:
print(2)

2


In [21]:
tr.ttype

'norm'

In [13]:
?list.index

Signature: list.index(self, value, start=0, stop=9223372036854775807, /)
Docstring:
Return first index of value.

Raises ValueError if the value is not present.
Type:      method_descriptor

### Усовершенствуем класс

In [39]:
class Transformer:
    """Base class for column-wise transformers with a fit/transform interface.

    The class handles input validation and the fit -> transform workflow.
    Subclasses provide the actual math by overriding ``_compute_params`` and
    ``_apply_params``. Whatever ``_compute_params`` returns is stored in
    ``self.params`` by ``fit``.
    """

    # class attribute (shared by all instances)
    legal_structures = [list, tuple, np.ndarray, pd.DataFrame]

    def __init__(self):
        """Create an unfitted transformer; there is nothing to configure."""

    def _validate_data(self, data):
        """Check ``data`` and return it as a 2-D numeric ``np.ndarray``.

        Raises:
            ValueError: if the container type is not allowed, the element
                dtype is not integer/floating, or the array is not 2-D.
        """
        # isinstance (not an exact type match) so subclasses of the allowed
        # containers are accepted as well
        if not isinstance(data, tuple(self.legal_structures)):
            raise ValueError(f'data structure {type(data)} not in legal structures: {self.legal_structures}')

        if isinstance(data, pd.DataFrame):
            data = data.to_numpy()
        elif not isinstance(data, np.ndarray):
            data = np.array(data)

        # accept every integer / floating dtype (float32, int32, ...),
        # not only the ones that compare equal to Python's float / int
        is_numeric = (
            np.issubdtype(data.dtype, np.integer)
            or np.issubdtype(data.dtype, np.floating)
        )
        if not is_numeric:
            raise ValueError(f'data type must be int or float, not {data.dtype}')

        # check if dimensionality is valid
        if data.ndim != 2:
            raise ValueError(f'dimensions number must be 2, not {data.ndim}')

        # data now is valid and has type np.ndarray
        return data

    def _compute_params(self, data):
        """Return the parameters learned from validated ``data``.

        Must be overridden in a child class; the base version raises so that a
        missing override fails loudly instead of storing ``None`` as params.
        """
        raise NotImplementedError(
            f'{type(self).__name__} must implement _compute_params()'
        )

    def _apply_params(self, data):
        """Return validated ``data`` transformed with ``self.params``.

        Must be overridden in a child class; the base version raises so that a
        missing override fails loudly instead of returning ``None``.
        """
        raise NotImplementedError(
            f'{type(self).__name__} must implement _apply_params()'
        )

    def fit(self, data):
        """Validate ``data``, compute params via ``_compute_params`` and store them.

        Returns:
            self, so that calls can be chained (``Child().fit(X).transform(X)``).

        Raises:
            ValueError: if ``data`` is invalid or has no rows.
        """
        data_valid = self._validate_data(data)

        # parameters cannot be estimated from zero rows
        if data_valid.shape[0] == 0:
            raise ValueError('data must contain at least one row')

        self.params = self._compute_params(data_valid)

        return self

    def transform(self, data):
        """Validate ``data`` and transform it via ``_apply_params``.

        Raises:
            AttributeError: if called before ``fit``.
            ValueError: if ``data`` is invalid.
        """
        if not hasattr(self, 'params'):
            raise AttributeError('you cannot perform transform before fit!')

        data_valid = self._validate_data(data)

        return self._apply_params(data_valid)

In [47]:
class Standardizer(Transformer):
    """Standardization: subtract the column mean and divide by the column std."""

    def _compute_params(self, data):
        """Return the per-column mean and standard deviation of ``data``."""
        return {
            'mean': data.mean(axis=0),
            'std': data.std(axis=0),
        }

    def _apply_params(self, data):
        """Return ``(data - mean) / std`` using the stored params."""
        std = self.params['std']

        # a constant column has std == 0; divide by 1 instead so the column
        # becomes all zeros rather than NaN / inf
        scale = np.where(std == 0, 1, std)

        return (data - self.params['mean']) / scale

class Normalizer(Transformer):
    """Min-max normalization: subtract the column min and divide by the column range."""

    def _compute_params(self, data):
        """Return the per-column minimum and maximum of ``data``."""
        return {
            'min': data.min(axis=0),
            'max': data.max(axis=0),
        }

    def _apply_params(self, data):
        """Return ``(data - min) / (max - min)`` using the stored params."""
        low = self.params['min']
        span = self.params['max'] - low

        # a constant column has max == min; divide by 1 instead so the column
        # becomes all zeros rather than NaN / inf
        span = np.where(span == 0, 1, span)

        return (data - low) / span

In [44]:
std = Standardizer()

In [None]:
std.