In [1]:
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline


In [2]:
RANDOM_STATE = 42

In [3]:
dataset = load_boston()
X = pd.DataFrame(dataset.data)
X.columns = dataset.feature_names
y = dataset.target

X.tail()


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.12,76.7,2.2875,1.0,273.0,21.0,396.9,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.9,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48
505,0.04741,0.0,11.93,0.0,0.573,6.03,80.8,2.505,1.0,273.0,21.0,396.9,7.88


1. Разделите выборку на обучающую и тестовую в отношении 80%/20%

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((404, 13), (102, 13), (404,), (102,))

2. Обучите стандартную регрессию, а также Ridge и  Lasso и параметрами по умолчанию и выведите их R2 на тестовой выборке

In [5]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
r2 = r2_score(y_test, y_pred)
r2

0.6687594935356278

In [6]:
ridge = Ridge()
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
r2_ridge = r2_score(y_test, y_pred)
r2_ridge

0.666222167016852

In [7]:
%%time
lasso = Lasso(alpha=1)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
r2_lasso = r2_score(y_test, y_pred)
r2_lasso

CPU times: total: 0 ns
Wall time: 7.83 ms


0.6671453631686304

3. Для Ridge и Lasso подберите коэффициент регуляризации(используйте GridSearchCV, RidgeCV, LassoCV) в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по лучшим моделям и сравните с предыдущими результатами. Напишите как изменился результат

In [8]:
grid_search_cv_lasso = GridSearchCV(lasso, {'alpha': [10**i for i in range(-5,6)]}, cv=5)
grid_search_cv_lasso.fit(X_train, y_train)
grid_search_cv_lasso.best_params_

{'alpha': 1e-05}

In [9]:
grid_search_cv_ridge = GridSearchCV(ridge, {'alpha': [10**i for i in range(-5,6)]}, cv=5)
grid_search_cv_ridge.fit(X_train, y_train)
grid_search_cv_ridge.best_params_

{'alpha': 1e-05}

In [10]:
lasso = Lasso(alpha=10**-5)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
r2_lasso_cv = r2_score(y_test, y_pred)
r2_lasso_cv

ridge = Ridge(alpha=10**-5)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
r2_ridge_cv = r2_score(y_test, y_pred)
r2_ridge_cv

print(r2_lasso_cv, r2_ridge_cv, sep='\n')

0.6687598638315153
0.6687594856409733


In [11]:
lasso_cv = LassoCV(alphas=[10**i for i in range(-5,6)], cv=5).fit(X_train, y_train)
y_pred = lasso_cv.predict(X_test)
r2_lasso_cv = r2_score(y_test, y_pred)
r2_lasso_cv

0.6687598638315153

In [12]:
ridge_cv = RidgeCV(alphas=[10**i for i in range(-5,6)], cv=5).fit(X_train, y_train)
y_pred = ridge_cv.predict(X_test)
r2_ridge_cv = r2_score(y_test, y_pred)
r2_ridge_cv

0.6687594856409733

In [13]:
# R2 вырос на десятые процента
print((r2_lasso_cv-r2_lasso)*100, (r2_ridge_cv-r2_ridge)*100, sep='\n')

0.16145006628849323
0.2537318624121321


In [14]:
# Непонятно, в связи с чем get_params не подсвечивает лучший alpha
ridge_cv = RidgeCV(alphas=[10**i for i in range(-5,6)], cv=5).fit(X_train, y_train)
ridge_cv.get_params()

{'alpha_per_target': False,
 'alphas': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000, 100000],
 'cv': 5,
 'fit_intercept': True,
 'gcv_mode': None,
 'normalize': 'deprecated',
 'scoring': None,
 'store_cv_values': False}

4. Проведите масштабирование выборки(используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [15]:
pipeline = Pipeline([('scaler',StandardScaler()), ('clf', Lasso(alpha=10**-5))])
pipeline.fit(X_train, y_train)
r2_lasso_scaler = pipeline.score(X_test, y_test)
r2_lasso_scaler

0.668759038334717

In [16]:
pipeline = Pipeline([('scaler',StandardScaler()), ('clf', Ridge(alpha=10**-5))])
pipeline.fit(X_train, y_train)
r2_ridge_scaler = pipeline.score(X_test, y_test)
r2_ridge_scaler


0.6687594905710064

In [17]:
# R2 существенно не изменился
print((r2_lasso_scaler-r2_lasso_cv)*100, (r2_ridge_scaler-r2_ridge_cv)*100, sep='\n')

-8.254967983623018e-05
4.930033048466953e-07


In [19]:
# Нормализация (0,1) привела к несущественному росту R2 к стандартизации
pipeline = Pipeline([('scaler', MinMaxScaler()), ('clf', Lasso(alpha=10**-5))])
pipeline.fit(X_train, y_train)
r2_lasso_scaler = pipeline.score(X_test, y_test)
r2_lasso_scaler

0.6687605073677362

In [20]:
pipeline = Pipeline([('scaler',MinMaxScaler()), ('clf', Ridge(alpha=10**-5))])
pipeline.fit(X_train, y_train)
r2_ridge_scaler = pipeline.score(X_test, y_test)
r2_ridge_scaler

0.6687596289762525

5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [21]:
scaler = StandardScaler()
scaler.fit(X)
X_0 = pd.DataFrame(scaler.transform(X))

X_0.columns = dataset.feature_names
X_0.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
501,-0.413229,-0.487722,0.115738,-0.272599,0.158124,0.439316,0.018673,-0.625796,-0.982843,-0.803212,1.176466,0.387217,-0.418147
502,-0.415249,-0.487722,0.115738,-0.272599,0.158124,-0.234548,0.288933,-0.716639,-0.982843,-0.803212,1.176466,0.441052,-0.50085
503,-0.413447,-0.487722,0.115738,-0.272599,0.158124,0.98496,0.797449,-0.773684,-0.982843,-0.803212,1.176466,0.441052,-0.983048
504,-0.407764,-0.487722,0.115738,-0.272599,0.158124,0.725672,0.736996,-0.668437,-0.982843,-0.803212,1.176466,0.403225,-0.865302
505,-0.415,-0.487722,0.115738,-0.272599,0.158124,-0.362767,0.434732,-0.613246,-0.982843,-0.803212,1.176466,0.441052,-0.669058


In [22]:
y_0 = y
y_0 = y_0.reshape(-1,1)
scaler.fit(y_0)
y_0 = scaler.transform(y_0)

y_0[:5]

array([[ 0.15968566],
       [-0.10152429],
       [ 1.32424667],
       [ 1.18275795],
       [ 1.48750288]])

In [23]:
X_0_train, X_0_test, y_0_train, y_0_test = train_test_split(X_0, y_0, test_size=0.2, random_state=42)
X_0_train.shape, X_0_test.shape, y_0_train.shape, y_0_test.shape

((404, 13), (102, 13), (404, 1), (102, 1))

In [24]:
# Вывод: незначительно хуже, чем без стандартизации
ridge_cv = RidgeCV(alphas=[10**i for i in range(-5,6)], cv=5).fit(X_0_train, y_0_train)

r2_ridge_cv = ridge_cv.score(X_0_test, y_0_test)
r2_ridge_cv


0.6684401592810274

In [25]:
# Можно было не нормализовать y (несущественно на данной выборке?)
scaler = StandardScaler()
scaler.fit(X)
X_0 = pd.DataFrame(scaler.transform(X))
X_0.columns = dataset.feature_names

X_0_train, X_0_test, y_train, y_test = train_test_split(X_0, y, test_size=0.2, random_state=42)
X_0_train.shape, X_0_test.shape, y_train.shape, y_test.shape

ridge_cv = RidgeCV(alphas=[10**i for i in range(-5,6)], cv=5).fit(X_0_train, y_train)
r2_ridge_cv = ridge_cv.score(X_0_test, y_test)

lasso_cv = LassoCV(alphas=[10**i for i in range(-5,6)], cv=5).fit(X_0_train, y_train)
r2_lasso_cv = lasso_cv.score(X_0_test, y_test)

print(r2_lasso_cv, r2_ridge_cv, sep='\n')

0.6687590128437058
0.6684401592810272


In [26]:
# Вывод: нормализация (0,1) приводит к росту R2 на 0,15% при l2 регуляризации, незначительное изменение при l1 регуляризации
scaler = MinMaxScaler()
scaler.fit(X)
X_0 = pd.DataFrame(scaler.transform(X))
X_0.columns = dataset.feature_names
X_0.tail()

X_0_train, X_0_test, y_0_train, y_0_test = train_test_split(X_0, y, test_size=0.2, random_state=42)
X_0_train.shape, X_0_test.shape, y_0_train.shape, y_0_test.shape

ridge_cv = RidgeCV(alphas=[10**i for i in range(-5,6)], cv=5).fit(X_0_train, y_0_train)
r2_ridge_cv = ridge_cv.score(X_0_test, y_0_test)

lasso_cv = LassoCV(alphas=[10**i for i in range(-5,6)], cv=5).fit(X_0_train, y_0_train)
r2_lasso_cv = lasso_cv.score(X_0_test, y_0_test)

print(r2_lasso_cv, r2_ridge_cv, sep='\n')

0.668760771409485
0.6702995459264811


In [28]:
# Лаконичный код с RidgeCV/LassoCV , если не требуется выводить df / target с масш.значениями
pipeline = Pipeline([('scaler', MinMaxScaler()), ('clf', RidgeCV(alphas=[10**i for i in range(-5,6)], cv=5))])
pipeline.fit(X_train, y_train)
r2_ridge_scaler = pipeline.score(X_test, y_test)
r2_ridge_scaler

0.6700309977617651

6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [29]:
poly = PolynomialFeatures(2)
X_1 = poly.fit_transform(X_0)
X_1 = pd.DataFrame(X_1)

X_1.tail(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
504,1.0,0.001161,0.0,0.420455,0.0,0.386831,0.619467,0.889804,0.114514,0.0,...,0.026936,0.146662,0.162694,0.021512,0.798551,0.885843,0.117127,0.982677,0.12993,0.01718
505,1.0,0.000462,0.0,0.420455,0.0,0.386831,0.473079,0.802266,0.125072,0.0,...,0.026936,0.146662,0.164122,0.027852,0.798551,0.893617,0.151649,1.0,0.169702,0.028799


In [30]:
#Вывод: R2 вырос на 18,1-18,2%

X_1_train, X_1_test, y_1_train, y_1_test = train_test_split(X_1, y, test_size=0.2, random_state=42)

lasso_cv = RidgeCV(alphas=[10**i for i in range(-5,6)], cv=5)
lasso_cv.fit(X_1_train, y_1_train)
r2_lasso_scaler_1 = lasso_cv.score(X_1_test, y_1_test)

ridge_cv = RidgeCV(alphas=[10**i for i in range(-5,6)], cv=5)
ridge_cv.fit(X_1_train, y_1_train)
r2_ridge_scaler_1 = ridge_cv.score(X_1_test, y_1_test)

print(100*(r2_lasso_scaler_1-r2_lasso_cv), 100*(r2_ridge_scaler_1-r2_ridge_cv), sep='\n')

18.231340122170646
18.077462670471036


In [31]:
%%time
# Вариант лаконичного кода
# Разница в R2 меньше на 1% по l1 регуляризации в отличии то того, когда масштабируем и добавляем полиномы вручную,
# (в чем причина?)

pipeline = Pipeline([('transform_1', MinMaxScaler()), ('transform_2', PolynomialFeatures()), 
('clf', LassoCV(alphas=[10**i for i in range(-5,6)], cv=5))])
pipeline.fit(X_train, y_train)
r2_lasso_scaler_2 = pipeline.score(X_test, y_test)

pipeline = Pipeline([('transform_1', MinMaxScaler()), ('transform_2', PolynomialFeatures()), 
('clf', RidgeCV(alphas=[10**i for i in range(-5,6)], cv=5))])
pipeline.fit(X_train, y_train)
r2_ridge_scaler_2 = pipeline.score(X_test, y_test)

print(100*(r2_lasso_scaler_2-r2_lasso_cv), 100*(r2_ridge_scaler_2-r2_ridge_cv), sep='\n')


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent(


17.029739664234565
17.97634963023933
CPU times: total: 547 ms
Wall time: 652 ms


7. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2. Напишите как изменился R2 по сравнению с предыдущими экспериментами

In [32]:
pipeline = Pipeline([('transform_1', MinMaxScaler()), ('transform_2', PolynomialFeatures()),
('clf', Ridge())])

parameters = [
    {
        'transform_1': (MinMaxScaler(),),
        'transform_2__degree': range(2,4),
        
        'clf': (Ridge(),),
        'clf__alpha': [10**i for i in range(-5,6)],
    }, {
        'transform_1': (MinMaxScaler(),),
        'transform_2__degree': range(2,4),
        
        'clf': (Lasso(),),
        'clf__alpha': [10**i for i in range(-5,6)]
    }, {
        'transform_1': (StandardScaler(),),
        'transform_2__degree': range(2,4),
        
        'clf': (Ridge(),),
        'clf__alpha': [10**i for i in range(-5,6)],
    }, {
        'transform_1': (StandardScaler(),),
        'transform_2__degree': range(2,4),
        
        'clf': (Lasso(),),
        'clf__alpha': [10**i for i in range(-5,6)]
    }
]

In [33]:
grid_search = GridSearchCV(pipeline, parameters, cv=5)
grid_search.fit(X_train, y_train)
grid_search.best_params_

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

{'clf': Lasso(alpha=0.001),
 'clf__alpha': 0.001,
 'transform_1': MinMaxScaler(),
 'transform_2__degree': 3}

In [34]:
# Результат ниже на 0,6% чем наиболее высокие из ранее найденных R2 
best_clf = grid_search.best_estimator_
best_clf.get_params()
score = best_clf.score(X_test, y_test)

r2_ridge_scaler_1 = ridge_cv.score(X_1_test, y_1_test)
print(score, r2_lasso_scaler_1, r2_ridge_scaler_1, sep='\n')

0.8449734005131505
0.8510741726311915
0.8510741726311915


In [35]:
# Результат > 85% (самый высокий) был достингут применением LassoCV/RidgeCV
#  на стандартный полином (2) и масштабирование (0,1), проверим ниже на Ridge - R2 выше 85%.
#  Есть вероятность, что пайплайн выше отработал некорректно из-за ошибки.
pipeline = Pipeline([('transform_1', MinMaxScaler()), ('transform_2', PolynomialFeatures(3)), 
('clf', RidgeCV(alphas=[10**i for i in range(-5,6)]))])
pipeline.fit(X_train, y_train)
r2 = pipeline.score(X_test, y_test)
r2

0.8588479918651002

In [36]:
# Поскольку ошибка возникает на lasso, возьмем только ridge
#  R2 уже выше остальных (86%), адекватный результат
pipeline = Pipeline([('transform_1', MinMaxScaler()), ('transform_2', PolynomialFeatures()),
('clf', Ridge())])

parameters = [
    {
        'transform_1': (MinMaxScaler(),),
        'transform_2__degree': range(2,4),
        
        'clf': (Ridge(),),
        'clf__alpha': [10**i for i in range(-5,6)],
    }, {
        'transform_1': (StandardScaler(),),
        'transform_2__degree': range(2,4),
        
        'clf': (Ridge(),),
        'clf__alpha': [10**i for i in range(-5,6)],
    }
]
grid_search = GridSearchCV(pipeline, parameters, cv=5)
grid_search.fit(X_train, y_train)
best_clf = grid_search.best_estimator_
score = best_clf.score(X_test, y_test)
score

0.8588479918651302

In [37]:
best_clf.get_params()

{'memory': None,
 'steps': [('transform_1', MinMaxScaler()),
  ('transform_2', PolynomialFeatures(degree=3)),
  ('clf', Ridge(alpha=0.1))],
 'verbose': False,
 'transform_1': MinMaxScaler(),
 'transform_2': PolynomialFeatures(degree=3),
 'clf': Ridge(alpha=0.1),
 'transform_1__clip': False,
 'transform_1__copy': True,
 'transform_1__feature_range': (0, 1),
 'transform_2__degree': 3,
 'transform_2__include_bias': True,
 'transform_2__interaction_only': False,
 'transform_2__order': 'C',
 'clf__alpha': 0.1,
 'clf__copy_X': True,
 'clf__fit_intercept': True,
 'clf__max_iter': None,
 'clf__normalize': 'deprecated',
 'clf__positive': False,
 'clf__random_state': None,
 'clf__solver': 'auto',
 'clf__tol': 0.001}

In [38]:
# Более лаконичный код, все еще предполож-но некорректная работа с l1 регуляризацией, почему?
pipeline = Pipeline([('transform_1', MinMaxScaler()), ('transform_2', PolynomialFeatures()),
('clf', Ridge())])

parameters = [
    {
        'transform_1': (MinMaxScaler(),StandardScaler()),
        'transform_2__degree': range(2,4),
        
        'clf': (Ridge(),Lasso()),
        'clf__alpha': [10**i for i in range(-5,6)],
    }
]
grid_search = GridSearchCV(pipeline, parameters, cv=5)
grid_search.fit(X_train, y_train)
best_clf = grid_search.best_estimator_
score = best_clf.score(X_test, y_test)
score

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

0.8449734005131505

http://archive.ics.uci.edu/ml/datasets/Adult

In [2]:
link = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
data = pd.read_csv(link, header=None)

In [3]:
data.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K
48838,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,0,0,40,United-States,<=50K
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K
48841,35,Self-emp-inc,182148,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,60,United-States,>50K


8. Разделите выборку на признаки и целевую переменную(колонка со зачениями {<=50K,>50K}). Замените целевую переменную на числовые значения.

In [7]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]
y.unique()

array([0, 1], dtype=object)

In [8]:
y[y=='<=50K'] = 0
y[y=='>50K'] = 1
y=y.astype('int')
y.unique()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y=='<=50K'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[y=='>50K'] = 1


array([0, 1])

9. Выясните, присутствуют ли в данных пропуски. Заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [43]:
# NaN отсутствуют
X[X==np.nan].count()


0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64

In [44]:
X[X==0].count()

0         0
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         0
9         0
10    44807
11    46560
12        0
13        0
dtype: int64

In [45]:
# Заемним нули для тренировки
from sklearn.impute import SimpleImputer

X_test = X

imp = SimpleImputer(missing_values=0, strategy='most_frequent')
imp.fit(X_test.iloc[:,10:12])
X_test.iloc[:,10:12] = imp.transform(X_test.iloc[:,10:12])

In [46]:
X_test[X_test==0].count()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
dtype: int64

In [47]:
# Поскольку требовалось заменить пропуски а не нули, далее работаем с исходным X
imp = SimpleImputer(strategy='most_frequent')
imp.fit(X)
X = imp.transform(X)
X

array([[39, 'State-gov', 77516, ..., 1902, 40, 'United-States'],
       [50, 'Self-emp-not-inc', 83311, ..., 1902, 13, 'United-States'],
       [38, 'Private', 215646, ..., 1902, 40, 'United-States'],
       ...,
       [38, 'Private', 374983, ..., 1902, 50, 'United-States'],
       [44, 'Private', 83891, ..., 1902, 40, 'United-States'],
       [35, 'Self-emp-inc', 182148, ..., 1902, 60, 'United-States']],
      dtype=object)

In [48]:
X = pd.DataFrame(X)
X.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,15024,1902,36,United-States
48838,64,?,321403,HS-grad,9,Widowed,?,Other-relative,Black,Male,15024,1902,40,United-States
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,15024,1902,50,United-States
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,1902,40,United-States
48841,35,Self-emp-inc,182148,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,1902,60,United-States


In [49]:
# Странно что датафреймы не идентичны, тк проверка isna / ==np.nan показала что
#  пропусков нет, далее станет понятно что слетели типы данных колонок
X.equals(data.iloc[:,:-1])

False

10. Выберите колонки с числовыми и категориальными переменными.

In [67]:
# Теперь понтяно почему датафреймы не были идентичны. Слетел тип колонок после SimpleImputer
X.dtypes

0      Int64
1     string
2      Int64
3     string
4      Int64
5     string
6     string
7     string
8     string
9     string
10     Int64
11     Int64
12     Int64
13    string
dtype: object

In [68]:
X = X.convert_dtypes()
X.dtypes

0      Int64
1     string
2      Int64
3     string
4      Int64
5     string
6     string
7     string
8     string
9     string
10     Int64
11     Int64
12     Int64
13    string
dtype: object

In [69]:
X_num = X.select_dtypes('number')
X_num.columns

Int64Index([0, 2, 4, 10, 11, 12], dtype='int64')

In [187]:
X_obj = X.select_dtypes(exclude='number')
X_obj.columns

Int64Index([1, 3, 5, 6, 7, 8, 9, 13], dtype='int64')

11. Создайте пайплайн по обработке колонок(используйте OneHotEncoder,MinMaxScaler).

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn2pmml import PMMLPipeline
from sklearn_pandas import DataFrameMapper
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import precision_score, f1_score


In [183]:
# ColumnTransformer и mapper применяют OneHotEncoder ко всем колонкам, формируя из поданных фичей одну в виде string
#  Ответ был найедн под конец домашки, необходимо оборачивать масшт-е функции в ColTransformer

trf1 = ColumnTransformer(transformers =[
    ('cat', OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13]),
    ('num', MinMaxScaler(), [0, 2, 4, 10, 11, 12]),
      
])

first_step = trf1.fit_transform(X)
pd.DataFrame(first_step).head()

Unnamed: 0,0
0,"(0, 7)\t1.0\n (0, 18)\t1.0\n (0, 29)\t1.0\..."
1,"(0, 6)\t1.0\n (0, 18)\t1.0\n (0, 27)\t1.0\..."
2,"(0, 4)\t1.0\n (0, 20)\t1.0\n (0, 25)\t1.0\..."
3,"(0, 4)\t1.0\n (0, 10)\t1.0\n (0, 27)\t1.0\..."
4,"(0, 4)\t1.0\n (0, 18)\t1.0\n (0, 27)\t1.0\..."


In [None]:
mapper = DataFrameMapper(
    [(d, LabelEncoder()) for d in [0, 2, 4, 10, 11, 12]]
)

lm = PMMLPipeline([("mapper", mapper),
                   ("regressor", OneHotEncoder())])
lm.fit(X, y)
Df = pd.DataFrame(lm.transform(X))
Df.head()

Unnamed: 0,0
0,"(0, 22)\t1.0\n (0, 3535)\t1.0\n (0, 28609)..."
1,"(0, 33)\t1.0\n (0, 3862)\t1.0\n (0, 28609)..."
2,"(0, 21)\t1.0\n (0, 18416)\t1.0\n (0, 28605..."
3,"(0, 36)\t1.0\n (0, 20069)\t1.0\n (0, 28603..."
4,"(0, 11)\t1.0\n (0, 25479)\t1.0\n (0, 28609..."


In [97]:
# В таком случае работаем вручную с категор. и количеств. признаками
enc = OneHotEncoder()
enc.fit(X_obj)

a = enc.transform(X_obj).toarray()
b = pd.DataFrame(a.tolist())
b.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,92,93,94,95,96,97,98,99,100,101
48837,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48838,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48839,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48840,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48841,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [98]:
enc = MinMaxScaler()
enc.fit(X_num)

a = enc.transform(X_num)
c = pd.DataFrame(a)
c.tail()

Unnamed: 0,0,1,2,3,4,5
48837,0.30137,0.137428,0.8,0.149272,0.415853,0.357143
48838,0.643836,0.20913,0.533333,0.149272,0.415853,0.397959
48839,0.287671,0.245379,0.8,0.149272,0.415853,0.5
48840,0.369863,0.048444,0.8,0.053471,0.415853,0.397959
48841,0.246575,0.114919,0.8,0.149272,0.415853,0.602041


In [99]:
# Итоговый дф
Df = pd.concat([b, c], axis=1)
Df.columns = [i for i in range(0,108)]
Df.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
48837,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.30137,0.137428,0.8,0.149272,0.415853,0.357143
48838,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.643836,0.20913,0.533333,0.149272,0.415853,0.397959
48839,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.245379,0.8,0.149272,0.415853,0.5
48840,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.369863,0.048444,0.8,0.053471,0.415853,0.397959
48841,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.246575,0.114919,0.8,0.149272,0.415853,0.602041


In [197]:
# Для кор.работы с OneHot в ColTransformer необходимо выводить результаты в array, а не в sparse matrix
trf2 = ColumnTransformer(transformers =[
    ('cat', OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13]),
    ('num', MinMaxScaler(), [0, 2, 4, 10, 11, 12]),
      
], sparse_threshold=0)

first_step = trf2.fit_transform(X)
pd.DataFrame(first_step).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.30137,0.044131,0.8,0.020624,0.415853,0.397959
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.452055,0.048052,0.8,0.149272,0.415853,0.122449
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.137581,0.533333,0.149272,0.415853,0.397959
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.493151,0.150486,0.4,0.149272,0.415853,0.397959
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.150685,0.220635,0.8,0.149272,0.415853,0.397959


In [198]:
# Еще одина вариант с ColTransformer

trf2 = ColumnTransformer(transformers =[
    ('cat', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13]),
    ('num', MinMaxScaler(), [0, 2, 4, 10, 11, 12]),
      
])

first_step = trf2.fit_transform(X)
pd.DataFrame(first_step).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.30137,0.044131,0.8,0.020624,0.415853,0.397959
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.452055,0.048052,0.8,0.149272,0.415853,0.122449
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.137581,0.533333,0.149272,0.415853,0.397959
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.493151,0.150486,0.4,0.149272,0.415853,0.397959
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.150685,0.220635,0.8,0.149272,0.415853,0.397959


In [218]:
pipel = Pipeline([
    ('cat', trf2), ('clf', SVC())
      
])

first_step = trf2.fit_transform(X)
pd.DataFrame(first_step).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.30137,0.044131,0.8,0.020624,0.415853,0.397959
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.452055,0.048052,0.8,0.149272,0.415853,0.122449
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.137581,0.533333,0.149272,0.415853,0.397959
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.493151,0.150486,0.4,0.149272,0.415853,0.397959
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.150685,0.220635,0.8,0.149272,0.415853,0.397959


In [217]:
pipel_1 = Pipeline([
    ('cat', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13]),
    ('num', MinMaxScaler(), [0, 2, 4, 10, 11, 12]), ('clf', SVC())
      
])

first_step = trf2.fit_transform(X)
Dg = pd.DataFrame(first_step)
Dg.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,107
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.30137,0.044131,0.8,0.020624,0.415853,0.397959
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.452055,0.048052,0.8,0.149272,0.415853,0.122449
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.137581,0.533333,0.149272,0.415853,0.397959
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.493151,0.150486,0.4,0.149272,0.415853,0.397959
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.150685,0.220635,0.8,0.149272,0.415853,0.397959


In [219]:
# обучение пайплайна работает с ColTransfromer, но не работает
#  с OneHot непосредственно в пайплайне
pipel.fit(X, y)
pipel.get_params()

{'memory': None,
 'steps': [('cat',
   ColumnTransformer(transformers=[('cat', OneHotEncoder(sparse=False),
                                    [1, 3, 5, 6, 7, 8, 9, 13]),
                                   ('num', MinMaxScaler(), [0, 2, 4, 10, 11, 12])])),
  ('clf', SVC())],
 'verbose': False,
 'cat': ColumnTransformer(transformers=[('cat', OneHotEncoder(sparse=False),
                                  [1, 3, 5, 6, 7, 8, 9, 13]),
                                 ('num', MinMaxScaler(), [0, 2, 4, 10, 11, 12])]),
 'clf': SVC(),
 'cat__n_jobs': None,
 'cat__remainder': 'drop',
 'cat__sparse_threshold': 0.3,
 'cat__transformer_weights': None,
 'cat__transformers': [('cat',
   OneHotEncoder(sparse=False),
   [1, 3, 5, 6, 7, 8, 9, 13]),
  ('num', MinMaxScaler(), [0, 2, 4, 10, 11, 12])],
 'cat__verbose': False,
 'cat__verbose_feature_names_out': True,
 'cat__cat': OneHotEncoder(sparse=False),
 'cat__num': MinMaxScaler(),
 'cat__cat__categories': 'auto',
 'cat__cat__drop': None,
 'cat__cat__d

In [76]:
# Обучили пайплан на Df, которая собрана вручную

pipeline = Pipeline([('clf', SVC())])
pipeline.fit(Df, y)
pipeline.get_params()

{'memory': None,
 'steps': [('clf', SVC())],
 'verbose': False,
 'clf': SVC(),
 'clf__C': 1.0,
 'clf__break_ties': False,
 'clf__cache_size': 200,
 'clf__class_weight': None,
 'clf__coef0': 0.0,
 'clf__decision_function_shape': 'ovr',
 'clf__degree': 3,
 'clf__gamma': 'scale',
 'clf__kernel': 'rbf',
 'clf__max_iter': -1,
 'clf__probability': False,
 'clf__random_state': None,
 'clf__shrinking': True,
 'clf__tol': 0.001,
 'clf__verbose': False}

In [101]:
abc = pipeline.decision_function(Df)

In [167]:
len(abc)

48842

In [107]:
# Предсказанные значения на SVC
abcc = pipeline.predict(Df)

In [111]:
np.unique(abcc)

array([0, 1])

12. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [126]:
pd.DataFrame(y).groupby(y).count()

Unnamed: 0_level_0,14
14,Unnamed: 1_level_1
0,37155
1,11687


In [131]:
# Самый частый класс - 0 (<= 50K)
# Accuracy для самого частого класса - это Precision для этого класса TP/(TP+FP)
# В качестве классификатора возьмем SVC

# abcc = pipeline.predict(Df)

precision_score(y, abcc, pos_label=0)


0.8667835671342685

In [135]:
f1_score(y, abcc, pos_label=0)

0.8978786895880637

13. Посчитайте cross_val_score по алгоритмам LogisticRegression, SVC, LinearSVC по метрикам accuracy и f1_score.
Напишите удалось ли превзойти предыдущий результат.

In [169]:
# Вывод: в предыдущем шаге выбрали SVC. Для LinearSVC и LogRegr значение cross_val выше на 0,3%,
#  precision и f1 выше на десятые % и ниже на десятые % соответственно


# SVC

cross_val_SVC = cross_val_score(pipeline, Df, y).mean()
cross_val_SVC

0.8326441452759397

In [150]:
pipe_linearSVC = make_pipeline(LinearSVC())
cross_val_linearSVC = cross_val_score(pipe_linearSVC, Df, y).mean()
cross_val_linearSVC

0.8358586103109596

In [227]:
# Для сходимости лог.регр-ии по гр.спуску задали max_iter
pipe_regr = make_pipeline(LogisticRegression(max_iter=300))
cross_val_regr = cross_val_score(pipe_regr, Df, y).mean()
cross_val_regr

0.8358176665577627

In [225]:
pipe_linearSVC.fit(Df,y)
abcc_linear = pipe_linearSVC.predict(Df)
print(precision_score(y, abcc_linear, pos_label=0))
print(f1_score(y, abcc_linear, pos_label=0))

0.8677966955470482
0.8965768485148129


In [228]:
pipe_regr.fit(Df,y)
abcc_regr = pipe_regr.predict(Df)
print(precision_score(y, abcc_regr, pos_label=0))
print(f1_score(y, abcc_regr, pos_label=0))

0.8684662390788344
0.8961527938819911


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями (испольуйте SimpleImputer)

In [152]:
imp = SimpleImputer(missing_values='?', strategy='most_frequent')
imp.fit(X_obj)
X_obj = imp.transform(X_obj)

In [153]:
enc = OneHotEncoder()
enc.fit(X_obj)

a = enc.transform(X_obj).toarray()
b = pd.DataFrame(a.tolist())
b.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,89,90,91,92,93,94,95,96,97,98
48837,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48838,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48839,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48840,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48841,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [154]:
Df_1 = pd.concat([b, c], axis=1)
Df_1.columns = [i for i in range(0,105)]
Df_1.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,95,96,97,98,99,100,101,102,103,104
48837,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.30137,0.137428,0.8,0.149272,0.415853,0.357143
48838,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.643836,0.20913,0.533333,0.149272,0.415853,0.397959
48839,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.287671,0.245379,0.8,0.149272,0.415853,0.5
48840,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.369863,0.048444,0.8,0.053471,0.415853,0.397959
48841,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.246575,0.114919,0.8,0.149272,0.415853,0.602041


15. Посчитайте cross_val_score на новых данных. Напишите удалось ли улучшить результат.

In [220]:
# Вопрос: Можно ли утверждать, что cross_val_score НЕ всегда (!) показывает наиболее оптимальную модель
#  для обучения? Ведь при каждом шаге кросс-валидации класс-ор ищет другие оптимальные функции для класс-ции объектов
# То есть SVC (как пример) с определенными параметрами показавший не луший cv_score (при выборе в grid search или по рез-ам cross_val_score.mean)
#  может в целом по всем объектам строить наиболее оптимальную разделяющую плоскость для объектов (для тестовой выборки в неклассификационных моделях)

# Изменение cross_val нулевое, однако в гугл коллаб разница получилась на 0,1-0,2%. 
# Как оптимизировать работу кода, ноут не мощный, но и в гугл коллаб выполняется более 10 минут

cross_val_SVC_New = cross_val_score(pipe_SVC, Df_1, y).mean()

cross_val_linearSVC_New = cross_val_score(pipe_linearSVC, Df_1, y).mean()

cross_val_regr_New = cross_val_score(pipe_regr, Df_1, y).mean()

print((cross_val_SVC_New-cross_val_SVC)*100, (cross_val_linearSVC_New-cross_val_linearSVC)*100,
 (cross_val_regr_New-cross_val_regr)*100, sep='\n')

0.0
0.0
0.0


16. Посчитайте cross_val_score, если просто удалить значения '?'. Напишите как изменился результат

In [229]:
ad = pd.concat([X, y], axis=1)

ad = ad[ad!='?']
ad.dropna(inplace=True)
ad.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,Male,15024,1902,40,United-States,0
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,15024,1902,36,United-States,0
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,15024,1902,50,United-States,0
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,1902,40,United-States,0
48841,35,Self-emp-inc,182148,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,1902,60,United-States,1


In [230]:
ad_X = ad.iloc[:,:-1]
ad_y = ad.iloc[:,-1]

In [257]:
# Столкнулся с проблемой при применении OneHot к датасету, так как
#  одна из категорий 13 столбца осталась в количестве 1, убираем соотв. строку из дф
# ValueError: Found unknown categories ['Holand-Netherlands'] in column 7 during transform
print(ad_X[ad_X.iloc[:,13] == 'Holand-Netherlands'].index.values)
ad_X_drop = ad_X.drop(19609)
ad_y_drop = ad_y.drop(19609)

[19609]


In [265]:
# Результат хуже на десятые %
trf = ColumnTransformer(transformers =[
    ('cat', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13]),
    ('num', MinMaxScaler(), [0, 2, 4, 10, 11, 12]),
])

pipel = Pipeline([
    ('cat', trf), ('clf', SVC())
])
print(cross_val_score(pipel, ad_X_drop, ad_y_drop
    ``).mean())

pipel = Pipeline([
    ('cat', trf), ('clf', LinearSVC())
])
print(cross_val_score(pipel, ad_X_drop, ad_y_drop
    ``).mean())

pipel = Pipeline([
    ('cat', trf), ('clf', LogisticRegression(max_iter=300))
])
print(cross_val_score(pipel, ad_X_drop, ad_y_drop
    ``).mean())

#first_step = trf.fit_transform(X)
#pd.DataFrame(first_step).head()



0.8291723284408465
0.8327547187156263
0.8322240045533794


 17. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier. Напишите как изменился результат и какой вывод можно из этого сделать.

In [263]:
# CV score вырос на несколько процентов на гр.бустинге и несущественно на рэндом форест
# Вывод: наиболее подходящая среди моделей это гр.бустинг на масштабируемых очищенных данных 

# Вопрос: Почему традиционно считается что гр.бустинг применим только для класс-ии?
#  Ведь можем улучшать некласс-ую модель, если требуется снизить смещение

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

pipel = Pipeline([
    ('cat', trf), ('clf', RandomForestClassifier())
])
print(cross_val_score(pipel, ad_X_drop, ad_y_drop).mean())

0.8368899812696311


In [264]:
pipel = Pipeline([
    ('cat', trf), ('clf', GradientBoostingClassifier())
])
print(cross_val_score(pipel, ad_X_drop, ad_y_drop).mean())

0.8603746978410811


18. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [11]:
print(X[X.iloc[:,13] == 'Holand-Netherlands'].index.values)
X_drop = X.drop(19609)
y_drop = y.drop(19609)

[19609]


In [12]:
# MinMax, StScaler, OneHot, SimpleImputer
# 
# Есть задача: применить OneHot к категор.колонкам и работать с остальной частью датафрейма в пайплайне,
# перебирая MinMax, Scaler, Polynom (через grid_search)
# Вопрос 1: - Можем ли мы засунуть машстабируемый метод в пайплайн так,
#             чтобы он применился и заменил указанные колонки на преобразованные (а не оставил только эти колонки)?
#          - Пробовал бороться с этим через ColTransformer, но колонки не заменяются а прибавляются последовательным применением
#             разных функций (т.е. остаются в т.ч. начальные столбцы, если вторая функция применятся для тех же колонок что предыдущая)
#          - Есть ли оптим. метод для решения такой задачи? Иначе нужно отдельно обрабатывать Df для использования в пайплайне, и не будет
#             возможности перебрать в grid_search параметры SimpleImputer, OneHot для категор. фичей и другие методы для количеств-х фичей


trf_a = ColumnTransformer(transformers = [
    ('trf_a_1', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13]),
    ('trf_a_2', MinMaxScaler(), [0, 2, 4, 10, 11, 12]),
    ('trf_a_3', PolynomialFeatures(), [0, 2, 4, 10, 11, 12])
    ])
trf_b = ColumnTransformer(transformers = [
    ('trf_b_1', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13]),
    ('trf_b_2', StandardScaler(), [0, 2, 4, 10, 11, 12]),
    ('trf_b_3', PolynomialFeatures(), [0, 2, 4, 10, 11, 12])
    ])
trf_c = ColumnTransformer(transformers = [
    ('trf_c_1', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13]),
    ('trf_c_2', MinMaxScaler(), [0, 2, 4, 10, 11, 12])
    ])
trf_d = ColumnTransformer(transformers = [
    ('trf_d_1', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13]),
    ('trf_d_2', StandardScaler(), [0, 2, 4, 10, 11, 12])
    ])
# trf_complex = ColumnTransformer(transformers = [
#    ('trf_a_1', MinMaxScaler(), [0, 2, 4, 10, 11, 12]),
#    ('trf_a_2', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13])
#    ('trf_a_3', PolynomialFeatures())
#    ], sparse_threshold=0)
# На всех фичах полином будет долго работать...

pipeline = Pipeline([('transform_2', trf_a), ('clf', SVC())])

parameters = [
    {
        'transform_2': (trf_a, trf_b, trf_c, trf_d),

        'clf': (SVC(),LinearSVC(), LogisticRegression(max_iter=300)),
    }
]

grid_search = GridSearchCV(pipeline, parameters, cv=5)
grid_search.fit(X_drop, y_drop)
best_clf = grid_search.best_estimator_
score = best_clf.score(X_drop, y_drop)
score

# C учетом того что при применении функция в ColTransf для одних и тех же фичей у нас не происходит замещений значений,
#  как вариант можно убрать полином из trf_a и trf_b, и добавить его в пайплайн с подбором по degree: [1,2], но тогда полином применится
#  в т.ч. к отсортированным по OneHot. SimpleImputer можно обернуть еще в один ColTrans по нужным фичам, 
#  а по оставшимся колонкам применить нейтральную вставку (подойдет SimpleImputer без параметров).
# Но для применения grid_search слишком много ColTransf... Данный код выполнялся 74 минуты

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8652975983292726

In [13]:
# Лучший вариант это стандартизация с моделью опорных векторов
best_clf.get_params()

{'memory': None,
 'steps': [('transform_2',
   ColumnTransformer(transformers=[('trf_d_1', OneHotEncoder(sparse=False),
                                    [1, 3, 5, 6, 7, 8, 9, 13]),
                                   ('trf_d_2', StandardScaler(),
                                    [0, 2, 4, 10, 11, 12])])),
  ('clf', SVC())],
 'verbose': False,
 'transform_2': ColumnTransformer(transformers=[('trf_d_1', OneHotEncoder(sparse=False),
                                  [1, 3, 5, 6, 7, 8, 9, 13]),
                                 ('trf_d_2', StandardScaler(),
                                  [0, 2, 4, 10, 11, 12])]),
 'clf': SVC(),
 'transform_2__n_jobs': None,
 'transform_2__remainder': 'drop',
 'transform_2__sparse_threshold': 0.3,
 'transform_2__transformer_weights': None,
 'transform_2__transformers': [('trf_d_1',
   OneHotEncoder(sparse=False),
   [1, 3, 5, 6, 7, 8, 9, 13]),
  ('trf_d_2', StandardScaler(), [0, 2, 4, 10, 11, 12])],
 'transform_2__verbose': False,
 'transform_2__verb

In [15]:
best_clf.fit(X_drop, y_drop)

In [17]:
predict = best_clf.predict(X_drop)
print(precision_score(y_drop, predict, pos_label=0))
print(f1_score(y_drop, predict, pos_label=0))

0.8843301405334741
0.9144818083737375


In [36]:
# Для решения проблемы в ColTransf можно встроить пайплайн, но есть ли более оптимальное?
#  Это решение не позволит нам перебирать параметры трансформаторов в grid_search
#  Код выполнялся более часа
# Вопрос 2: Есть предположение, что после OneHot нет смысла использовать доп. масштабирование, верно ли?
#  Классификатор четко разбивает по колонкам, и видимо нет смысла переобучать. А полином применять на нормализованные значения

trf_a = ColumnTransformer([
    ('trf_a_1', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13]),
    ('trf_a_2', Pipeline([
        ('1', MinMaxScaler()),
        ('2', PolynomialFeatures())
        ]), [0, 2, 4, 10, 11, 12]) 
    ], remainder ='passthrough')


pipeline = Pipeline([('transform', trf_a), ('clf', SVC())])

parameters = [
    {
        'transform__transformers': ([
    ('trf_a_1', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13]),
    ('trf_a_2', Pipeline([
        ('1', MinMaxScaler()),
        ('2', PolynomialFeatures())
        ]), [0, 2, 4, 10, 11, 12]) 
    ],

    [ # Убираем '?' и применяем MinMax вместо Standart
    ('trf_a_1', Pipeline([
        ('1', SimpleImputer(missing_values='?', strategy='most_frequent')),
        ('2', OneHotEncoder(sparse=False))
        ]), [1, 3, 5, 6, 7, 8, 9, 13]),
    ('trf_a_2', Pipeline([
        ('1', StandardScaler()),
        ('2', PolynomialFeatures())
        ]), [0, 2, 4, 10, 11, 12]) 
    ]),

        'clf': (SVC(),LinearSVC(), LogisticRegression(max_iter=300),
        GradientBoostingClassifier(), RandomForestClassifier()),
    }
]

grid_search_1 = GridSearchCV(pipeline, parameters, cv=5)
grid_search_1.fit(X_drop, y_drop)
best_clf_1 = grid_search_1.best_estimator_
score_1 = best_clf_1.score(X_drop, y_drop)
score_1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8696177391945292

In [37]:
# Еще более высокие значения

best_clf_1.fit(X_drop, y_drop)
predict = best_clf_1.predict(X_drop)
print(precision_score(y_drop, predict, pos_label=0))
print(f1_score(y_drop, predict, pos_label=0))

0.885036770223623
0.9174402323289944


In [None]:
# Если бы работало быстро, то можно было б запустить полноценный отбор ниже

### Заметка: В ColTrans не работает make_pipline, который позволяет не называть трансформаторы

trf_a = ColumnTransformer([
    ('1', MinMaxScaler(), [0, 2, 4, 10, 11, 12])
    ], remainder ='passthrough')

trf_b = ColumnTransformer([
    ('1', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13])
    ], remainder ='passthrough')

pipeline = Pipeline([('transform1', trf_a), ('transform2', trf_b), ('clf', SVC())])

parameters = [
    {
        'transform1__transformers': ([
    ('1', MinMaxScaler(), [0, 2, 4, 10, 11, 12]) 
    ],

    [
    ('1', StandardScaler(), [0, 2, 4, 10, 11, 12]) 
    ],

    [
    ('1', Pipeline([
        ('1', StandardScaler()),
        ('2', PolynomialFeatures())
        ]), [0, 2, 4, 10, 11, 12])
    ],

    [
    ('1', Pipeline([
        ('1', MinMaxScaler()),
        ('2', PolynomialFeatures())
        ]), [0, 2, 4, 10, 11, 12])
    ]),
  
        'transform2__transformers': ([       
    ('1', OneHotEncoder(sparse=False), [1, 3, 5, 6, 7, 8, 9, 13]) 
    ],

    [
    ('1', Pipeline([
        ('1', SimpleImputer(missing_values='?', strategy='most_frequent')),
        ('2', OneHotEncoder(sparse=False))
        ]), [1, 3, 5, 6, 7, 8, 9, 13])
    ]),

        'clf': (SVC(),LinearSVC(), LogisticRegression(max_iter=300),
        GradientBoostingClassifier(), RandomForestClassifier()),
    }
]