In [63]:
import sklearn.metrics
from sklearn.datasets import load_boston
import pandas as pd
from sklearn.compose import make_column_selector
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [64]:
RANDOM_STATE = 42

In [65]:
dataset = load_boston()
X = pd.DataFrame(dataset.data)
X.columns = dataset.feature_names
y = dataset.target
X


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48


1. Разделите выборку на обучающую и тестовую в отношении 80%/20%

In [66]:
learn_df, test_df, learn_y, test_y = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

print('Size of learn df: '+ str(len(learn_df))+ ', size of test df: '+str(len(test_df)))
print('Size of learn_y: '+str(len(learn_y)) + ', size of test y: '+str(len(test_y)))

Size of learn df: 404, size of test df: 102
Size of learn_y: 404, size of test y: 102


2. Обучите стандартную регрессию, а также Ridge и  Lasso и параметрами по умолчанию и выведите их R2 на тестовой выборке

In [67]:
reg = LinearRegression().fit(learn_df, learn_y)
predicted_y = reg.predict(test_df)
linear_r2 = r2_score(test_y, predicted_y)
ridge = Ridge()
ridge.fit(learn_df, learn_y)
predicted_ridge_y = ridge.predict(test_df)
ridge_r2 = r2_score(test_y, predicted_ridge_y)
lasso = Lasso()
lasso.fit(learn_df, learn_y)
predicted_lasso_y = lasso.predict(test_df)
lasso_r2 = r2_score(test_y,predicted_lasso_y)
print('Linear R2: '+str(linear_r2))
print('Lasso  R2: '+str(lasso_r2))
print('Ridge  R2: '+str(ridge_r2))
print('Linear Regressions shows better results than lasso & ridge')

Linear R2: 0.6687594935356366
Lasso  R2: 0.6671453631686304
Ridge  R2: 0.666222167016852
Linear Regressions shows better results than lasso & ridge


3. Для Ridge и Lasso подберите коэффициент регуляризации(используйте GridSearchCV, RidgeCV, LassoCV) в пределах от $10^{-5}$ до $10^5$ (по степеням 10). Посчитайте R2 на тестовой выборке по лучшим моделям и сравните с предыдущими результатами. Напишите как изменился результат

In [68]:
alpha_arr = [10**-5, 10**-4, 10**-3, 10**-2, 10**-1, 10**0, 10**1, 10**2, 10**3, 10**4, 10**5]
alpha_dict = {'alpha': alpha_arr}
kf = KFold(5)

In [69]:
print('Attempt to tune the model with specific alphas')
gcv = GridSearchCV(Ridge(random_state=RANDOM_STATE), alpha_dict, cv=kf)
gcv.fit(learn_df, learn_y)
gcv.get_params()
gcv_predicted = gcv.predict(test_df)
grid_ridge_r2 = r2_score(test_y, gcv_predicted)

rcv = RidgeCV(alphas=alpha_arr, cv=kf)
rcv.fit(learn_df, learn_y)
rcv_predicted = rcv.predict(test_df)
cv_ridge_r2 = r2_score(test_y, rcv_predicted)

print('Grid Ridge R2: '+str(grid_ridge_r2))
print('best_alpha: '+str(gcv.best_params_['alpha']))
print('  RidgeCV  R2: '+str(cv_ridge_r2))
print('best alpha: '+ str(rcv.alpha_))

lcv = LassoCV(alphas = alpha_arr, random_state=RANDOM_STATE, selection='random')
lcv.fit(learn_df, learn_y)
predicted_lcv_y = lcv.predict(test_df)
cv_lasso_r2 = r2_score(test_y, predicted_lcv_y)

glcv = GridSearchCV(Lasso(random_state=RANDOM_STATE, selection='random'), alpha_dict)
glcv.fit(learn_df, learn_y)
glcv_predicted = glcv.predict(test_df)
grid_lasso_r2 = r2_score(test_y, glcv_predicted)

print("Grid Lasso R2: "+str(grid_lasso_r2));
print('best_alpha: '+str(glcv.best_params_['alpha']))
print('   LassoCV R2: '+str(cv_lasso_r2));
print('best_alpha: '+str(lcv.alpha_))
print('Linear     R2: '+str(linear_r2))
print('GridSearchCV over Lasso gives the same result as LassoCV')
print('GridSearchCV over Ridge gives the same result as RidgeCV')
b = grid_lasso_r2 > grid_ridge_r2
print('Tuned lasso shows better result than Ridge')
c = grid_lasso_r2 > linear_r2
print("Tuned lasso shows better result that default LinearRegression")
print("Current best result: "+ str(grid_lasso_r2))
task_3_best_result = grid_lasso_r2



Attempt to tune the model with specific alphas
Grid Ridge R2: 0.6687594856409733
best_alpha: 1e-05
  RidgeCV  R2: 0.6687594856409733
best alpha: 1e-05
Grid Lasso R2: 0.6687598654554501
best_alpha: 1e-05
   LassoCV R2: 0.6687598654554501
best_alpha: 1e-05
Linear     R2: 0.6687594935356366
GridSearchCV over Lasso gives the same result as LassoCV
GridSearchCV over Ridge gives the same result as RidgeCV
Tuned lasso shows better result than Ridge
Tuned lasso shows better result that default LinearRegression
Current best result: 0.6687598654554501


4. Проведите масштабирование выборки(используйте Pipeline, StandardScaler, MinMaxScaler), посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [70]:
lasso_pipeline = make_pipeline(StandardScaler(), RidgeCV(alphas=[10**-5]))
lasso_pipeline.fit(learn_df, learn_y)
lasso_pipeline_predicted = lasso_pipeline.predict(test_df)
standard_scaler_ridge_r2 = r2_score(test_y, lasso_pipeline_predicted)
if standard_scaler_ridge_r2 > task_3_best_result:
    print('Ridge at scaled data with StandardScaler shows better result than the best result from task 3')

In [71]:
lasso_pipeline = make_pipeline(MinMaxScaler(), RidgeCV([10**-5]))
lasso_pipeline.fit(learn_df, learn_y)
lasso_pipeline_predicted = lasso_pipeline.predict(test_df)
minmax_scaler_ridge_r2 = r2_score(test_y, lasso_pipeline_predicted)
if minmax_scaler_ridge_r2 > task_3_best_result:
    print('Ridge at scaled data with MinMaxScaler shows better result than the best result from task 3')
    

In [72]:
lasso_pipeline = make_pipeline(StandardScaler(), LassoCV(alphas=[10**-5]))
lasso_pipeline.fit(learn_df, learn_y)
lasso_pipeline_predicted = lasso_pipeline.predict(test_df)
standard_scaled_lasso_r2 = r2_score(test_y, lasso_pipeline_predicted)
if standard_scaled_lasso_r2 > task_3_best_result:
    print('Lasso at scaled data with StandardScaler shows better result than the best result from task 3')

In [73]:
lasso_pipeline = make_pipeline(MinMaxScaler(), LassoCV(alphas = [10**-5]))
lasso_pipeline.fit(learn_df, learn_y)
lasso_pipeline_predicted = lasso_pipeline.predict(test_df)
minmax_scalled_lasso_r2 = r2_score(test_y, lasso_pipeline_predicted)
if minmax_scalled_lasso_r2 > task_3_best_result:
    print('Lasso at scaled with MinMaxScaler shows better result than the best result from task 3')

print('Lasso with scalled by MinMaxScaler data shows better result than just Lasso with the same alpha')
task_4_best_result = minmax_scalled_lasso_r2
print('Current best result (r2): '+str(minmax_scalled_lasso_r2))

Lasso at scaled with MinMaxScaler shows better result than the best result from task 3
Lasso with scalled by MinMaxScaler data shows better result than just Lasso with the same alpha
Current best result (r2): 0.6687605073677362


5. Подберите коэффициент регуляризации для Ridge и Lasso на масштабированных данных, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [74]:
lasso_pipeline = make_pipeline(StandardScaler(), RidgeCV(alphas=alpha_arr))
lasso_pipeline.fit(learn_df, learn_y)
lasso_pipeline_predicted = lasso_pipeline.predict(test_df)
standard_scaled_over_ridge_multiple_alphas = r2_score(test_y, lasso_pipeline_predicted)
if standard_scaled_over_ridge_multiple_alphas > task_4_best_result:
    print('This result is better than the best from task 4')


In [75]:
lasso_pipeline = make_pipeline(MinMaxScaler(), RidgeCV(alphas=alpha_arr))
lasso_pipeline.fit(learn_df, learn_y)
lasso_pipeline_predicted = lasso_pipeline.predict(test_df)
minmax_scaled_over_ridge_multiple_alphas = r2_score(test_y, lasso_pipeline_predicted)
if minmax_scaled_over_ridge_multiple_alphas > task_4_best_result:
    print('This result is better than the best from task 4')
    print('Task 4 best:    '+str(task_4_best_result))
    print('Current result: '+str(minmax_scaled_over_ridge_multiple_alphas))

This result is better than the best from task 4
Task 4 best:    0.6687605073677362
Current result: 0.6700309977617867


In [76]:
lasso_pipeline = make_pipeline(StandardScaler(), LassoCV(alphas=alpha_arr))
lasso_pipeline.fit(learn_df, learn_y)
lasso_pipeline_predicted = lasso_pipeline.predict(test_df)
standard_scaled_over_lasso_multiple_alphas = r2_score(test_y, lasso_pipeline_predicted)
if standard_scaled_over_lasso_multiple_alphas > task_4_best_result:
    print('This result is better than the best from task 4')


In [77]:
lasso_pipeline = make_pipeline(MinMaxScaler(), LassoCV(alphas=alpha_arr))
lasso_pipeline.fit(learn_df, learn_y)
lasso_pipeline_predicted = lasso_pipeline.predict(test_df)
minmax_scaled_over_lasso_multiple_alphas = r2_score(test_y, lasso_pipeline_predicted)
if minmax_scaled_over_lasso_multiple_alphas > task_4_best_result:
   print('This result is better than the best from task 4') 

In [78]:
task_5_best_result = minmax_scaled_over_ridge_multiple_alphas
print('The best result gives ridge at scaled data with MinMax and custom alphas')
print('Best model result is: '+str(task_5_best_result))

The best result gives ridge at scaled data with MinMax and custom alphas
Best model result is: 0.6700309977617867


6. Добавьте попарные произведения признаков и их квадраты (используйте PolynomialFeatures) на масштабированных признаках, посчитайте R2 и сравните с предыдущими результатами. Напишите как изменился результат

In [79]:
current_best_r2_result = 0
current_best_model_description = ''


pp = Pipeline(steps=[('scaler', StandardScaler()),('transform', PolynomialFeatures()), ('RidgeCV', RidgeCV())])
pp.fit(learn_df, learn_y)
predicted = pp.predict(test_df)
a = r2_score(test_y, predicted)
print('StandardScaler + PloynomalFeatures + RidgeCV r2 result: '+str(a))
if a > task_5_best_result:
    current_best_r2_result = a
    current_best_model_description = 'Pipeline with StandardScaler, PolynomialFeatures and RidgeCV'


StandardScaler + PloynomalFeatures + RidgeCV r2 result: 0.8180465877242975


In [80]:
pp = Pipeline(steps=[('scaler', MinMaxScaler()),('transform', PolynomialFeatures()), ('RidgeCV', RidgeCV())])
pp.fit(learn_df, learn_y)
predicted = pp.predict(test_df)
b = r2_score(test_y, predicted)
print('MinMaxScaler + PolynomialFeatures + RidgeCV r2 result: '+str(b))
if b > current_best_r2_result:
    current_best_r2_result = b
    current_best_model_description = 'Pipeline with MinMaxScaler, PolynomialFeatures and RidgeCV'


MinMaxScaler + PolynomialFeatures + RidgeCV r2 result: 0.8500630422288733


In [81]:
pp = Pipeline(steps=[('scaler', StandardScaler()),('transform', PolynomialFeatures()), ('LassoCV', LassoCV(tol=0.001))])
# tol=0.001 according to https://stackoverflow.com/questions/20681864/lasso-on-sklearn-does-not-converge
pp.fit(learn_df, learn_y)
predicted = pp.predict(test_df)
c = r2_score(test_y, predicted)
print('StandardScaler + PolynomialFeatures + LassoCV r2 result: '+str(c))
if c > current_best_r2_result:
    current_best_r2_result = c
    current_best_model_description = 'Pipeline with StandardScaler, PolynomialFeatures and LassoCV'

StandardScaler + PolynomialFeatures + LassoCV r2 result: 0.816506853062557


In [82]:
pp = Pipeline(steps=[('scaler', MinMaxScaler()),('transform', PolynomialFeatures()), ('LassoCV', LassoCV(tol=0.01))])
# tol=0.001 according to https://stackoverflow.com/questions/20681864/lasso-on-sklearn-does-not-converge
pp.fit(learn_df, learn_y)
predicted = pp.predict(test_df)
d = r2_score(test_y, predicted)
print('MinMaxScaler + PolynomialFeatures + LassoCV r2 result: '+str(d))
if d > current_best_r2_result:
    current_best_r2_result = d
    current_best_model_description = 'Pipeline with MinMaxScaler, PolynomialFeatures and LassoCV'

MinMaxScaler + PolynomialFeatures + LassoCV r2 result: 0.8481225939905862


  model = cd_fast.enet_coordinate_descent(


In [83]:
print('Task 6 best result:')
print('r2: '+str(current_best_r2_result))
print('Best model: '+ current_best_model_description)

Task 6 best result:
r2: 0.8500630422288733
Best model: Pipeline with MinMaxScaler, PolynomialFeatures and RidgeCV


7. Подберите наилучшую модель (используйте Pipeline, GridSearchSCV) подбирая тип регуляризации (L1,L2), коэффициент регуляризации, метод масштабирования и степень полинома в PolynomialFeatures. Выведите итоговые параметры и результат R2. Напишите как изменился R2 по сравнению с предыдущими экспериментами

In [84]:
models = [LassoCV(tol=0.1), RidgeCV(), LinearRegression()]

preprocessor_minmax = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), make_column_selector())
    ]
)

preprocessor_standard = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), make_column_selector())
    ]
)

pipeline = Pipeline(steps=[('preprocessor', preprocessor_minmax),('polynomial_features',PolynomialFeatures()),('model', Lasso())])

params = {
    'preprocessor': [preprocessor_minmax, preprocessor_standard],
    'model': models,
}

grid_search = GridSearchCV(pipeline, params, scoring='r2', error_score='raise')

grid_search.fit(learn_df, learn_y)
print(grid_search.best_estimator_)
print(grid_search.best_score_)
grid_search.cv_results_
                    

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', MinMaxScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x15839d1c0>)])),
                ('polynomial_features', PolynomialFeatures()),
                ('model', RidgeCV())])
0.8502343680073727


{'mean_fit_time': array([0.28096032, 0.37051902, 0.40646858, 0.38353605, 0.06006193,
        0.0149672 ]),
 'std_fit_time': array([0.09477513, 0.01410478, 0.06514553, 0.07739451, 0.10405539,
        0.00834401]),
 'mean_score_time': array([0.00730743, 0.00200353, 0.01181746, 0.00305376, 0.00216961,
        0.00172858]),
 'std_score_time': array([0.00737395, 0.00090526, 0.00794978, 0.00081904, 0.00092287,
        0.00103798]),
 'param_model': masked_array(data=[LassoCV(tol=0.1), LassoCV(tol=0.1), RidgeCV(),
                    RidgeCV(), LinearRegression(), LinearRegression()],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_preprocessor': masked_array(data=[ColumnTransformer(transformers=[('num', StandardScaler(),
                                                     <sklearn.compose._column_transformer.make_column_selector object at 0x1583925e0>)]),
                    ColumnTransformer(transformers=[('num', Min

http://archive.ics.uci.edu/ml/datasets/Adult

In [85]:
link = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv'
data = pd.read_csv(link, header=None)

In [86]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


8. Разделите выборку на признаки и целевую переменную(колонка со зачениями {<=50K,>50K}). Замените целевую переменную на числовые значения.

In [87]:
print(str(data.iloc[:, 14].unique()))
map_dict = {'<=50K':0, '>50K':1}
y = data.iloc[:, 14].map(map_dict)
X = data.drop(columns=14)

for a in [1,3,5,6,7,8,9,13]:
    X[a] = X[a].astype("category")
X.info()


['<=50K' '>50K']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   0       48842 non-null  int64   
 1   1       48842 non-null  category
 2   2       48842 non-null  int64   
 3   3       48842 non-null  category
 4   4       48842 non-null  int64   
 5   5       48842 non-null  category
 6   6       48842 non-null  category
 7   7       48842 non-null  category
 8   8       48842 non-null  category
 9   9       48842 non-null  category
 10  10      48842 non-null  int64   
 11  11      48842 non-null  int64   
 12  12      48842 non-null  int64   
 13  13      48842 non-null  category
dtypes: category(8), int64(6)
memory usage: 2.6 MB


9. Выясните, присутствуют ли в данных пропуски. Заполните их самыми частыми значениями (испольуйте SimpleImputer)

In [88]:
assert 0 ==y.isnull().sum()
assert 0 == X.isnull().sum().sum()

10. Выберите колонки с числовыми и категориальными переменными.

In [89]:
X_categories = X[[1,3,5,6,7,8,9,13]]
X_categories.head()

Unnamed: 0,1,3,5,6,7,8,9,13
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba


In [90]:
X_numbers = X[[0,2,4,10,11,12]]
X_numbers.head()

Unnamed: 0,0,2,4,10,11,12
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40


11. Создайте пайплайн по обработке колонок(используйте OneHotEncoder,MinMaxScaler).

In [91]:
# https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html
preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), make_column_selector(dtype_exclude='category')),
        ("cat", OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_exclude='int64'))
    ]
)
pipe_column_processing = Pipeline(steps=[('preprocessor', preprocessor),('classifier', DummyClassifier(constant=0))])
pipe_column_processing


12. Посчитайте метрики accuracy и f1_score на предсказании только самого частого класса в целевой переменной.

In [92]:
y.value_counts()

0    37155
1    11687
Name: 14, dtype: int64

самый частый класс целевой переменной - 0

In [93]:
learn_df, test_df, learn_y, test_y = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
pipe_column_processing.fit(learn_df, learn_y)
predicted_by_pipe = pipe_column_processing.predict(test_df)

In [94]:
matrix = confusion_matrix(test_y, predicted_by_pipe)
arr = matrix.diagonal()/matrix.sum(axis=1)
accuracy_for_zero = arr[0]
print('Значение метрики accuracy на предсказании для самого частого класса в целевой переменной ', accuracy_for_zero)

Значение метрики accuracy на предсказании для самого частого класса в целевой переменной  1.0


In [95]:
f1_score_for_zero = f1_score(test_y, predicted_by_pipe, average=None)[0]
f1_score_for_zero
print('Значение метрики f1_score на предсказании для самого частого класса в целевой переменной ', f1_score_for_zero)

Значение метрики f1_score на предсказании для самого частого класса в целевой переменной  0.8629459349356923


13. Посчитайте cross_val_score по алгоритмам LogisticRegression, SVC, LinearSVC по метрикам accuracy и f1_score.
Напишите удалось ли превзойти предыдущий результат.

In [96]:
def get_crossval_logistic_regression(X_,y_):
    pipe_logistic_regression = Pipeline(steps=[('preprocessor', preprocessor),('classifier', LogisticRegression(max_iter=1000))])
    r = cross_validate(pipe_logistic_regression, X_, y_, cv=2, scoring=('f1', 'accuracy'))
    logistic_regression_f1 = r['test_f1'].mean()
    logistic_regression_accuracy = r['test_accuracy'].mean()
    return ('f1', logistic_regression_f1), ('accuracy', logistic_regression_accuracy)

logistic_regression_crossval_result = get_crossval_logistic_regression(X, y)
print('Logistic regression: ')
print(logistic_regression_crossval_result)

Logistic regression: 
(('f1', 0.6533980650704612), ('accuracy', 0.8502313582572376))


In [97]:
def get_crossval_linear_svc(X_, y_):
    pipe_linear_svc = Pipeline(steps=[('preprocessor', preprocessor),('classifier', LinearSVC())])
    r = cross_validate(pipe_linear_svc, X_, y_, scoring=('f1', 'accuracy'), cv=2)
    linear_svc_f1 = r['test_f1'].mean()
    linear_svc_accuracy = r['test_accuracy'].mean()
    return ('f1', linear_svc_f1), ('accuracy', linear_svc_accuracy)
linear_svc_crossval_result = get_crossval_linear_svc(X, y)
print('LinearSVC:')
print(linear_svc_crossval_result)

LinearSVC:
(('f1', 0.6549837494565237), ('accuracy', 0.8516645510011875))


In [98]:
def get_crossval_svc(X_, y_):
    pipe_svc = Pipeline(steps=[('preprocessor', preprocessor),('classifier', SVC())])
    r = cross_validate(pipe_svc, X_, y_, scoring=('f1', 'accuracy'), cv=2)
    svc_f1 = r['test_f1'].mean()
    svc_accuracy = r['test_accuracy'].mean()
    return ('f1', svc_f1), ('accuracy', svc_accuracy)
svc_crossval_result = get_crossval_svc(X, y)
print('SVC f1: ')
print(svc_crossval_result)

SVC f1: 
(('f1', 0.613604319107794), ('accuracy', 0.8383358584824536))


14. Можно заметить что в данных присутствуют значения '?', замените их самыми частыми значениями (испольуйте SimpleImputer)

In [99]:
imputer = SimpleImputer(strategy='most_frequent', missing_values='?')
imputer.fit(X)
X_modified = pd.DataFrame(imputer.transform(X))
for a in [1,3,5,6,7,8,9,13]:
    X_modified[a] = X_modified[a].astype("category")

15. Посчитайте cross_val_score на новых данных. Напишите удалось ли улучшить результат.

In [100]:
logistic_regression_crossval_result_modified = get_crossval_logistic_regression(X_modified, y)
print('Old: '+str(logistic_regression_crossval_result))
print('Mod: '+ str(logistic_regression_crossval_result_modified))

Old: (('f1', 0.6533980650704612), ('accuracy', 0.8502313582572376))
Mod: (('f1', 0.6979312885625153), ('accuracy', 0.8690880799312067))


In [101]:
linear_svc_crossval_result_modified = get_crossval_linear_svc(X_modified, y)
print('Old: '+str(linear_svc_crossval_result))
print('Mod: '+str(linear_svc_crossval_result_modified))

Old: (('f1', 0.6549837494565237), ('accuracy', 0.8516645510011875))
Mod: (('f1', 0.6840563373420417), ('accuracy', 0.8581548667130747))


In [102]:
svc_crossval_result_modified = get_crossval_svc(X_modified, y)
print('Old: '+str(svc_crossval_result))
print('Mod: '+ str(svc_crossval_result_modified))

Old: (('f1', 0.613604319107794), ('accuracy', 0.8383358584824536))
Mod: (('f1', 0.6713052953164678), ('accuracy', 0.8636214733221408))


Для всех алгоритмов применение SimpleImputer на входных данных для пропусков дает положительный результат.

16. Посчитайте cross_val_score, если просто удалить значения '?'. Напишите как изменился результат

In [103]:
%%time

copydata = data.copy()
for x in copydata:
    copydata.drop(copydata.loc[copydata[x]=='?'].index, inplace=True)
all_rows_count = data.shape.__getitem__(0)
rows_without_quest_count = copydata.shape.__getitem__(0)
print('All rows count ', all_rows_count, ' rows without ?: ', rows_without_quest_count)
map_dict = {'<=50K':0, '>50K':1}
y_q = copydata.iloc[:, 14].map(map_dict)
X_q = copydata.drop(columns=14)
for a in [1,3,5,6,7,8,9,13]:
    X_q[a] = X_q[a].astype("category")

    
logistic_regression_crossval_result_without_q = get_crossval_logistic_regression(X_q, y_q)
linear_svc_crossval_result_result_without_q = get_crossval_linear_svc(X_q, y_q)
svc_crossval_result_modified_without_q = get_crossval_svc(X_q, y_q)

print('Old: '+ str(logistic_regression_crossval_result))
print('Mod: '+ str(logistic_regression_crossval_result_modified))
print('Cle: '+ str(logistic_regression_crossval_result_without_q))
print('========================================================')

print('Old: '+ str(linear_svc_crossval_result))
print('Mod: '+ str(linear_svc_crossval_result_modified))
print('Cle: '+ str(linear_svc_crossval_result_result_without_q))
print('========================================================')

print('Old: '+ str(svc_crossval_result))
print('Mod: '+ str(svc_crossval_result_modified))
print('Cle: '+ str(svc_crossval_result_modified_without_q))


All rows count  48842  rows without ?:  45222
Old: (('f1', 0.6533980650704612), ('accuracy', 0.8502313582572376))
Mod: (('f1', 0.6979312885625153), ('accuracy', 0.8690880799312067))
Cle: (('f1', 0.6577822213600577), ('accuracy', 0.8461147229224714))
Old: (('f1', 0.6549837494565237), ('accuracy', 0.8516645510011875))
Mod: (('f1', 0.6840563373420417), ('accuracy', 0.8581548667130747))
Cle: (('f1', 0.6598466094519113), ('accuracy', 0.8477953208615275))
Old: (('f1', 0.613604319107794), ('accuracy', 0.8383358584824536))
Mod: (('f1', 0.6713052953164678), ('accuracy', 0.8636214733221408))
Cle: (('f1', 0.6201090696018215), ('accuracy', 0.8337534828180974))
CPU times: user 31.5 s, sys: 254 ms, total: 31.7 s
Wall time: 31.9 s


Вывод: после очистки данных результат лучше не стал. SimpleImputer показал лучшие результаты 

 17. Посчитайте cross_val_score для RandomForestClassifier,GradientBoostingClassifier. Напишите как изменился результат и какой вывод можно из этого сделать.

In [104]:
%%timeit
def get_crossval_random_forest(X_, y_):
    pipe = Pipeline(steps=[('preprocessor', preprocessor),('classifier', RandomForestClassifier())])
    r = cross_validate(pipe, X_, y_, scoring=('f1', 'accuracy'), cv=2)
    linear_svc_f1 = r['test_f1'].mean()
    linear_svc_accuracy = r['test_accuracy'].mean()
    return ('f1', linear_svc_f1), ('accuracy', linear_svc_accuracy), ('cross_val', cross_val_score(pipe, X_modified, y).mean())
random_forest_crossval_result_modified = get_crossval_random_forest(X_modified, y)

def get_crossval_gradiend_boosting(X_, y_):
    pipe = Pipeline(steps=[('preprocessor', preprocessor),('classifier', GradientBoostingClassifier())])
    r = cross_validate(pipe, X_, y_, scoring=('f1', 'accuracy'), cv=2)
    linear_svc_f1 = r['test_f1'].mean()
    linear_svc_accuracy = r['test_accuracy'].mean()
    return ('f1', linear_svc_f1), ('accuracy', linear_svc_accuracy),('cross_val', cross_val_score(pipe, X_modified, y).mean())

gradient_boosting_crossval_result_modififed = get_crossval_gradiend_boosting(X_modified, y)
print('RandomForestClassifier on data with SimpleImputer: '+str(random_forest_crossval_result_modified))
print('GradiendBoosting on data with SimpleImputer      : '+str(gradient_boosting_crossval_result_modififed))

RandomForestClassifier on data with SimpleImputer: (('f1', 0.6661432997599606), ('accuracy', 0.860898407108636), ('cross_val', 0.8611031997202536))
GradiendBoosting on data with SimpleImputer      : (('f1', 0.6849184999876994), ('accuracy', 0.8683919577412882), ('cross_val', 0.8673683275305756))
RandomForestClassifier on data with SimpleImputer: (('f1', 0.6694662874987715), ('accuracy', 0.8615126325703288), ('cross_val', 0.8612874392739271))
GradiendBoosting on data with SimpleImputer      : (('f1', 0.6851225183448711), ('accuracy', 0.8685352770156831), ('cross_val', 0.8673888172224749))
RandomForestClassifier on data with SimpleImputer: (('f1', 0.6679039143042007), ('accuracy', 0.8609393554727489), ('cross_val', 0.8611236705488932))
GradiendBoosting on data with SimpleImputer      : (('f1', 0.6864172270190928), ('accuracy', 0.8686990704721347), ('cross_val', 0.8675116547698185))
RandomForestClassifier on data with SimpleImputer: (('f1', 0.6672538078714215), ('accuracy', 0.860877932926

18. Подберите наилучшую модель, подбирая методы обработки колонок - масштабирование признаков, кодирование признаков и заполнение пропусков. Параметры алгоритмов оставьте по умолчанию. Выведите итоговые параметры и результат accuracy и f1_score.

In [107]:
%%timeit

classifiers = [LinearSVC(), LogisticRegression(), RandomForestClassifier(), GradientBoostingClassifier()]

imputer = SimpleImputer(strategy='most_frequent', missing_values='?')
hot_encoder = OneHotEncoder(handle_unknown='ignore')

category_transformer = Pipeline(steps=[('imputer', imputer), ('encoder', hot_encoder)])

preprocessor_minmax = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), make_column_selector(dtype_exclude='category')),
        ("cat", category_transformer, make_column_selector(dtype_exclude='int64'))
    ]
)


preprocessor_standard = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), make_column_selector(dtype_exclude='category')),
        ("cat", category_transformer, make_column_selector(dtype_exclude='int64'))
    ]
)

preprocessor_minmax_without_imputer = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), make_column_selector(dtype_exclude='category')),
        ("cat", hot_encoder , make_column_selector(dtype_exclude='int64'))
    ]
)


preprocessor_standard_without_imputer = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), make_column_selector(dtype_exclude='category')),
        ("cat", hot_encoder, make_column_selector(dtype_exclude='int64'))
    ]
)


pipeline = Pipeline(steps=[('preprocessor', preprocessor_minmax),('classifier', RandomForestClassifier())])

params = {
    'classifier': classifiers,
    'preprocessor': [preprocessor_minmax, preprocessor_standard, preprocessor_minmax_without_imputer, preprocessor_standard_without_imputer]
}

grid_search = GridSearchCV(pipeline, params, scoring='accuracy', error_score='raise')
# grid_search.fit(X, y)
r = cross_validate(grid_search, X_modified, y, scoring=('f1', 'accuracy'), cv=2)
f1 = r['test_f1'].mean()
accuracy = r['test_accuracy'].mean()
print('f1: '+str(f1)+ ' accuracy: '+str(accuracy))
grid_search.fit(X_modified, y)
print(grid_search.best_score_)
print(grid_search.best_estimator_)
                    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

f1: 0.7024209153735268 accuracy: 0.870296056672536


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8710127386527222
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1586d9f10>),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(missing_values='?',
                                                                                 strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1584e3d00>)])),
                ('classifier', LogisticRegression())])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

f1: 0.7024209153735268 accuracy: 0.870296056672536


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8710127386527222
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1584134c0>),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(missing_values='?',
                                                                                 strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x158648eb0>)])),
                ('classifier', LogisticRegression())])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

f1: 0.7024209153735268 accuracy: 0.870296056672536


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8710127386527222
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1585613d0>),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(missing_values='?',
                                                                                 strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x158587ca0>)])),
                ('classifier', LogisticRegression())])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

f1: 0.7024209153735268 accuracy: 0.870296056672536


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8710127386527222
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x15888eeb0>),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(missing_values='?',
                                                                                 strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x158565c40>)])),
                ('classifier', LogisticRegression())])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

f1: 0.7024209153735268 accuracy: 0.870296056672536


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8710127386527222
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x158566910>),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(missing_values='?',
                                                                                 strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1585742e0>)])),
                ('classifier', LogisticRegression())])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

f1: 0.7024209153735268 accuracy: 0.870296056672536


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8710127386527222
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x15840bfd0>),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(missing_values='?',
                                                                                 strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x176739c70>)])),
                ('classifier', LogisticRegression())])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

f1: 0.7024209153735268 accuracy: 0.870296056672536


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8710127386527222
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x1584ddfd0>),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(missing_values='?',
                                                                                 strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x15859dfd0>)])),
                ('classifier', LogisticRegression())])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

f1: 0.7024209153735268 accuracy: 0.870296056672536


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.8710127386527222
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', StandardScaler(),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x15841bee0>),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(missing_values='?',
                                                                                 strategy='most_frequent')),
                                                                  ('encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x15841ba00>)])),
                ('classifier', LogisticRegression())])
24min 58s ± 1.68

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
