# Кросс Валидация


https://academy.yandex.ru/handbook/ml/article/kross-validaciya

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 

plt.style.use("dark_background")

In [2]:
df = pd.read_csv("/Users/olegyarygin/Projects/Data-Science/Data/churn-modelling/Churn_Modelling.csv")

In [3]:
df.head()

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


# Обучающая, валидационная и тестовая выборка 

In [7]:
from sklearn.model_selection import train_test_split

In [15]:
train, test = train_test_split(df, random_state=42, train_size = 0.6, stratify= df["Exited"]) # random_state чтобы эксперимент навсегда сохранился

In [16]:
val, test = train_test_split(df, random_state=42, train_size = 0.6, stratify= df["Exited"]) # random_state чтобы эксперимент навсегда сохранился

# Машинное обучение 

In [17]:
from catboost import CatBoostClassifier

In [18]:
X = ['CreditScore', 'Age', 'Tenure', 'Balance', 
     'NumOfProducts', "Geography", "Gender", 'HasCrCard','IsActiveMember', 
     'EstimatedSalary']

cat_features = ['Geography','Gender']

y = ['Exited']

In [21]:
from catboost import Pool

train_data = Pool(data=train[X],
                  label=train[y],
                  cat_features=cat_features
                 )

valid_data = Pool(data=val[X],
                  label=val[y],
                  cat_features=cat_features
                 )
test_data = Pool(data=test[X],
                  label=test[y],
                  cat_features=cat_features
                 )
# Pool - объект catboost  

In [24]:
params = {"verbose": 100, 
          "eval_metric":"AUC",
          "loss_function":"Logloss",
          "random_seed": 42,
          "learning_rate": 0.01}

In [25]:
model = CatBoostClassifier(**params)


In [26]:
model.fit(train_data, eval_set=valid_data)

0:	test: 0.8359230	best: 0.8359230 (0)	total: 98.3ms	remaining: 1m 38s
100:	test: 0.8595656	best: 0.8595656 (100)	total: 513ms	remaining: 4.57s
200:	test: 0.8704633	best: 0.8704633 (200)	total: 881ms	remaining: 3.5s
300:	test: 0.8785838	best: 0.8785838 (300)	total: 1.27s	remaining: 2.95s
400:	test: 0.8833394	best: 0.8833394 (400)	total: 1.67s	remaining: 2.49s
500:	test: 0.8880524	best: 0.8880524 (500)	total: 2.05s	remaining: 2.04s
600:	test: 0.8918360	best: 0.8918360 (600)	total: 2.42s	remaining: 1.61s
700:	test: 0.8957661	best: 0.8957661 (700)	total: 2.79s	remaining: 1.19s
800:	test: 0.8995137	best: 0.8995137 (800)	total: 3.15s	remaining: 783ms
900:	test: 0.9029621	best: 0.9029621 (900)	total: 3.52s	remaining: 387ms
999:	test: 0.9062607	best: 0.9062607 (999)	total: 3.89s	remaining: 0us

bestTest = 0.9062607258
bestIteration = 999



<catboost.core.CatBoostClassifier at 0x12954cf10>

In [27]:
n_iters = model.best_iteration_ + 1

In [30]:
params = {"iterations":n_iters,
          "verbose": 100, 
          "eval_metric":"AUC",
          "loss_function":"Logloss",
          "random_seed": 42,
          "learning_rate": 0.01}

In [31]:
model = CatBoostClassifier(**params)


In [32]:
train_full = pd.concat([train, val])

In [33]:
train_full_data = Pool(train_full[X],
                       label=train_full[y],
                       cat_features=cat_features)

In [34]:
model.fit(train_full_data)

0:	total: 9.09ms	remaining: 9.08s
100:	total: 643ms	remaining: 5.73s
200:	total: 1.3s	remaining: 5.16s
300:	total: 2s	remaining: 4.64s
400:	total: 2.67s	remaining: 3.98s
500:	total: 3.33s	remaining: 3.32s
600:	total: 4.08s	remaining: 2.71s
700:	total: 4.78s	remaining: 2.04s
800:	total: 5.5s	remaining: 1.37s
900:	total: 6.23s	remaining: 685ms
999:	total: 7s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x12949c400>

In [35]:
test["y_score_no_cross_val"] = model.predict_proba(test_data)[:,1]

In [36]:
from sklearn.metrics import roc_auc_score

In [37]:
roc_auc_score(test["Exited"], test["y_score_no_cross_val"])

0.8765239668307154

https://catboost.ai/en/docs/features/cross-validation

# Пример кросс валидации

In [38]:
from catboost import cv

In [40]:
params = {"iterations":n_iters,
          "verbose": 100, 
          "eval_metric":"AUC",
          "loss_function":"Logloss",
          "random_seed": 42,
          "learning_rate": 0.01}

In [43]:
cv_data = cv(
    params = params,
    pool = train_full_data,
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    stratified=False,
    verbose=False
)

Training on fold [0/5]

bestTest = 0.8863398838
bestIteration = 999

Training on fold [1/5]

bestTest = 0.8975823636
bestIteration = 999

Training on fold [2/5]

bestTest = 0.8944914667
bestIteration = 999

Training on fold [3/5]

bestTest = 0.8841217324
bestIteration = 998

Training on fold [4/5]

bestTest = 0.8922549644
bestIteration = 998



In [48]:
cv_data[cv_data["test-AUC-mean"] ==cv_data["test-AUC-mean"].max()]

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
999,999,0.890955,0.005615,0.305873,0.014137,0.277124,0.004216


In [51]:
n_iters = cv_data[cv_data["test-AUC-mean"] ==cv_data["test-AUC-mean"].max()]["iterations"].values[0]

In [52]:
n_iters


999

In [53]:
# Теперь обучим модель 

In [54]:
params = {"iterations":n_iters,
          "verbose": 100, 
          "eval_metric":"AUC",
          "loss_function":"Logloss",
          "random_seed": 42,
          "learning_rate": 0.01}

In [55]:
model = CatBoostClassifier(**params)


In [57]:
model.fit(train_full_data)

0:	total: 18.6ms	remaining: 18.6s
100:	total: 640ms	remaining: 5.69s
200:	total: 1.3s	remaining: 5.18s
300:	total: 1.98s	remaining: 4.6s
400:	total: 2.72s	remaining: 4.06s
500:	total: 3.44s	remaining: 3.42s
600:	total: 4.19s	remaining: 2.78s
700:	total: 4.93s	remaining: 2.1s
800:	total: 5.7s	remaining: 1.41s
900:	total: 6.46s	remaining: 703ms
998:	total: 7.28s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x12bbc4b50>

In [58]:
len(train_full)

12000

# Подбор гиперпараметров

https://habr.com/ru/companies/otus/articles/527554/

https://habr.com/ru/companies/otus/articles/527554/

https://github.com/catboost/catboost/blob/master/catboost/tutorials/events/2020_06_04_catboost_tutorial/catboost_features.ipynb

https://www.youtube.com/watch?v=ZaP5qFSIcIw

https://github.com/catboost/catboost/blob/master/catboost/tutorials/hyperparameters_tuning/hyperparameters_tuning.ipynb

In [60]:
model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'AUC',
 'combinations_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
  'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'],
 'iterations': 999,
 'sampling_frequency': 'PerTree',
 'fold_permutation_block': 0,
 'leaf_estimation_method': 'Newton',
 'counter_calc_method': 'SkipTest',
 'grow_policy': 'SymmetricTree',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'ctr_leaf_count_limit': 18446744073709551615,
 'bayesian_matrix_reg': 0.10000000149011612,
 'one_hot_max_size': 2,
 'eval_fraction': 0,
 'force_unit_auto_pair_weights': False,
 'l2_leaf_reg': 3,
 'random_strength': 1,
 'rsm': 1,
 'boost_from_average': False,
 'max_ctr_complexity': 4,
 'model_size_reg': 0.5,
 'simple_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntro

In [61]:
params = {"verbose": 100, 
          "eval_metric":"AUC",
          "loss_function":"Logloss",
          "random_seed": 42}

In [62]:
model = CatBoostClassifier(**params)


In [63]:
grid = {
    "learning_rate": [0.01, 0.1],
    "depth": [5,6]
}

In [64]:
result = model.grid_search(grid, train_full_data, verbose=False)

0:	test: 0.8034306	best: 0.8034306 (0)	total: 18.5ms	remaining: 18.4s
100:	test: 0.8575938	best: 0.8575948 (99)	total: 541ms	remaining: 4.81s
200:	test: 0.8650117	best: 0.8650421 (199)	total: 1.04s	remaining: 4.15s
300:	test: 0.8707590	best: 0.8707590 (300)	total: 1.56s	remaining: 3.63s
400:	test: 0.8741134	best: 0.8741134 (400)	total: 2.06s	remaining: 3.08s
500:	test: 0.8766431	best: 0.8766431 (500)	total: 2.57s	remaining: 2.56s
600:	test: 0.8784525	best: 0.8784525 (600)	total: 3.09s	remaining: 2.05s
700:	test: 0.8803105	best: 0.8803197 (699)	total: 3.62s	remaining: 1.54s
800:	test: 0.8819031	best: 0.8819031 (800)	total: 4.18s	remaining: 1.04s
900:	test: 0.8837530	best: 0.8837693 (896)	total: 4.71s	remaining: 518ms
999:	test: 0.8852808	best: 0.8852808 (999)	total: 5.27s	remaining: 0us

bestTest = 0.8852807954
bestIteration = 999

Metric AUC is not calculated on train by default. To calculate this metric on train, add hints=skip_train~false to metric parameters.
0:	test: 0.8034306	best

In [66]:
result["params"]

{'depth': 6, 'learning_rate': 0.1}

In [67]:
pd.DataFrame(result["cv_results"])["test-AUC-mean"].max()

0.9516663571909986

# Принципы sklearn 

https://tproger.ru/translations/scikit-learn-in-python/

# Предобработка и фичеинженеринг 

https://catboost.ai/en/docs/concepts/python-usages-examples

# Калибровка

https://www.youtube.com/watch?v=ZaP5qFSIcIw

# Catboost и тексты и эмбеддинги 

https://github.com/catboost/catboost/blob/master/catboost/tutorials/events/2020_06_04_catboost_tutorial/text_features.ipynb

https://youtu.be/ZaP5qFSIcIw?t=3802

# Главные навыки для аналитика


- SQL
- pandas
- Основы статистики (AB тесты)

# Главные навыки для Дата Саентиста

- Все то же, что и у аналитика
- Машинное обучение (sklearn, catboost)
- более продвинутый python (основы ООП)
- linux, git, docker, веб-сервисы

# Как развиваться дальше ?

- [Прикладное машинное обучение с помощью Scikit-Learn, Keras и TensorFlow: концепции, инструменты и техники для создания интеллектуальных систем, 2-е издание | Жерон Орельен](https://www.ozon.ru/product/prikladnoe-mashinnoe-obuchenie-s-pomoshchyu-scikit-learn-keras-i-tensorflow-kontseptsii-207392052/?sh=HC92ZQyhUQ)
- Задачи на kaggle, вот [список](https://www.kaggle.com/getting-started/114864) для начинающих
- Откликайся на вакансии и решай тестовые. Можешь смотреть [тестовые по дс у меня на канале](https://www.youtube.com/playlist?list=PLQJ7ptkRY-xbefSg1XN3FA-SdSRFcCQfn)
- [Мой курс по статистике](https://www.youtube.com/playlist?list=PLQJ7ptkRY-xbHLLI66KdscKp_FJt0FsIi)
- Задачи по pandas и sql на [StrataScratch](https://www.stratascratch.com/?via=gleb)
- [Мой курс по SQL](https://www.udemy.com/course/sql-with-gleb/?referralCode=C99EC81AE75FEC9F50A2)
- Задачи на [itresume](https://itresume.ru/)
- Задачи на [Checkio](https://checkio.org/)
- Задачи на [Leetcode](https://leetcode.com/) вот по этому [списку](https://seanprashad.com/leetcode-patterns/)
- Мой [плейлист](https://www.youtube.com/playlist?list=PLQJ7ptkRY-xZ4qiXlmQQLgAPyXJhQ7HxZ) по основам Linux
- [Основы Git](https://youtu.be/0cGIiA0AjNw)
- [Основы докера](https://www.youtube.com/playlist?list=PLQJ7ptkRY-xbR0ka2TUxJkXna40XWu92m)
- [Основы веб-сервисов](https://www.youtube.com/playlist?list=PLQJ7ptkRY-xYLEAC5Y_sKqrJ9RA-U7Dja)

# Дополнительные материалы

- [Как работает градиентный бустинг](https://youtu.be/ZNJ3lKyI-EY)
- [Как учить английский](https://youtu.be/f9Q4Mwvd5pc)
- [Как искать работу](https://deepnote.com/@gleb-mikhaylov-15d6/blog-posts-nXDqO5CTQYOreNE53H3y-A)