RANDOM FORESTS

In [None]:
# birden çok karar ağacınının ürettiği tahminlerin bir araya getirilerek değerlendirilmesine dayanır.
# ağaçlar için gözlemler bootstrap rastgele örnek seçim yöntemi ile (bagging);
# değişkenler ise random subspace yöntemi ile seçilir.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RandomizedSearchCV, validation_curve

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [4]:
df = pd.read_csv("diabetes.csv")
y = df['Outcome']
x = df.drop(['Outcome'], axis = 1)

In [None]:
# model
rf_model = RandomForestClassifier(random_state=17)

cv_results = cross_validate(rf_model, x, y, cv=10, scoring=["accuracy", "f1", "roc_auc"])
cv_results['test_accuracy'].mean()  # 0.75
cv_results['test_f1'].mean()  # 0.61
cv_results['test_roc_auc'].mean()  # 0.82

rf_params = {"max_depth": [5, 8, None],
            "max_features": [3, 5, 7, "auto"],
             "min_samples_split": [2, 5, 8, 15, 20],
             "n_estimators": [100, 200, 500]}

rf_best_grid = GridSearchCV(rf_model, rf_params, cv=5, n_jobs=-1, verbose=False).fit(x, y)
rf_best_grid.best_params_

In [6]:
rf_final = rf_model.set_params(**rf_best_grid.best_params_, random_state=17).fit(x, y)
cv_results = cross_validate(rf_final, x, y, cv=10, scoring=["accuracy", "f1", "roc_auc"])
cv_results['test_accuracy'].mean()  # 0.76
cv_results['test_f1'].mean()  # 0.64
cv_results['test_roc_auc'].mean()  # 0.82

0.8271054131054132

GRADIENT BOOSTING

In [None]:
# birden fazla zayıf tahminleyiciyi birleştirerek güçlü bir tahminci oluşturur.
# ilk model temel sonraki modeller artık modeldir.

In [None]:
gbm_model = GradientBoostingClassifier(random_state=17)

cv_results = cross_validate(gbm_model, x, y, cv=5, scoring=["accuracy", "f1", "roc_auc"])
cv_results['test_accuracy'].mean()  # 0.75
cv_results['test_f1'].mean()  # 0.63
cv_results['test_roc_auc'].mean()  # 0.82

0.8262372466806429

In [None]:
gbm_params = {"learning_rate": [0.01, 0.1],
              "max_depth": [3, 8, 10],
             "n_estimators": [100,500,1000],
              "subsample": [1, 0.5, 0.7]}

gbm_best_grid = GridSearchCV(gbm_model, gbm_params, cv=5, n_jobs=-1, verbose=False).fit(x, y)
gbm_best_grid.best_params_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


{'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 1000, 'subsample': 0.7}

In [None]:
gbm_final = gbm_model.set_params(**gbm_best_grid.best_params_, random_state=17).fit(x, y)
cv_results = cross_validate(gbm_final, x, y, cv=5, scoring=["accuracy", "f1", "roc_auc"])
cv_results['test_accuracy'].mean()  # 0.77
cv_results['test_f1'].mean()  # 0.66
cv_results['test_roc_auc'].mean()  # 0.83

0.834911250873515

XGBOOST

In [None]:
# gbm' in hız ve tahmin performansını arttırmak için yapılmıştır.

In [None]:
!pip install xgboost



In [None]:
from xgboost import XGBClassifier

In [None]:
#xgboost_model = XGBClassifier(random_state=17)

#cv_results = cross_validate(xgboost_model, x, y, cv=5, scoring=["accuracy", "f1", "roc_auc"])
#cv_results['test_accuracy'].mean()  # 0.75
#cv_results['test_f1'].mean()  # 0.63
#cv_results['test_roc_auc'].mean()  # 0.79

# hata verdi ama başarı oranları böyle

LightGBM

In [None]:
# xgboost' un eğitim süresi performansını arttırmaya yöneliktir.

In [None]:
lgbm_model = LGBMClassifier(random_state=17)
cv_results = cross_validate(lgbm_model, x, y, cv=5, scoring=["accuracy", "f1", "roc_auc"])
cv_results['test_accuracy'].mean()  # 0.74
cv_results['test_f1'].mean()  # 0.62
cv_results['test_roc_auc'].mean()  # 0.79

In [None]:
lgbm_params = {"learning_rate": [0.01, 0.1],
              "n_estimators": [100, 300, 500, 1000],
               "colsample_bytree": [0.5, 0.7, 1]}
lgbm_best_grid = GridSearchCV(lgbm_model, lgbm_params, cv=5, n_jobs=-1, verbose=False).fit(x, y)
lgbm_final = lgbm_model.set_params(**lgbm_best_grid.best_params_, random_state=17).fit(x, y)
cv_results = cross_validate(lgbm_final, x, y, cv=5, scoring=["accuracy", "f1", "roc_auc"])
cv_results['test_accuracy'].mean()  # 0.76
cv_results['test_f1'].mean()  # 0.63
cv_results['test_roc_auc'].mean()  # 0.81

CATBOOST

In [None]:
# kategorik değişkenler için hızlı ve başarılı br gbm türevi.

In [None]:
catboost_model = CatBoostClassifier(random_state=17, verbose=False)
cv_results = cross_validate(catboost_model, x, y, cv=5, scoring=["accuracy", "f1", "roc_auc"])
cv_results['test_accuracy'].mean()  # 0.77
cv_results['test_f1'].mean()  # 0.65
cv_results['test_roc_auc'].mean()  # 0.83

0.7735251676428148

In [None]:
catboost_params = {"iterations": [200, 500],
                   "learning_rate": [0.01, 0.1],
                   "depth": [3, 6]}
catboost_best_grid = GridSearchCV(catboost_model, catboost_params, cv=5, n_jobs=-1, verbose=True).fit(x, y)
catboost_final = catboost_model.set_params(**catboost_best_grid.best_params_, random_state=17).fit(x, y)


Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
cv_results = cross_validate(catboost_final, x, y, cv=5, scoring=["accuracy", "f1", "roc_auc"])
cv_results['test_accuracy'].mean()  # 0.77
cv_results['test_f1'].mean()  # 0.63
cv_results['test_roc_auc'].mean()  # 0.84

0.842001397624039

Random Search CV

In [None]:
# GridSearchCV' ye göre daha geniş bir alanda arama yapar ve rastgele seçtiği
# değerleri bizim söylediğimiz iterasyon kadar dener.
# GridSearchCV de değerleri biz veriyorduk.

In [None]:
rf_model = RandomForestClassifier(random_state=17)
rf_random_params = {"max_depth": np.random.randint(5, 50, 10),
                    "max_features": [3, 5, 7, "auto", "sqrt"],
                    "min_samples_split": np.random.randint(2, 50, 20),
                    "n_estimators": [int(x) for x in np.linspace(start=200, stop=1500, num=10)]}
rf_random_grid = RandomizedSearchCV(rf_model, rf_random_params, n_iter=100, cv=3, n_jobs=-1,random_state=42, verbose=True).fit(x, y)


In [None]:
rf_random_final = rf_model.set_params(**rf_random_grid.best_params_, random_state=17).fit(x, y)

In [None]:
cv_results = cross_validate(rf_random_final, x, y, cv=5, scoring=["accuracy", "f1", "roc_auc"])
cv_results['test_accuracy'].mean()  # 0.76
cv_results['test_f1'].mean()  # 0.63
cv_results['test_roc_auc'].mean()  # 0.83

0.8349091544374563