# Doğrusal Olmayan Regresyon Modelleri

In [3]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale 
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR

from warnings import filterwarnings
filterwarnings('ignore')

# KNN

In [None]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
y = df["Salary"]
X = df.drop('Salary', axis=1)

In [None]:
X.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [None]:
knn_model = KNeighborsRegressor().fit(X_train, y_train)

In [None]:
knn_model

In [None]:
y_pred = knn_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

Hatırlatma

- train-test

- tum veri ile cv (gormediği verideki performans?)

- train-test (train uzerine cv, test ile test edilir)

# KNN Model Tuning

In [None]:
knn_model

In [None]:
RMSE = []

for k in range(20):
    k = k + 2
    knn_model = KNeighborsRegressor(n_neighbors = k).fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    RMSE.append(rmse)
    print("k =", k, "için RMSE değeri:", rmse)

In [None]:
#GridSearchCV

In [None]:
knn_params = {"n_neighbors": np.arange(2,30,1)}

knn_model = KNeighborsRegressor()

knn_cv_model = GridSearchCV(knn_model, knn_params, cv = 10).fit(X_train, y_train)

In [None]:
knn_cv_model.best_params_

In [None]:
knn_tuned = KNeighborsRegressor(**knn_cv_model.best_params_).fit(X_train, y_train)

In [None]:
y_pred = knn_tuned.predict(X_test)

In [None]:
np.sqrt(mean_squared_error(y_test, y_pred))

# SVR 

In [None]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
y = df["Salary"]
X = df.drop('Salary', axis=1)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [None]:
svr_model = SVR("linear").fit(X_train, y_train)

In [None]:
svr_model

In [None]:
y_pred = svr_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

# SVR Tuning

In [None]:
svr_model = SVR("linear") 

svr_params = {"C": [0.01,0.001, 0.2, 0.1,0.5,0.8,0.9,1]}

svr_cv_model = GridSearchCV(svr_model, svr_params, cv = 5, n_jobs = -1, verbose =  2).fit(X_train, y_train)

In [None]:
svr_cv_model.best_params_

In [None]:
svr_tuned = SVR("linear", C = 0.001).fit(X_train, y_train)

In [None]:
y_pred = svr_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
?SVR

In [None]:
#nonlinear
svr_model = SVR() 

svr_params = {"C": [0.01,0.001, 0.2, 0.1,0.5,0.8,0.9,1, 10, 100, 500,1000]}

svr_cv_model = GridSearchCV(svr_model, svr_params, cv = 5, n_jobs = -1, verbose =  2).fit(X_train, y_train)

In [None]:
svr_cv_model.best_params_

In [None]:
svr_tuned = SVR(**svr_cv_model.best_params_).fit(X_train, y_train)

In [None]:
y_pred = svr_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

# Yapay Sinir Ağları

In [None]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
y = df["Salary"]
X = df.drop('Salary', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [None]:
scaler = StandardScaler()

In [None]:
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [None]:
scaler.fit(X_test)
X_test_scaled = scaler.transform(X_test)

In [None]:
mlp_model = MLPRegressor().fit(X_train_scaled, y_train)

In [None]:
?mlp_model

In [None]:
y_pred = mlp_model.predict(X_test_scaled)
np.sqrt(mean_squared_error(y_test, y_pred))

# Model Tuning

In [None]:
mlp_params = {"alpha": [0.1, 0.01, 0.02, 0.001, 0.0001], 
             "hidden_layer_sizes": [(10,20), (5,5), (100,100), (1000,100,10)]}

In [None]:
mlp_cv_model = GridSearchCV(mlp_model, mlp_params, cv = 10, verbose = 2, n_jobs = -1).fit(X_train_scaled, y_train)

In [None]:
mlp_cv_model.best_params_

In [None]:
mlp_tuned = MLPRegressor(**mlp_cv_model.best_params_).fit(X_train_scaled, y_train)

In [None]:
y_pred = mlp_tuned.predict(X_test_scaled)
np.sqrt(mean_squared_error(y_test, y_pred))

# CART

In [None]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
y = df["Salary"]
X = df.drop('Salary', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [None]:
cart_model = DecisionTreeRegressor(random_state = 52)
cart_model.fit(X_train, y_train)

In [None]:
#Bir fonksiyonun türevinin sıfır olduğu nokta eğitimidir, gradyanıdır, bu noktanın negatif yonu ilgili fonksiyonu azaltır.

In [None]:
y_pred = cart_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
#agac gelistirirken verinin 3'te 2'si train için 3'te 1'i başarı değerlendirme için kullanılır.

# Model Tuning

In [None]:
?cart_model

In [None]:
cart_params = {"max_depth": [2,3,4,5,10,20, 100, 1000],
              "min_samples_split": [2,10,5,30,50,10]}

In [None]:
cart_model = DecisionTreeRegressor()

In [None]:
cart_cv_model = GridSearchCV(cart_model, cart_params, cv = 10).fit(X_train, y_train)

In [None]:
cart_cv_model.best_params_

In [None]:
cart_tuned = DecisionTreeRegressor(**cart_cv_model.best_params_).fit(X_train, y_train)

In [None]:
y_pred = cart_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
cart_tuned = DecisionTreeRegressor(max_depth = 2).fit(X_train, y_train)

In [None]:
#!pip install skompiler
from skompiler import skompile
print(skompile(cart_tuned.predict).to('python/code'))

# Random Forests

In [28]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
y = df["Salary"]
X = df.drop('Salary', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [29]:
rf_model = RandomForestRegressor(random_state = 42).fit(X_train, y_train)

In [30]:
y_pred = rf_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

374.488300264767

# Model Tuning

In [None]:
df.shape

In [None]:
?rf_model

In [31]:
rf_params = {"max_depth": [5,8,10,None],
            "max_features": [2,5,10,15,17],
            "n_estimators": [100,200, 500, 1000],
            "min_samples_split": [2,5,10,20,30]}

In [32]:
rf_cv_model = GridSearchCV(rf_model, rf_params, cv = 10, n_jobs = -1, verbose = 2).fit(X_train, y_train)

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   50.7s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 1434 tasks      | elapsed:  4.9min


KeyboardInterrupt: 

In [None]:
rf_cv_model.best_params_

In [None]:
rf_tuned = RandomForestRegressor(**rf_cv_model.best_params_).fit(X_train, y_train)

In [None]:
y_pred = rf_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
rf_tuned.feature_importances_

In [None]:
Importance = pd.DataFrame({'Importance':rf_tuned.feature_importances_*100}, 
                          index = X_train.columns)


Importance.sort_values(by = 'Importance', 
                       axis = 0, 
                       ascending = True).plot(kind = 'barh', 
                                              color = 'r', )

plt.xlabel('Variable Importance')
plt.gca().legend_ = None

# Feature Engineering

In [None]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
df["NEW_catbat"] = df["CAtBat"]/df["Years"]
df["NEW_chits"] =df["CHits"]/df["Years"]
df["NEW_chmrun"] = df["CHmRun"]/df["Years"]
df["NEW_cruns"] = df["CRuns"]/df["Years"]
df["NEW_crbi"] = df["CRBI"]/df["Years"]
df["NEW_cwalks"]=df["CWalks"]/df["Years"]

y = df["Salary"]
X = df.drop('Salary', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [None]:
rf_params = {"max_depth": [5,8,None],
            "max_features": [2,3,4,5],
            "n_estimators": [100,200, 500],
            "min_samples_split": [2,5,7]}

In [None]:
rf_model = RandomForestRegressor(random_state = 42).fit(X, y)

In [None]:
rf_cv_model = GridSearchCV(rf_model, rf_params, cv = 5, n_jobs = -1, verbose = 2).fit(X, y)

In [None]:
rf_cv_model.best_params_

In [None]:
rf_tuned = RandomForestRegressor(**rf_cv_model.best_params_).fit(X_train, y_train)

In [None]:
y_pred = rf_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
Importance = pd.DataFrame({'Importance':rf_tuned.feature_importances_*100}, 
                          index = X_train.columns)


Importance.sort_values(by = 'Importance', 
                       axis = 0, 
                       ascending = True).plot(kind = 'barh', 
                                              color = 'r', )

plt.xlabel('Variable Importance')
plt.gca().legend_ = None

# GBM

In [None]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
df["NEW_catbat"] = df["CAtBat"]/df["Years"]
df["NEW_chits"] =df["CHits"]/df["Years"]
df["NEW_chmrun"] = df["CHmRun"]/df["Years"]
df["NEW_cruns"] = df["CRuns"]/df["Years"]
df["NEW_crbi"] = df["CRBI"]/df["Years"]
df["NEW_cwalks"]=df["CWalks"]/df["Years"]

y = df["Salary"]
X = df.drop('Salary', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [None]:
gbm_model = GradientBoostingRegressor().fit(X_train, y_train)

In [None]:
gbm_model

In [None]:
y_pred = gbm_model.predict(X_test)

In [None]:
np.sqrt(mean_squared_error(y_test, y_pred))

# Model Tuning

In [None]:
gbm_params = {"learning_rate": [0.001,0.1,0.01, 0.05],
             "max_depth": [3,5,8,9,10],
             "n_estimators": [200,500,1000,1500],
             "subsample": [1,0.4,0.5,0.7],
             "loss": ["ls","lad","quantile"]}

In [None]:
gbm_model = GradientBoostingRegressor().fit(X_train, y_train)

In [None]:
gbm_cv_model = GridSearchCV(gbm_model, 
                            gbm_params, 
                            cv = 10, 
                            n_jobs=-1, 
                            verbose = 2).fit(X, y)

In [None]:
gbm_cv_model.best_params_

In [None]:
?gbm_tuned

In [None]:
gbm_tuned = GradientBoostingRegressor(**gbm_cv_model.best_params_).fit(X_train, y_train)

In [None]:
y_pred = gbm_tuned.predict(X_test)

In [None]:
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
Importance = pd.DataFrame({'Importance':gbm_tuned.feature_importances_*100}, 
                          index = X_train.columns)


Importance.sort_values(by = 'Importance', 
                       axis = 0, 
                       ascending = True).plot(kind = 'barh', 
                                              color = 'r', )

plt.xlabel('Variable Importance')
plt.gca().legend_ = None

# XGBoost

In [None]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
df["NEW_catbat"] = df["CAtBat"]/df["Years"]
df["NEW_chits"] =df["CHits"]/df["Years"]
df["NEW_chmrun"] = df["CHmRun"]/df["Years"]
df["NEW_cruns"] = df["CRuns"]/df["Years"]
df["NEW_crbi"] = df["CRBI"]/df["Years"]
df["NEW_cwalks"]=df["CWalks"]/df["Years"]

y = df["Salary"]
X = df.drop('Salary', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [None]:
#!pip install xgboost

In [35]:
import xgboost
from xgboost import XGBRegressor

In [None]:
xgb = XGBRegressor().fit(X_train, y_train)
y_pred = xgb.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

# Model Tuning

In [None]:
xgb_params = {"learning_rate": [0.1,0.01,0.5],
             "max_depth": [2,3,4,5,8],
             "n_estimators": [100,200,500,1000],
             "colsample_bytree": [0.4,0.7,1]}

In [None]:
xgb_cv_model  = GridSearchCV(xgb,xgb_params, cv = 10, n_jobs = -1, verbose = 2).fit(X_train, y_train)

In [None]:
xgb_cv_model.best_params_

In [None]:
xgb_tuned = XGBRegressor(**xgb_cv_model.best_params_).fit(X_train, y_train)

In [None]:
y_pred = xgb_tuned.predict(X_test)

In [None]:
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
Importance = pd.DataFrame({'Importance':xgb_tuned.feature_importances_*100}, 
                          index = X_train.columns)


Importance.sort_values(by = 'Importance', 
                       axis = 0, 
                       ascending = True).plot(kind = 'barh', 
                                              color = 'r', )

plt.xlabel('Variable Importance')
plt.gca().legend_ = None

# LightGBM

In [1]:
print("asda")

asda


In [4]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
df["NEW_catbat"] = df["CAtBat"]/df["Years"]
df["NEW_chits"] =df["CHits"]/df["Years"]
df["NEW_chmrun"] = df["CHmRun"]/df["Years"]
df["NEW_cruns"] = df["CRuns"]/df["Years"]
df["NEW_crbi"] = df["CRBI"]/df["Years"]
df["NEW_cwalks"]=df["CWalks"]/df["Years"]

y = df["Salary"]
X = df.drop('Salary', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [None]:
#!pip install lightgbm
#conda install -c conda-forge lightgbm

In [36]:
from lightgbm import LGBMRegressor

In [6]:
lgb_model = LGBMRegressor().fit(X_train, y_train)
lgb_model

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [7]:
y_pred = lgb_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

380.3410406767106

# Model Tuning

In [8]:
lgb_model = LGBMRegressor()
lgb_model

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [9]:
lgbm_params = {"learning_rate": [0.01,0.001, 0.1, 0.5, 1],
              "n_estimators": [200,500,1000,5000],
              "max_depth": [2,4,6,7,10],
              "colsample_bytree": [1,0.8,0.5,0.4]}

In [10]:
lgbm_cv_model = GridSearchCV(lgb_model, 
                             lgbm_params, 
                             cv = 10, 
                             n_jobs = -1, 
                             verbose =2).fit(X_train, y_train)

Fitting 10 folds for each of 400 candidates, totalling 4000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    2.2s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 341 tasks      | elapsed:   31.9s
[Parallel(n_jobs=-1)]: Done 624 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 989 tasks      | elapsed:  2.4min


KeyboardInterrupt: 

In [None]:
lgbm_cv_model.best_params_

In [None]:
lgbm_tuned = LGBMRegressor(**lgbm_cv_model.best_params_).fit(X_train, y_train)

In [None]:
y_pred = lgbm_tuned.predict(X_test)

In [None]:
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
Importance = pd.DataFrame({'Importance':lgbm_tuned.feature_importances_*100}, 
                          index = X_train.columns)


Importance.sort_values(by = 'Importance', 
                       axis = 0, 
                       ascending = True).plot(kind = 'barh', 
                                              color = 'r', )

plt.xlabel('Variable Importance')
plt.gca().legend_ = None

In [None]:
?lgbm_tuned

# CatBoost

In [11]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
df["NEW_catbat"] = df["CAtBat"]/df["Years"]
df["NEW_chits"] =df["CHits"]/df["Years"]
df["NEW_chmrun"] = df["CHmRun"]/df["Years"]
df["NEW_cruns"] = df["CRuns"]/df["Years"]
df["NEW_crbi"] = df["CRBI"]/df["Years"]
df["NEW_cwalks"]=df["CWalks"]/df["Years"]

y = df["Salary"]
X = df.drop('Salary', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [37]:
#!pip install catboost
from catboost import CatBoostRegressor

In [13]:
catb_model = CatBoostRegressor(verbose = False).fit(X_train, y_train)

In [14]:
catb_params = {"iterations": [200,500,100],
              "learning_rate": [0.01,0.1],
              "depth": [3,6,8]}

In [15]:
catb_model = CatBoostRegressor()

In [16]:
catb_cv_model = GridSearchCV(catb_model, 
                           catb_params, 
                           cv = 5, 
                           n_jobs = -1, 
                           verbose = 2).fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  1.2min finished


0:	learn: 413.9241929	total: 4.51ms	remaining: 447ms
1:	learn: 394.0019250	total: 8.51ms	remaining: 417ms
2:	learn: 376.2606550	total: 12.4ms	remaining: 400ms
3:	learn: 361.4465902	total: 15.2ms	remaining: 365ms
4:	learn: 346.6496018	total: 19ms	remaining: 361ms
5:	learn: 332.5461069	total: 22ms	remaining: 345ms
6:	learn: 321.3892942	total: 25.6ms	remaining: 340ms
7:	learn: 311.7020404	total: 29.6ms	remaining: 340ms
8:	learn: 302.0717049	total: 34ms	remaining: 344ms
9:	learn: 292.7795244	total: 37.4ms	remaining: 337ms
10:	learn: 284.0375993	total: 40.4ms	remaining: 327ms
11:	learn: 278.1051858	total: 44.1ms	remaining: 323ms
12:	learn: 272.3691647	total: 48ms	remaining: 321ms
13:	learn: 267.1009049	total: 52.2ms	remaining: 320ms
14:	learn: 261.8874683	total: 55.7ms	remaining: 316ms
15:	learn: 257.9669773	total: 59.8ms	remaining: 314ms
16:	learn: 252.1146174	total: 62.4ms	remaining: 304ms
17:	learn: 247.7026406	total: 65ms	remaining: 296ms
18:	learn: 243.0233052	total: 67.5ms	remaining: 

In [17]:
catb_cv_model.best_params_

{'depth': 3, 'iterations': 100, 'learning_rate': 0.1}

In [18]:
catb_tuned = CatBoostRegressor(**catb_cv_model.best_params_).fit(X_train, y_train)

0:	learn: 413.9241929	total: 2.86ms	remaining: 283ms
1:	learn: 394.0019250	total: 5.34ms	remaining: 262ms
2:	learn: 376.2606550	total: 7.53ms	remaining: 244ms
3:	learn: 361.4465902	total: 9.28ms	remaining: 223ms
4:	learn: 346.6496018	total: 11.4ms	remaining: 217ms
5:	learn: 332.5461069	total: 13.8ms	remaining: 216ms
6:	learn: 321.3892942	total: 15.7ms	remaining: 209ms
7:	learn: 311.7020404	total: 17.4ms	remaining: 200ms
8:	learn: 302.0717049	total: 19.6ms	remaining: 198ms
9:	learn: 292.7795244	total: 22.2ms	remaining: 200ms
10:	learn: 284.0375993	total: 24.4ms	remaining: 198ms
11:	learn: 278.1051858	total: 26.4ms	remaining: 194ms
12:	learn: 272.3691647	total: 27.5ms	remaining: 184ms
13:	learn: 267.1009049	total: 29ms	remaining: 178ms
14:	learn: 261.8874683	total: 30.7ms	remaining: 174ms
15:	learn: 257.9669773	total: 32.5ms	remaining: 171ms
16:	learn: 252.1146174	total: 34.1ms	remaining: 166ms
17:	learn: 247.7026406	total: 35.8ms	remaining: 163ms
18:	learn: 243.0233052	total: 37.4ms	rem

In [19]:
np.sqrt(mean_squared_error(y_test, y_pred))

380.3410406767106

In [None]:
#1. temel modeller ve hatalarına
#2. model hiperparametre tuning
#3. final modelleri
#4. model optimizasyonu (feature engineer, aykırı, eksik, voting)


# Tum Modeller

In [21]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
df["NEW_catbat"] = df["CAtBat"]/df["Years"]
df["NEW_chits"] =df["CHits"]/df["Years"]
df["NEW_chmrun"] = df["CHmRun"]/df["Years"]
df["NEW_cruns"] = df["CRuns"]/df["Years"]
df["NEW_crbi"] = df["CRBI"]/df["Years"]
df["NEW_cwalks"]=df["CWalks"]/df["Years"]

y = df["Salary"]
X = df.drop('Salary', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [22]:
def compML(df, y, alg):
    model = alg().fit(X_train, y_train)
    y_pred = model.predict(X_test)
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))
    model_ismi = alg.__name__
    print(model_ismi, "Modeli Test Hatası:", RMSE)

In [23]:
compML(df, "Salary", SVR)

SVR Modeli Test Hatası: 505.2463160726468


In [25]:
models = [LGBMRegressor,
          GradientBoostingRegressor, 
          RandomForestRegressor, 
          DecisionTreeRegressor,
          MLPRegressor,
          KNeighborsRegressor, 
          SVR]

In [26]:
for model in models:
    compML(df, "Salary", model)

LGBMRegressor Modeli Test Hatası: 380.3410406767106
GradientBoostingRegressor Modeli Test Hatası: 353.90403695940233
RandomForestRegressor Modeli Test Hatası: 380.63908364326045
DecisionTreeRegressor Modeli Test Hatası: 412.205836049748
MLPRegressor Modeli Test Hatası: 441.9118857490756
KNeighborsRegressor Modeli Test Hatası: 441.85241391457896
SVR Modeli Test Hatası: 505.2463160726468


# ikinci yol

In [46]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.filterwarnings("ignore", category=FutureWarning) 
warnings.filterwarnings("ignore", category=UserWarning) 


In [50]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale 
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR
from sklearn.model_selection import cross_val_score

In [54]:
models = []
models.append(('LR', LinearRegression()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('CART', DecisionTreeRegressor()))
models.append(('RF', RandomForestRegressor()))
models.append(('SVR', SVR()))
models.append(('GBM', GradientBoostingRegressor()))
models.append(("XGBoost", XGBRegressor()))
models.append(("LightGBM", LGBMRegressor()))
models.append(("CatBoost", CatBoostRegressor(verbose = False)))


In [47]:
for name, model in models:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        msg = "%s: (%f)" % (name, rmse)
        print(msg)

LR: (479.071504)
KNN: (438.757154)
CART: (409.286840)
RF: (374.277263)
SVR: (505.223191)
GBM: (355.468736)
XGBoost: (356.612142)
LightGBM: (387.914002)
CatBoost: (373.653934)


In [48]:
for name, model in models:
        model.fit(X, y)
        y_pred = model.predict(X)
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        msg = "%s: (%f)" % (name, rmse)
        print(msg)

LR: (303.344473)
KNN: (258.888948)
CART: (0.000000)
RF: (110.383336)
SVR: (450.749430)
GBM: (65.268048)
XGBoost: (80.465031)
LightGBM: (131.236048)
CatBoost: (20.753093)


In [51]:
for name, model in models:
        rmse = np.sqrt(np.mean(-cross_val_score(model, X, y, cv = 10, scoring = "neg_mean_squared_error")))
        msg = "%s: (%f)" % (name, rmse)
        print(msg)

LR: (341.465977)
KNN: (333.396861)
CART: (376.331451)
RF: (278.324691)
SVR: (451.277935)
GBM: (289.917675)
XGBoost: (294.878437)
LightGBM: (292.725407)
CatBoost: (277.973144)


In [None]:
#tum modelleri tune et. Tune edilmiş modelleri kullanarak eksik, aykırı, standardizasyon, yeni_degisken kombinasyonlarını