# Doğrusal Olmayan Regresyon Modelleri

In [2]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale 
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR

from warnings import filterwarnings
filterwarnings('ignore')

# KNN

In [3]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
y = df["Salary"]
X = df.drop('Salary', axis=1)

In [4]:
X.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,Division_W,NewLeague_N
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,632,43,10,1,1,1
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,880,82,14,0,1,0
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,200,11,3,1,0,1
4,321,87,10,39,42,30,2,396,101,12,48,46,33,805,40,4,1,0,1
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,282,421,25,0,1,0


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [32]:
knn_model = KNeighborsRegressor().fit(X_train, y_train)

In [33]:
knn_model

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=20, p=2,
                    weights='uniform')

In [34]:
y_pred = knn_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

442.7695798487812

Hatırlatma

- train-test

- tum veri ile cv (gormediği verideki performans?)

- train-test (train uzerine cv, test ile test edilir)

# KNN Model Tuning

In [20]:
knn_model

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [40]:
RMSE = []

for k in range(20):
    k = k + 2
    knn_model = KNeighborsRegressor(n_neighbors = k).fit(X_train, y_train)
    y_pred = knn_model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    RMSE.append(rmse)
    print("k =", k, "için RMSE değeri:", rmse)

k = 2 için RMSE değeri: 402.3490108489193
k = 3 için RMSE değeri: 420.8851794272263
k = 4 için RMSE değeri: 433.042118061226
k = 5 için RMSE değeri: 438.75715389787524
k = 6 için RMSE değeri: 447.4667559140737
k = 7 için RMSE değeri: 453.52179196594597
k = 8 için RMSE değeri: 454.3240838534937
k = 9 için RMSE değeri: 459.34454670250665
k = 10 için RMSE değeri: 453.9051617388242
k = 11 için RMSE değeri: 446.22600056274575
k = 12 için RMSE değeri: 450.1945294370967
k = 13 için RMSE değeri: 451.09507598791004
k = 14 için RMSE değeri: 451.4981224280202
k = 15 için RMSE değeri: 453.8713948576225
k = 16 için RMSE değeri: 451.7888216207132
k = 17 için RMSE değeri: 452.35446754486134
k = 18 için RMSE değeri: 442.2032272885196
k = 19 için RMSE değeri: 442.68511238980847
k = 20 için RMSE değeri: 442.7695798487812
k = 21 için RMSE değeri: 440.6012781451951


In [41]:
#GridSearchCV

In [44]:
knn_params = {"n_neighbors": np.arange(2,30,1)}

knn_model = KNeighborsRegressor()

knn_cv_model = GridSearchCV(knn_model, knn_params, cv = 10).fit(X_train, y_train)

In [45]:
knn_cv_model.best_params_

{'n_neighbors': 4}

In [47]:
knn_tuned = KNeighborsRegressor(**knn_cv_model.best_params_).fit(X_train, y_train)

In [48]:
y_pred = knn_tuned.predict(X_test)

In [49]:
np.sqrt(mean_squared_error(y_test, y_pred))

433.042118061226

# SVR 

In [50]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
y = df["Salary"]
X = df.drop('Salary', axis=1)


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [52]:
svr_model = SVR("linear").fit(X_train, y_train)

In [55]:
svr_model

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [56]:
y_pred = svr_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

488.55774927996595

# SVR Tuning

In [62]:
svr_model = SVR("linear") 

svr_params = {"C": [0.01,0.001, 0.2, 0.1,0.5,0.8,0.9,1]}

svr_cv_model = GridSearchCV(svr_model, svr_params, cv = 5, n_jobs = -1, verbose =  2).fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  38 out of  40 | elapsed:   31.9s remaining:    1.7s
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   33.9s finished


In [63]:
svr_cv_model.best_params_

{'C': 0.001}

In [64]:
svr_tuned = SVR("linear", C = 0.001).fit(X_train, y_train)

In [65]:
y_pred = svr_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

488.7612678891111

In [66]:
?SVR

[0;31mInit signature:[0m
[0mSVR[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mkernel[0m[0;34m=[0m[0;34m'rbf'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mdegree[0m[0;34m=[0m[0;36m3[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mgamma[0m[0;34m=[0m[0;34m'scale'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcoef0[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtol[0m[0;34m=[0m[0;36m0.001[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mC[0m[0;34m=[0m[0;36m1.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mepsilon[0m[0;34m=[0m[0;36m0.1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mshrinking[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcache_size[0m[0;34m=[0m[0;36m200[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mverbose[0m[0;34m=[0m[0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_iter[0m[0;34m=[0m[0;34m-[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

In [76]:
#nonlinear
svr_model = SVR() 

svr_params = {"C": [0.01,0.001, 0.2, 0.1,0.5,0.8,0.9,1, 10, 100, 500,1000]}

svr_cv_model = GridSearchCV(svr_model, svr_params, cv = 5, n_jobs = -1, verbose =  2).fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    1.8s finished


In [77]:
svr_cv_model.best_params_

{'C': 1000}

In [78]:
svr_tuned = SVR(**svr_cv_model.best_params_).fit(X_train, y_train)

In [79]:
y_pred = svr_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

434.90979834824327

# Yapay Sinir Ağları

In [81]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
y = df["Salary"]
X = df.drop('Salary', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [82]:
scaler = StandardScaler()

In [83]:
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)

In [84]:
scaler.fit(X_test)
X_test_scaled = scaler.transform(X_test)

In [85]:
mlp_model = MLPRegressor().fit(X_train_scaled, y_train)

In [87]:
?mlp_model

[0;31mType:[0m        MLPRegressor
[0;31mString form:[0m
MLPRegressor(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
           beta_2= <...> ,
           tol=0.0001, validation_fraction=0.1, verbose=False,
           warm_start=False)
[0;31mFile:[0m        ~/anaconda3/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py
[0;31mDocstring:[0m  
Multi-layer Perceptron regressor.

This model optimizes the squared-loss using LBFGS or stochastic gradient
descent.

.. versionadded:: 0.18

Parameters
----------
hidden_layer_sizes : tuple, length = n_layers - 2, default=(100,)
    The ith element represents the number of neurons in the ith
    hidden layer.

activation : {'identity', 'logistic', 'tanh', 'relu'}, default='relu'
    Activation function for the hidden layer.

    - 'identity', no-op activation, useful to implement linear bottleneck,
      returns f(x) = x

    - 'logistic', the logistic sigmoid function,
      returns f(x) = 1 / (1 

In [88]:
y_pred = mlp_model.predict(X_test_scaled)
np.sqrt(mean_squared_error(y_test, y_pred))

637.4452212655443

# Model Tuning

In [93]:
mlp_params = {"alpha": [0.1, 0.01, 0.02, 0.001, 0.0001], 
             "hidden_layer_sizes": [(10,20), (5,5), (100,100), (1000,100,10)]}

In [94]:
mlp_cv_model = GridSearchCV(mlp_model, mlp_params, cv = 10, verbose = 2, n_jobs = -1).fit(X_train_scaled, y_train)

Fitting 10 folds for each of 20 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   41.0s finished


In [95]:
mlp_cv_model.best_params_

{'alpha': 0.01, 'hidden_layer_sizes': (1000, 100, 10)}

In [96]:
mlp_tuned = MLPRegressor(**mlp_cv_model.best_params_).fit(X_train_scaled, y_train)

In [97]:
y_pred = mlp_tuned.predict(X_test_scaled)
np.sqrt(mean_squared_error(y_test, y_pred))

453.50312597546514

# CART

In [103]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
df = pd.get_dummies(df, columns = ['League', 'Division', 'NewLeague'], drop_first = True)
y = df["Salary"]
X = df.drop('Salary', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.20, 
                                                    random_state=46)

In [148]:
cart_model = DecisionTreeRegressor(random_state = 52)
cart_model.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=52, splitter='best')

In [149]:
#Bir fonksiyonun türevinin sıfır olduğu nokta eğitimidir, gradyanıdır, bu noktanın negatif yonu ilgili fonksiyonu azaltır.

In [150]:
y_pred = cart_model.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

407.7930907067807

In [139]:
#agac gelistirirken verinin 3'te 2'si train için 3'te 1'i başarı değerlendirme için kullanılır.

# Model Tuning

In [157]:
?cart_model

[0;31mType:[0m        DecisionTreeRegressor
[0;31mString form:[0m
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
           max_f <...> raction_leaf=0.0, presort='deprecated',
           random_state=None, splitter='best')
[0;31mFile:[0m        ~/anaconda3/lib/python3.7/site-packages/sklearn/tree/_classes.py
[0;31mDocstring:[0m  
A decision tree regressor.

Read more in the :ref:`User Guide <tree>`.

Parameters
----------
criterion : {"mse", "friedman_mse", "mae"}, default="mse"
    The function to measure the quality of a split. Supported criteria
    are "mse" for the mean squared error, which is equal to variance
    reduction as feature selection criterion and minimizes the L2 loss
    using the mean of each terminal node, "friedman_mse", which uses mean
    squared error with Friedman's improvement score for potential splits,
    and "mae" for the mean absolute error, which minimizes the L1 loss
    using the median of each terminal node.

    .. ver

In [151]:
cart_params = {"max_depth": [2,3,4,5,10,20, 100, 1000],
              "min_samples_split": [2,10,5,30,50,10]}

In [152]:
cart_model = DecisionTreeRegressor()

In [153]:
cart_cv_model = GridSearchCV(cart_model, cart_params, cv = 10).fit(X_train, y_train)

In [154]:
cart_cv_model.best_params_

{'max_depth': 4, 'min_samples_split': 50}

In [155]:
cart_tuned = DecisionTreeRegressor(**cart_cv_model.best_params_).fit(X_train, y_train)

In [147]:
y_pred = cart_tuned.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

400.66820692864223