<a href="https://colab.research.google.com/github/min02yam/project_2022_03/blob/main/2_modeling_sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE: this also hides scikit-learn convergence and scoring warnings.
import warnings
warnings.filterwarnings("ignore")

# Mount Google Drive so the CSV files under /content/drive are readable.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
# Name of the column to predict.
target = 'predicted_weight_g'

# Training data, test data and the submission template, all read from Drive.
train = pd.read_csv("/content/drive/MyDrive/2022_03/data/train_final.csv")
test = pd.read_csv("/content/drive/MyDrive/2022_03/data/test_final.csv")
submission = pd.read_csv("/content/drive/MyDrive/2022_03/data/sample_submission.csv")

# Model inputs: every training column except the target.
feature = train.columns.drop(target)

In [None]:
import random
import os
import numpy as np
def set_seed(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    NOTE: hash randomisation is fixed when the interpreter starts, so writing
    ``PYTHONHASHSEED`` here only reaches processes launched afterwards.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)


set_seed(49)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import GridSearchCV 

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, RANSACRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor,GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.neural_network import MLPRegressor

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
#from catboost import CatBoostRegressor

# Two-step pipeline used as a template: GridSearchCV replaces both the
# "preprocess" and the "regressor" step for every candidate in `grid_parm`.
# Steps are passed as a list of (name, estimator) tuples, the documented form.
pip = Pipeline([("preprocess", MinMaxScaler()), ("regressor", LinearRegression())])

# One dict per model family. Keys are pipeline step names ("preprocess",
# "regressor") or "<step>__<param>" for hyper-parameters of that step.
# Scale-sensitive models try both scalers; tree-based models skip scaling
# by setting the "preprocess" step to None.
grid_parm = [

    {'preprocess': [StandardScaler(), MinMaxScaler()],
     'regressor': [LinearRegression()]},

    {'preprocess': [StandardScaler(), MinMaxScaler()],
     'regressor': [Ridge()],
     'regressor__alpha': [0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]},

    {'preprocess': [StandardScaler(), MinMaxScaler()],
     'regressor': [Lasso()]},

    # The closing "}," of this dict was missing in the original cell, which
    # made the whole cell a SyntaxError.
    {'preprocess': [StandardScaler(), MinMaxScaler()],
     'regressor': [ElasticNet()],
     'regressor__alpha': np.arange(1e-4, 1e-3, 1e-4).tolist(),
     'regressor__l1_ratio': np.arange(0.1, 1.0, 0.1).tolist(),
     'regressor__max_iter': [100000]},

    {'preprocess': [StandardScaler(), MinMaxScaler()],
     'regressor': [KNeighborsRegressor()],
     'regressor__n_neighbors': list(range(1, 25, 2))},

    {'preprocess': [StandardScaler(), MinMaxScaler()],
     'regressor': [SVR()],
     'regressor__kernel': ['linear', 'poly', 'rbf', 'sigmoid']},

    {'preprocess': [None],
     'regressor': [DecisionTreeRegressor()],
     'regressor__max_depth': list(range(2, 15)),
     'regressor__max_leaf_nodes': [5, 50, 500, 5000]},

    {'preprocess': [None],
     'regressor': [RandomForestRegressor()],
     'regressor__n_estimators': [50, 100, 150]},

    {'preprocess': [None],
     'regressor': [AdaBoostRegressor()],
     'regressor__n_estimators': [50, 100, 150]},

    {'preprocess': [None],
     'regressor': [GradientBoostingRegressor()],
     'regressor__n_estimators': [50, 100, 150]},

    # NOTE: 35 x 8 x 9 x 4 = 10080 candidates for this family alone, each
    # fitted once per CV fold; expect a very long run.
    {'preprocess': [None],
     'regressor': [XGBRegressor()],
     'regressor__objective': ['reg:squarederror'],
     'regressor__n_estimators': list(range(100, 3600, 100)),
     'regressor__max_depth': list(range(2, 10)),
     'regressor__eta': np.arange(1e-4, 1e-3, 1e-4).tolist(),
     'regressor__subsample': [0.5, 0.6, 0.7, 0.8]},

    {'preprocess': [None],
     'regressor': [LGBMRegressor()]},

    {'preprocess': [StandardScaler(), MinMaxScaler()],
     'regressor': [MLPRegressor()],
     # Third option: three hidden layers with 100 nodes each.
     'regressor__hidden_layer_sizes': [(100,), (100, 100), (100, 100, 100)],
     'regressor__max_iter': [1000]},

]

from sklearn.inspection import permutation_importance 
from sklearn.metrics import mean_squared_error, make_scorer
def root_mean_squared_error(y_test, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computed as ``sqrt(mean_squared_error(...))`` rather than with the
    ``squared=False`` keyword, which scikit-learn deprecated in 1.4 and later
    removed. For a single target column, as used in this notebook, the result
    is the same.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        The RMSE; lower is better.
    """
    y_test, y_pred = np.asarray(y_test), np.asarray(y_pred)
    return float(np.sqrt(mean_squared_error(y_test, y_pred)))

# 5-fold grid search over every pipeline candidate in `grid_parm`.
#
# `scoring` must be a scorer with signature (estimator, X, y), and GridSearchCV
# picks the candidate with the HIGHEST score. A plain metric function
# (y_true, y_pred) cannot be passed directly, so the RMSE metric is wrapped
# with make_scorer. greater_is_better=False negates it so that the lowest
# RMSE wins. As a result `best_score_` and the cv_results_ scores are
# negative RMSE values.
grid_m = GridSearchCV(
    pip,
    grid_parm,
    cv=5,
    return_train_score=True,
    verbose=3,
    scoring=make_scorer(root_mean_squared_error, greater_is_better=False),
)

grid_m.fit(train[feature], train[target])

In [None]:
# Show the pipeline chosen by the search. `refit` is left at its default in the
# GridSearchCV call, so this is the best candidate refit on all training data.
grid_m.best_estimator_

Pipeline(steps=[('preprocess', MinMaxScaler()),
                ('regressor',
                 MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=1000))])

In [None]:
# Predict the test set with the best pipeline found by the grid search.
# ("best_medel" is the spelling used by later cells, so the name is kept.)
best_medel = grid_m.best_estimator_
pred = best_medel.predict(test[feature])

# Write the predictions into the submission frame, then copy the `case`
# column over from the test frame.
submission['predicted_weight_g'] = pred
submission['case'] = test['case']

# Save without the index column, then preview the first rows.
submission.to_csv('mlp_10_0.csv', index=False)
submission.head()

In [None]:
from sklearn.inspection import permutation_importance 
from sklearn.metrics import mean_squared_error, make_scorer

# RMSE
def root_mean_squared_error(y_test, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computed as ``sqrt(mean_squared_error(...))`` rather than with the
    ``squared=False`` keyword, which scikit-learn deprecated in 1.4 and later
    removed. For a single target column, as used in this notebook, the result
    is the same.

    Parameters
    ----------
    y_test : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values.

    Returns
    -------
    float
        The RMSE; lower is better.
    """
    y_test, y_pred = np.asarray(y_test), np.asarray(y_pred)
    return float(np.sqrt(mean_squared_error(y_test, y_pred)))


# Permutation importance of the best pipeline, evaluated on the training data.
# The scorer is negated RMSE, so a positive importance means that shuffling
# the feature increased the RMSE.
result = permutation_importance(
    best_medel,
    train[feature],
    train[target],
    scoring=make_scorer(root_mean_squared_error, greater_is_better=False),
    n_repeats=30,
)

# Feature label
Feature = train[feature]
sorted_result = np.argsort(result.importances_mean)

# DataFrame of mean importances indexed by feature name, largest first.
importances = pd.DataFrame(
    data=result.importances_mean[sorted_result],
    index=Feature.columns[sorted_result],
).sort_values(by=0, ascending=False)
importances

In [None]:
# Parameter combination (including the chosen preprocess/regressor steps) of
# the best-ranked candidate.
grid_m.best_params_

In [None]:
# Re-binds `best_medel` to the refit best pipeline; the prediction cell earlier
# in the notebook makes the same assignment.
best_medel = grid_m.best_estimator_

In [None]:
# Position of the best candidate within the arrays of `grid_m.cv_results_`.
grid_m.best_index_

In [None]:
# Show the grid that was actually searched. The original cell referenced a bare
# `param_grid` name that no cell in this notebook assigns, which raises
# NameError on a fresh kernel.
grid_m.param_grid

{'alpha': array([0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008,
        0.0009]),
 'l1_ratio': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]),
 'max_iter': [100000]}

In [None]:
# Winning parameter combination again (same expression as an earlier cell).
grid_m.best_params_

In [None]:
# NOTE(review): this references the bound `score` method without calling it,
# so the cell displays the method object rather than a number. Presumably
# `grid_m.best_score_` or `grid_m.score(X, y)` was intended; confirm.
grid_m.score

In [None]:
# Mean cross-validated score of the best candidate, measured with the
# `scoring` object given to GridSearchCV.
grid_m.best_score_

In [None]:
# Full per-candidate results dict from the search.
# NOTE(review): this prints the whole dict; `pd.DataFrame(grid_m.cv_results_)`
# would give a more readable table.
grid_m.cv_results_