# Setting

## Install

In [1]:
!pip3 install catboost

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
!pip3 install seaborn

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0m

In [3]:
!pip3 install tqdm

You should consider upgrading via the '/Library/Frameworks/Python.framework/Versions/3.10/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0m

## 라이브러리

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold

from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm.notebook import tqdm

# Load Data

In [5]:
# All input CSVs live in one directory; edit DATA_DIR to relocate them.
DATA_DIR = '/Users/namwoo/Desktop/jeju'

df_submission_path = f'{DATA_DIR}/sample_submission.csv'
# Single leading slash, consistent with the other paths
# (a path starting with '//' is implementation-defined on POSIX).
df_train_path = f'{DATA_DIR}/df_train_V2.csv'
df_test_path = f'{DATA_DIR}/df_test_V2.csv'

In [6]:
# Read the training CSV first, then the test CSV, from the configured paths.
df_train = pd.read_csv(filepath_or_buffer=df_train_path)
df_test = pd.read_csv(filepath_or_buffer=df_test_path)

In [7]:
# Print the test frame's column names, non-null counts and dtypes.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 291241 entries, 0 to 291240
Data columns (total 37 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   Unnamed: 0             291241 non-null  int64 
 1   base_date              291241 non-null  int64 
 2   day_of_week            291241 non-null  object
 3   base_hour              291241 non-null  object
 4   lane_count             291241 non-null  object
 5   road_rating            291241 non-null  object
 6   multi_linked           291241 non-null  object
 7   connect_code           291241 non-null  object
 8   road_type              291241 non-null  object
 9   maximum_speed_limit    291241 non-null  object
 10  weight_restricted      291241 non-null  object
 11  start_node_name        291241 non-null  object
 12  end_node_name          291241 non-null  object
 13  start_latitude         291241 non-null  object
 14  end_latitude           291241 non-null  object
 15  

# Preprocessing

In [8]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV).
# Rebinding with errors='ignore' keeps this cell safe to re-run after the
# column is already gone, instead of raising KeyError.
df_train = df_train.drop(columns='Unnamed: 0', errors='ignore')
df_test = df_test.drop(columns='Unnamed: 0', errors='ignore')

In [9]:
# Separate the label column from the feature matrix.
y = df_train.loc[:, 'target']
X = df_train.drop('target', axis=1)

In [10]:
# Collect the names of all object-dtype columns of X; this list is later
# passed to CatBoost as cat_features.
cat_cols = [name for name, dtype in X.dtypes.items() if dtype == "object"]
print("Number of Categorical features: ", len(cat_cols))

Number of Categorical features:  34


# Catboost

In [11]:
# Training configuration shared by the cells below.
n_KFold = 7                  # number of KFold splits
early_stopping_rounds = 100  # forwarded to CatBoost's fit()

num_model = 0                # index of the model being trained; key into CAT_models

In [14]:
# Train one CatBoost regressor per KFold split and keep every fitted model
# in CAT_models, keyed by its fold number (0, 1, 2, ...).
CAT_models = {}

# Reset the fold counter inside this cell so that re-running it (for example
# after an interrupted run) always numbers the models from 0 again.
num_model = 0

kf = KFold(n_splits=n_KFold, shuffle=True, random_state=2022)

for train_index, vaild_index in kf.split(X):

  print(f'\n----------------- {num_model} 번 모델 학습 -----------------\n')

  # Positional split of features and labels for this fold.
  X_train, X_vaild = X.iloc[train_index], X.iloc[vaild_index]
  y_train, y_vaild = y.iloc[train_index], y.iloc[vaild_index]

  CAT = CatBoostRegressor(
                          # Disabled experiments, kept for reference:
                          # learning_rate=0.148147,
                          # feature_weights=feature__weights,
                          loss_function='MAE',
                          task_type='CPU',
                          use_best_model=True
                          )

  # The held-out fold serves as the evaluation set for early stopping.
  CAT.fit(X_train, y_train,
          eval_set=(X_vaild, y_vaild),
          cat_features=cat_cols,
          early_stopping_rounds=early_stopping_rounds,
          use_best_model=True,
          verbose=100)

  y_pred = CAT.predict(X_vaild)

  # scikit-learn's signature is mean_absolute_error(y_true, y_pred).
  MAE = mean_absolute_error(y_vaild, y_pred)

  print(f'\n{num_model} 번 Model Vaild MAE : {MAE}\n')

  CAT_models[num_model] = CAT

  num_model += 1


----------------- 0 번 모델 학습 -----------------

0:	learn: 12.8996234	test: 12.8759129	best: 12.8759129 (0)	total: 4.34s	remaining: 1h 12m 20s
100:	learn: 5.0068548	test: 5.0189087	best: 5.0189087 (100)	total: 6m 9s	remaining: 54m 47s
200:	learn: 4.6545017	test: 4.6519716	best: 4.6519716 (200)	total: 12m 58s	remaining: 51m 34s
300:	learn: 4.4998382	test: 4.4922010	best: 4.4922010 (300)	total: 20m 22s	remaining: 47m 20s
400:	learn: 4.3882313	test: 4.3761815	best: 4.3761815 (400)	total: 28m 11s	remaining: 42m 6s
500:	learn: 4.2999188	test: 4.2845073	best: 4.2845073 (500)	total: 36m 32s	remaining: 36m 23s
600:	learn: 4.2332090	test: 4.2150930	best: 4.2150930 (600)	total: 44m 45s	remaining: 29m 42s


KeyboardInterrupt: 

# Submission

In [None]:
# Load the sample submission CSV; it is filled in and written out further below.
sample_submission = pd.read_csv(df_submission_path)

# Set every column except the first to 0.
# NOTE(review): presumably the first column is the row identifier - confirm.
sample_submission.iloc[:,1:] = 0

In [None]:
# Average the test-set predictions of every fold model that finished training.
fold_models = list(CAT_models.values())

if not fold_models:
  # Fail with an explicit message instead of a bare KeyError on CAT_models[0].
  raise RuntimeError('CAT_models is empty - run the training cell to completion first.')

if len(fold_models) != n_KFold:
  # Make a partial ensemble visible rather than silently submitting it.
  print(f'Warning: ensembling {len(fold_models)} of {n_KFold} planned fold models.')

ensemble_array = np.zeros(df_test.shape[0])

for fold_model in tqdm(fold_models):

  # Each model contributes an equal share of the final prediction.
  ensemble_array += fold_model.predict(df_test)/len(fold_models)

sample_submission['target'] = ensemble_array

In [None]:
# Write the submission frame to CSV without the DataFrame index.
sample_submission.to_csv("KFold_7_V2.csv", index = False)

In [None]:
# Display the submission frame as this cell's output.
sample_submission

# Feature Importance

In [None]:
def plot_feature_importance(importance,names,model_type):
    """Draw a horizontal bar chart of feature importances, largest first.

    Args:
        importance: Sequence of importance values, one per feature.
        names: Sequence of feature names, in the same order as ``importance``.
        model_type: String prepended to the plot title.

    Returns:
        None. The chart is drawn on a new 10x8 matplotlib figure.
    """
    # Pair each name with its importance and rank from highest to lowest.
    ranked = pd.DataFrame({
        'feature_names': np.array(names),
        'feature_importance': np.array(importance),
    }).sort_values(by=['feature_importance'], ascending=False)

    plt.figure(figsize=(10,8))

    # Importance on the x axis, one bar per feature on the y axis.
    sns.barplot(x=ranked['feature_importance'], y=ranked['feature_names'])

    plt.title(model_type + ' Feature Importance')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Names')

In [None]:
# Plot the feature importances of the fold-0 model, labelled with the test frame's columns.
first_fold_importance = CAT_models[0].get_feature_importance()
plot_feature_importance(first_fold_importance, df_test.columns, 'CATBOOST')