In [128]:
from catboost import CatBoostRegressor
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_percentage_error
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including any pandas deprecation or chained-assignment warnings that later
# cells might raise. Consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [129]:
# Load the raw data set (path is relative to the working directory) and
# preview the first five rows.
df = pd.read_excel(io='linktera_veri_bilimi_veri_seti.xlsx')
df.head(n=5)

Unnamed: 0,Date,Otomotiv Satis,OTV Orani,Faiz,EUR/TL,Kredi Stok
0,2010-01-01,20095.0,37.0,13.182,2.107385,341244.126
1,2010-02-01,31172.0,37.0,13.2725,2.07389,351940.949
2,2010-03-01,51769.0,37.0,12.7275,2.085522,361307.32
3,2010-04-01,54946.0,37.0,12.012,2.00901,373575.34
4,2010-05-01,59377.0,37.0,11.74,1.94887,387708.051


In [130]:
# One stacked panel per series, all sharing the date axis.
# Each entry is (column in df, legend label), listed top to bottom.
panels = [
    ('Otomotiv Satis', 'Otomotiv Satışı'),
    ('OTV Orani', 'OTV Oranı'),
    ('Faiz', 'Faiz'),
    ('EUR/TL', 'EUR/TL'),
]

# Build the subplot grid with one row per panel.
fig = make_subplots(rows=len(panels), cols=1, shared_xaxes=True)

# Add each series to its own row.
for row_no, (column, label) in enumerate(panels, start=1):
    fig.add_trace(go.Scatter(x=df['Date'], y=df[column], name=label), row=row_no, col=1)

# Layout tweaks, then render.
fig.update_layout(title='Otomotiv Verileri', height=1000)
fig.show()

# Feature Engineering

In [131]:
def datetime_features(df_temp):
    """
    Add calendar features derived from the ``Date`` column.

    The frame is modified in place and the same object is returned, so the
    result can be reassigned or chained.

    :param df_temp: DataFrame with a datetime-typed ``Date`` column
    :return: ``df_temp`` with the columns ``month``, ``year``, ``dayofweek``,
             ``quarter``, ``dayofmonth`` and ``weekofyear`` added
    """
    dates = df_temp['Date'].dt
    df_temp['month'] = dates.month
    df_temp['year'] = dates.year
    df_temp['dayofweek'] = dates.dayofweek
    df_temp['quarter'] = dates.quarter
    df_temp['dayofmonth'] = dates.day

    if hasattr(dates, 'isocalendar'):
        # ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed in
        # 2.0; ``isocalendar().week`` is the replacement. It returns a nullable
        # UInt32 column, so cast back to a plain numpy dtype like the other
        # features (float only when missing dates force NaN).
        iso_week = dates.isocalendar().week
        week_dtype = 'float64' if iso_week.isna().any() else 'int64'
        df_temp['weekofyear'] = iso_week.astype(week_dtype)
    else:
        # pandas < 1.1 has no ``isocalendar``; fall back to the old accessor.
        df_temp['weekofyear'] = dates.weekofyear
    return df_temp

def seasonality_features(df_temp):
    """
    Encode the ``month`` column cyclically with a 12-month period.

    Adds ``month_sin`` and ``month_cos`` to ``df_temp`` in place and returns
    the same frame.
    """
    # Angle of each month on the yearly circle, shared by both encodings.
    angle = 2*np.pi*df_temp['month']/12
    for column, trig in (('month_sin', np.sin), ('month_cos', np.cos)):
        df_temp[column] = trig(angle)
    return df_temp

# Calendar features must come first: the seasonal encoding reads the 'month'
# column they create.
df = seasonality_features(datetime_features(df))
df.head()

Unnamed: 0,Date,Otomotiv Satis,OTV Orani,Faiz,EUR/TL,Kredi Stok,month,year,dayofweek,quarter,dayofmonth,weekofyear,month_sin,month_cos
0,2010-01-01,20095.0,37.0,13.182,2.107385,341244.126,1,2010,4,1,1,53,0.5,0.8660254
1,2010-02-01,31172.0,37.0,13.2725,2.07389,351940.949,2,2010,0,1,1,5,0.866025,0.5
2,2010-03-01,51769.0,37.0,12.7275,2.085522,361307.32,3,2010,0,1,1,9,1.0,6.123234000000001e-17
3,2010-04-01,54946.0,37.0,12.012,2.00901,373575.34,4,2010,3,2,1,13,0.866025,-0.5
4,2010-05-01,59377.0,37.0,11.74,1.94887,387708.051,5,2010,5,2,1,17,0.5,-0.8660254


# Train / Forecast Split

In [132]:
# The first N_TRAIN_ROWS rows are used for training; the remaining rows are the
# ones to be forecast.
N_TRAIN_ROWS = 149

# .copy() makes both frames independent of df, so a later column assignment on
# submission_df does not write into a slice of df (chained assignment).
train = df.iloc[:N_TRAIN_ROWS].copy()
submission_df = df.iloc[N_TRAIN_ROWS:].copy()

In [133]:
# Feature frame: everything except the date stamp and the target itself.
X = train.drop(['Date', 'Otomotiv Satis'], axis=1)
# Target series.
y = train['Otomotiv Satis']

In [134]:
# Model inputs: the raw input columns followed by the engineered calendar
# and seasonality features.
feature_list = [
    'OTV Orani', 'Faiz', 'EUR/TL', 'Kredi Stok',
    'month', 'year', 'dayofweek', 'quarter', 'dayofmonth', 'weekofyear',
    'month_sin', 'month_cos',
]

# CatBoost Model

In [135]:
# Keyword arguments forwarded to CatBoostRegressor.
params = dict(
    learning_rate=0.03,
    objective='MAE',
    depth=3,
    early_stopping_rounds=100,
    iterations=1000,
    eval_metric="MAPE",
)

In [136]:
def catboost_trainer(X,
                     y,
                     submission_df,
                     cv,
                     model_params,
                     feature_list,
                     scorer,
                     target_transform=False,
                     model_path="linktera_case_model.cbm"):
    """
    Train one CatBoost regressor per cross-validation fold and predict the
    unseen data with each of them.

    ---------
    :param X: training data (DataFrame containing at least ``feature_list``)
    :param y: target, positionally aligned with ``X``
    :param submission_df: dataframe to be predicted (must contain ``feature_list``)
    :param cv: cross validation object; ``cv.split(X)`` must yield
               ``(train_index, test_index)`` pairs of positional indices
    :param model_params: dict of keyword arguments for ``CatBoostRegressor``
    :param feature_list: list of features used for training and inference
    :param scorer: callable ``scorer(y_true, y_pred)`` returning the fold score
    :param target_transform: if True, fit on ``log1p(y)`` and map every
                             prediction back with ``expm1``; scores are always
                             computed on the original target scale
    :param model_path: file the last fold's model is saved to; ``None`` skips
                       saving. Nothing is saved when ``cv`` yields no folds.
    :return: CV score list, fitted models (one per fold) and the per-fold
             predictions for ``submission_df``
    """
    score_list = []
    unseen_preds = []
    models = []

    # Loop-invariant: the rows to forecast, restricted to the model features.
    sub_df_subset = submission_df[feature_list]

    def _predict(fitted_model, features):
        """Predict on the original target scale, undoing log1p if it was applied."""
        preds = fitted_model.predict(features)
        return np.expm1(preds) if target_transform else preds

    for fold, (train_index, test_index) in enumerate(cv.split(X), start=1):
        X_train, X_val = X.iloc[train_index][feature_list], X.iloc[test_index][feature_list]
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]
        print(f"Training data shape: {X_train.shape}, Validation data shape: {X_val.shape}")

        # Fit on the (optionally) log-transformed target, but keep the raw
        # target for scoring instead of round-tripping it through log1p/expm1.
        if target_transform:
            y_train_fit, y_val_fit = np.log1p(y_train), np.log1p(y_val)
        else:
            y_train_fit, y_val_fit = y_train, y_val

        model = CatBoostRegressor(**model_params)
        # NOTE(review): the validation fold is passed as eval_set, so when
        # model_params enable early stopping the same fold both selects the
        # iteration count and is scored below; treat the scores as optimistic.
        model.fit(X_train, y_train_fit,
                  eval_set=[(X_val, y_val_fit)],
                  verbose=500)
        models.append(model)

        forecast_pred = _predict(model, sub_df_subset)
        unseen_preds.append(forecast_pred)

        score = scorer(y_val, _predict(model, X_val))
        score_t = scorer(y_train, _predict(model, X_train))

        print(f"Score FOLD-{fold}:{score}")
        print(f"Score Train FOLD-{fold}:{score_t}")
        print(f'Predicted Mean:{np.mean(forecast_pred)}')
        score_list.append(score)
        print('*'*50)

    # Persist the model of the last fold (same as before), but only if at
    # least one fold was trained; previously an empty split raised here.
    if models and model_path is not None:
        models[-1].save_model(model_path)
    print("Mean MAPE:", np.mean(score_list),"Std MAPE:",np.std(score_list))
    return score_list, models, unseen_preds

In [137]:
# Three time-ordered folds, each validated on a block of 12 rows.
(score_list_stage_one_cat,
 models_stage_one_cat,
 unseen_preds_stage_one_cat) = catboost_trainer(
    X=X,
    y=y,
    submission_df=submission_df,
    cv=TimeSeriesSplit(n_splits=3, test_size=12),
    model_params=params,
    feature_list=feature_list,
    scorer=mean_absolute_percentage_error,
    target_transform=False,
)

Training data shape: (113, 12), Validation data shape: (12, 12)
0:	learn: 0.3673425	test: 0.9003940	best: 0.9003940 (0)	total: 465us	remaining: 466ms
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3595488733
bestIteration = 203

Shrink model to first 204 iterations.
Score FOLD-1:0.35954887333074187
Score Train FOLD-1:0.0849314404086061
Predicted Mean:67212.71414266368
**************************************************
Training data shape: (125, 12), Validation data shape: (12, 12)
0:	learn: 0.3979424	test: 0.2213404	best: 0.2213404 (0)	total: 606us	remaining: 605ms
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.2164502064
bestIteration = 4

Shrink model to first 5 iterations.
Score FOLD-2:0.21645020642061064
Score Train FOLD-2:0.37355026886183623
Predicted Mean:63970.4539541164
**************************************************
Training data shape: (137, 12), Validation data shape: (12, 12)
0:	learn: 0.3843353	test: 0.1646374	best: 0.1646374 

# Feature Importance

In [138]:
# Per-fold importances, one array per fitted model.
importance = [fold_model.get_feature_importance() for fold_model in models_stage_one_cat]

# Pair every feature name with its importance averaged over the folds and
# order ascending, so the strongest features end up at the tail.
feature_names = pd.Series(X[feature_list].columns.to_list(), name='Feature')
mean_importance = pd.Series(np.mean(importance, axis=0), name="Importance")
f_importance = pd.concat([feature_names, mean_importance], axis=1)
f_importance = f_importance.sort_values(by='Importance', ascending=True)

# Horizontal bar chart of (at most) the 20 highest-ranked features.
fig = px.bar(f_importance.tail(20), x='Importance', y='Feature')
fig.update_layout(title_text="First 20 Important Features - CatBoost Average of Folds")
fig.show()

# Prediction Results

In [139]:
# Final forecast: element-wise mean of the per-fold predictions.
mean_forecast = np.mean(unseen_preds_stage_one_cat, axis=0)

# Work on an independent copy so the column assignment does not target a
# slice of df (chained assignment).
submission_df = submission_df.copy()
submission_df['Otomotiv Satis'] = mean_forecast

# Write the forecast back into the full frame. The target column is looked up
# by name: a hard-coded position (1:2) silently hits a different column if the
# column layout changes, e.g. when the sheet carries a leading index column.
target_col_pos = df.columns.get_loc('Otomotiv Satis')
df.iloc[149:, target_col_pos] = mean_forecast

In [140]:
# Line chart of the forecast rows only.
forecast_trace = go.Scatter(x=submission_df['Date'],
                            y=submission_df['Otomotiv Satis'],
                            mode='lines')
fig1 = go.Figure()
fig1.add_trace(forecast_trace)

In [141]:
# Line chart of the whole series in df (forecast rows included once they have
# been written back into df).
full_series_trace = go.Scatter(x=df['Date'],
                               y=df['Otomotiv Satis'],
                               mode='lines')
fig1 = go.Figure()
fig1.add_trace(full_series_trace)