In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyRegressor
from catboost import CatBoostRegressor
from notifiers import get_notifier

In [2]:
def new_time_features(df, key_column, lags=(7, 15, 30, 90, 180),
                      rolling_window=7, rolling_shift=7):
    """Add calendar, rolling-mean and lag feature columns to ``df``.

    The frame is modified in place (the new columns are assigned on ``df``
    itself) and the same object is returned, so both
    ``df = new_time_features(df, ...)`` and a bare call work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes ``.month`` and ``.day`` (e.g. a
        ``DatetimeIndex``) and which contains ``key_column``.
    key_column : str
        Name of the column the lag / rolling features are derived from.
    lags : iterable of int, optional
        Row offsets for the ``lag_<n>`` columns. The default reproduces the
        original fixed set: 7, 15, 30, 90 and 180.
    rolling_window : int, optional
        Window length of the rolling mean (default 7).
    rolling_shift : int, optional
        Number of rows the series is shifted before the rolling mean is
        taken (default 7), so the mean only uses values at least that many
        rows in the past.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the columns ``month``, ``day``, ``rolling_mean`` and one
        ``lag_<n>`` column per entry of ``lags``.
    """
    df['month'] = df.index.month
    df['day'] = df.index.day

    source = df[key_column]
    # Shift first so the window never includes the current (or too recent) rows.
    df['rolling_mean'] = source.shift(rolling_shift).rolling(rolling_window).mean()
    for lag in lags:
        df['lag_{}'.format(lag)] = source.shift(lag)
    return df

In [3]:
def search_best_model(model, model_name, params, X, y, cv=None):
    """Grid-search ``params`` for ``model`` and return the best MAE and estimator.

    Parameters
    ----------
    model : estimator
        Unfitted estimator handed to ``GridSearchCV``.
    model_name : str
        Label used only in the printed summary.
    params : dict
        Parameter grid passed to ``GridSearchCV``.
    X, y :
        Features and target the search is fitted on.
    cv : optional
        Forwarded unchanged to ``GridSearchCV(cv=...)``. ``None`` (default)
        keeps the library's default splitter, i.e. the previous behaviour.
        NOTE(review): for time-ordered data a forward-chaining splitter such
        as ``sklearn.model_selection.TimeSeriesSplit`` is presumably more
        appropriate than the default folds -- confirm before relying on the
        reported score.

    Returns
    -------
    tuple
        ``(score, best_estimator)`` where ``score`` is the best mean
        cross-validated MAE as a positive number.
    """
    print('<<<-------------------...Searching for best model...------------------->>>')
    search = GridSearchCV(
        model,
        params,
        scoring='neg_mean_absolute_error',
        cv=cv,
        n_jobs=-1,
        verbose=2,
    )
    search.fit(X, y)
    # The scorer is the *negated* MAE (higher is better), so flip the sign back.
    score = -1 * search.best_score_
    print('------------------->>>Best model found successfully!<<<-------------------')
    print('\nBest {} MAE:'.format(model_name), score,
          '\nBest {} model:'.format(model_name), search.best_estimator_)
    return score, search.best_estimator_

In [5]:
def test_and_score(model):
    model.fit(X, y)
    return mean_absolute_error(y_test, model.predict(X_test))

In [6]:
def make_predictions(df, model):
    """Predict the temperature for the last seven rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a date-like index, a ``temperature`` column and the
        feature columns ``month``, ``day``, ``rolling_mean``, ``lag_7``,
        ``lag_15``, ``lag_30``, ``lag_90`` and ``lag_180``. It is not
        modified.
    model :
        Fitted estimator; ``model.predict`` receives the last seven rows
        without the ``temperature`` column.

    Returns
    -------
    pandas.DataFrame
        Indexed like the last seven rows of ``df``, with the feature columns
        removed and two columns appended: ``'День недели'`` (Russian weekday
        name) and ``'Температура'`` (prediction rounded to one decimal).
    """
    feature_columns = ['month', 'day', 'rolling_mean', 'lag_7',
                       'lag_15', 'lag_30', 'lag_90', 'lag_180']
    days_week_rus = {
        'Monday':'Понедельник', 'Tuesday':'Вторник', 'Wednesday':'Среда',
        'Thursday':'Четверг', 'Friday':'Пятница',
        'Saturday':'Суббота', 'Sunday':'Воскресенье'
    }

    X_pred = df.iloc[-7:].drop('temperature', axis=1)
    # Use the estimator that was passed in, not a notebook-level global.
    preds = model.predict(X_pred)

    # drop() returns a new frame, so the assignments below never touch ``df``.
    result = X_pred.drop(feature_columns, axis=1)
    result['День недели'] = result.index.day_name()
    result['День недели'] = result['День недели'].replace(days_week_rus)
    result['Температура'] = preds
    result['Температура'] = result['Температура'].round(1)
    return result

In [7]:
def telegram_notifier_to_channel(
    token='<telegram-bot token>',
    chat_id='@<channel name>'
):
    """Create a sender bound to a bot token and a channel id.

    The defaults are placeholders and have to be replaced with real values.

    Returns a one-argument callable: calling it with ``text`` obtains the
    ``'telegram'`` notifier and calls its ``notify`` with that text as the
    message, together with the bound ``token`` and ``chat_id``. The sender
    returns ``None``; whatever ``notify`` returns is discarded.
    """
    destination = {'token': token, 'chat_id': chat_id}

    def post_to_channel(text):
        # The notifier is looked up on every send, not once per factory call.
        get_notifier('telegram').notify(message=text, **destination)

    return post_to_channel

In [8]:
def telegram_notifier_to_bot(
    token='<telegram-bot token>',
    chat_id='<chat with bot id>'
):
    """Create a sender bound to a bot token and a bot-chat id.

    The defaults are placeholders and have to be replaced with real values.

    Returns a one-argument callable: calling it with ``text`` obtains the
    ``'telegram'`` notifier and calls its ``notify`` with that text as the
    message, together with the bound ``token`` and ``chat_id``. The sender
    returns ``None``; whatever ``notify`` returns is discarded.
    """
    destination = {'token': token, 'chat_id': chat_id}

    def post_to_bot_chat(text):
        # The notifier is looked up on every send, not once per factory call.
        get_notifier('telegram').notify(message=text, **destination)

    return post_to_bot_chat

In [9]:
# Load the weather table; the first CSV column becomes a parsed date index.
# NOTE(review): dayfirst=True presumably matches how dates are written in the file -- confirm.
df_all = pd.read_csv(
    'Datasets/weather/weather.csv',
    index_col=[0],
    parse_dates=[0],
    dayfirst=True,
)

In [10]:
# Show the loaded frame as the cell's output (last-expression display).
df_all

Unnamed: 0,temperature,month,day,rolling_mean,lag_7,lag_15,lag_30,lag_90,lag_180
2005-07-31,27.6,7,31,24.900000,26.2,28.2,17.2,8.9,-6.6
2005-08-01,26.1,8,1,25.414286,27.2,27.2,15.6,10.0,-8.0
2005-08-02,25.7,8,2,25.328571,23.2,23.6,21.3,16.0,-10.6
2005-08-03,23.8,8,3,25.128571,24.1,23.8,21.9,20.0,-8.6
2005-08-04,22.7,8,4,24.742857,23.3,25.5,23.2,20.2,-8.1
...,...,...,...,...,...,...,...,...,...
2021-08-30,22.8,8,30,24.271429,20.2,26.2,25.8,16.6,3.2
2021-08-31,25.2,8,31,22.728571,19.3,27.6,25.2,20.6,2.5
2021-09-01,19.9,9,1,21.100000,19.2,30.1,29.7,20.4,-1.7
2021-09-02,13.4,9,2,20.485714,19.3,30.6,22.8,22.9,-4.2


In [11]:
# Split the frame into the target series and the remaining feature columns.
y = df_all['temperature']
X = df_all.drop(columns='temperature')
print('Features: {}\nTarget: {}'.format(X.shape, y.shape))

Features: (5879, 8)
Target: (5879,)


In [12]:
# Order-preserving split (shuffle=False): the last 5% of rows become the test part.
# NOTE: X and y are rebound to the remaining rows, so re-running this cell
# without re-creating X and y first shrinks them again.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.05, shuffle=False)
# A second order-preserving split of the remainder into train and validation parts.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.055, shuffle=False)

feature_parts = (X_train, X_valid, X_test)
target_parts = (y_train, y_valid, y_test)
print('Features', *(part.shape for part in feature_parts), sep='  |  ')
print('Target ', *(part.shape for part in target_parts), sep='   |   ')
# Share of each part in the full dataset, in whole percent.
print('Ratio',
      *(round(len(part) / len(df_all) * 100) for part in feature_parts),
      sep='     |     ')

Features  |  (5277, 8)  |  (308, 8)  |  (294, 8)
Target    |   (5277,)   |   (308,)   |   (294,)
Ratio     |     90     |     5     |     5


In [13]:
# Only `iterations` varies (1000..3000 in steps of 100); the other grid
# entries are single-valued and therefore fixed for every candidate.
cb_param_grid = {
    'iterations': range(1000, 3100, 100),
    'verbose': [0],
    'random_state': [555],
    'loss_function': ['MAE'],
}
best_cb_score, best_cb_model = search_best_model(
    CatBoostRegressor(), 'CatBoost', cb_param_grid, X, y
)

<<<-------------------...Searching for best model...------------------->>>
Fitting 5 folds for each of 21 candidates, totalling 105 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 105 out of 105 | elapsed:  5.5min finished


------------------->>>Best model found successfully!<<<-------------------

Best CatBoost MAE: 4.112344137664333 
Best CatBoost model: <catboost.core.CatBoostRegressor object at 0x7f88a89f2be0>


In [14]:
# Build the bot-chat sender with its default (placeholder) token and chat id ...
send_bot = telegram_notifier_to_bot()
# ... and send a short "training finished" message through it.
send_bot('Я обучился! Муррр!')

In [16]:
# Score both models first, in the same order as before (CatBoost, then the
# dummy baseline); each test_and_score call fits the model it is given.
cb_test_mae = test_and_score(best_cb_model)
dummy_test_mae = test_and_score(DummyRegressor())

score_table = pd.DataFrame(
    data=([best_cb_score, cb_test_mae],
          ['--------', dummy_test_mae]),  # no grid-search score for the baseline
    columns=['Valid MAE', 'Test MAE'],
    index=['CatBoost', 'Dummy'],
)
display(score_table)

Unnamed: 0,Valid MAE,Test MAE
CatBoost,4.11234,4.376287
Dummy,--------,12.216752


In [17]:
# Report both scores with two decimals; test_and_score fits the model again here.
report_template = 'Мои успехи:\nValid MAE: {:.2f}\nTest MAE: {:.2f}'
send_bot(report_template.format(best_cb_score, test_and_score(best_cb_model)))

In [18]:
# Rebuild X and y from the whole frame, replacing the reduced versions left
# by the earlier train/test split.
y = df_all['temperature']
X = df_all.drop(columns='temperature')
print('Features: {}\nTarget: {}'.format(X.shape, y.shape))

Features: (5879, 8)
Target: (5879,)


In [19]:
# Fit the selected estimator on X and y as currently bound (the full dataset
# when the preceding cell has just rebuilt them from df_all).
best_cb_model.fit(X, y)

<catboost.core.CatBoostRegressor at 0x7f88a89f2be0>

In [20]:
# Seven daily timestamps starting at the last known date, moved forward by
# one period and normalized, i.e. the seven days after the end of the data.
last_known_day = df_all.index[-1]
week_ahead = pd.date_range(last_known_day, periods=7, freq='D').shift().normalize()
next_7_dates = pd.Series(week_ahead)
next_7_dates

0   2021-09-04
1   2021-09-05
2   2021-09-06
3   2021-09-07
4   2021-09-08
5   2021-09-09
6   2021-09-10
dtype: datetime64[ns]

In [21]:
# Empty rows (all values missing) for the future dates, with the same columns as df_all.
df_all_next = pd.DataFrame(index=next_7_dates, columns=df_all.columns)
# NOTE: df_all is rebound to the extended frame, so this cell is not
# idempotent -- running it again appends the same rows a second time.
df_all = pd.concat([df_all, df_all_next])
df_all.tail(10)

Unnamed: 0,temperature,month,day,rolling_mean,lag_7,lag_15,lag_30,lag_90,lag_180
2021-09-01,19.9,9.0,1.0,21.1,19.2,30.1,29.7,20.4,-1.7
2021-09-02,13.4,9.0,2.0,20.485714,19.3,30.6,22.8,22.9,-4.2
2021-09-03,9.9,9.0,3.0,20.071429,22.0,23.6,23.3,22.6,-2.0
2021-09-04,,,,,,,,,
2021-09-05,,,,,,,,,
2021-09-06,,,,,,,,,
2021-09-07,,,,,,,,,
2021-09-08,,,,,,,,,
2021-09-09,,,,,,,,,
2021-09-10,,,,,,,,,


In [22]:
# Recompute the calendar / rolling / lag columns over the extended frame so
# the appended future rows get feature values derived from earlier temperatures.
df_all = new_time_features(df_all, 'temperature')
df_all.tail(10)

Unnamed: 0,temperature,month,day,rolling_mean,lag_7,lag_15,lag_30,lag_90,lag_180
2021-09-01,19.9,9,1,21.1,19.2,30.1,29.7,20.4,-1.7
2021-09-02,13.4,9,2,20.485714,19.3,30.6,22.8,22.9,-4.2
2021-09-03,9.9,9,3,20.071429,22.0,23.6,23.3,22.6,-2.0
2021-09-04,,9,4,20.085714,21.0,24.9,22.5,23.0,-4.9
2021-09-05,,9,5,20.528571,22.7,20.9,28.0,20.8,-9.5
2021-09-06,,9,6,20.9,22.8,19.6,25.1,21.5,-12.5
2021-09-07,,9,7,21.742857,25.2,20.2,23.3,19.0,-6.5
2021-09-08,,9,8,21.842857,19.9,19.3,28.5,16.2,-5.1
2021-09-09,,9,9,21.0,13.4,19.2,28.1,17.4,-2.1
2021-09-10,,9,10,19.271429,9.9,19.3,21.8,21.7,2.7


In [23]:
# Forecast table for the last seven rows of df_all (the appended future dates).
predictions_temperature = make_predictions(df_all, best_cb_model)
predictions_temperature

Unnamed: 0,День недели,Температура
2021-09-04,Суббота,16.9
2021-09-05,Воскресенье,17.2
2021-09-06,Понедельник,17.8
2021-09-07,Вторник,19.2
2021-09-08,Среда,18.9
2021-09-09,Четверг,17.4
2021-09-10,Пятница,15.5


In [24]:
# Message header: the forecast period runs from the first to the last index date.
predictions_text = (
    'Предсказания на неделю:'
    '\nc "{}" по "{}"\n'
    '\nТемпература | День недели'
    .format(predictions_temperature.index[0].date(),
            predictions_temperature.index[-1].date())
)
# One line per forecast row: temperature first, weekday second. Columns are
# read by position through .iloc (0 = weekday, 1 = temperature) because
# plain [] with an integer on a label-indexed row is deprecated in recent
# pandas. The join also works for any number of rows.
predictions_text += ''.join(
    '\n{}            {}'.format(temperature, weekday)
    for weekday, temperature in zip(predictions_temperature.iloc[:, 0],
                                    predictions_temperature.iloc[:, 1])
)

In [25]:
# Build the channel sender with its default (placeholder) token and channel id ...
send_channel = telegram_notifier_to_channel()
# ... and publish the formatted forecast message through it.
send_channel(predictions_text)