In [1]:
%pip install -Uq upgini catboost

[K     |████████████████████████████████| 91 kB 4.8 MB/s 
[K     |████████████████████████████████| 76.6 MB 1.2 MB/s 
[K     |████████████████████████████████| 1.6 MB 42.4 MB/s 
[K     |████████████████████████████████| 12.2 MB 60.4 MB/s 
[K     |████████████████████████████████| 2.0 MB 36.4 MB/s 
[?25h

In [2]:
from os.path import exists
import pandas as pd 

df_path="train.csv.zip" if exists("train.csv.zip") else "https://github.com/upgini/upgini/raw/main/notebooks/train.csv.zip"
df=pd.read_csv(df_path)
df=df.sample(n=19_000, random_state=0)
df["store"]=df["store"].astype(str)
df["item"]=df["item"].astype(str)

df["date"]=pd.to_datetime(df["date"])

df.sort_values("date", inplace=True)
df.reset_index(inplace=True, drop=True)
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,7,5,5
1,2013-01-01,4,9,19
2,2013-01-01,1,33,37
3,2013-01-01,3,41,14
4,2013-01-01,5,24,26


In [3]:
#splitting the data set trainig and testing data sets 
train = df[df["date"]<"2017-01-01"]
test = df[df["date"] >="2017-01-01"]

In [4]:
#features and labels
train_features = train.drop(columns=["sales"])
train_target = train["sales"]
test_features = train.drop(columns=["sales"])
test_target = train["sales"]

In [5]:
#enrich the features with upgini 
from upgini import FeaturesEnricher, SearchKey
from upgini.metadata import CVType

enricher = FeaturesEnricher(
    search_keys={
        "date":SearchKey.DATE
    },
    cv = CVType.time_series
    
)
enricher.fit(train_features,
             train_target,
             eval_set=[(test_features, test_target)])

<IPython.core.display.Javascript object>

Try to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IPv4 to your training dataset
for search through all the available data sources.
See docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history
Detected task type: ModelTaskType.REGRESSION



Column name,Status,Errors
target,All valid,-
date,All valid,-



Running search request, search_id=ac0c2ee3-bf68-4d58-b5f6-35c4eef6af7e
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

[92m[1m
25 relevant feature(s) found with the search keys: ['date'][0m


Provider,Source,Feature name,SHAP value,Coverage %,Type,Feature type
,,item,0.488656,100.0,categorical,
,,store,0.172407,100.0,categorical,
Upgini,Public/Comm. shared,f_weather_date_weather_pca_0_d7e0a1fc,0.056252,100.0,numerical,Free
Upgini,Public/Comm. shared,f_events_date_week_sin1_847b5db1,0.047397,100.0,numerical,Free
Upgini,Public/Comm. shared,f_events_date_week_cos1_f6a8c1fc,0.030201,100.0,numerical,Free
Upgini,Public/Comm. shared,f_weather_date_weather_umap_48_b39cd0c4,0.025656,100.0,numerical,Free
Upgini,Public/Comm. shared,f_weather_date_weather_umap_24_2e14c9a6,0.018649,100.0,numerical,Free
Upgini,Public/Comm. shared,f_weather_date_weather_umap_33_89bb7578,0.015129,100.0,numerical,Free
Upgini,Public/Comm. shared,f_events_date_year_cos1_9014a856,0.013152,100.0,numerical,Free
Upgini,Public/Comm. shared,f_financial_date_silver_14e835ea,0.007449,100.0,numerical,Free


In [6]:
from catboost import CatBoostRegressor
from catboost.utils import eval_metric

model = CatBoostRegressor(verbose=False ,allow_writing_files=False,random_state=0)

enricher.calculate_metrics(

     estimator = model,
     scoring = "mean_absolute_percentage_error"
)

Calculating metrics...
Done


Unnamed: 0,Match rate,Baseline mean_absolute_percentage_error,Enriched mean_absolute_percentage_error,Uplift
,,,,
Train,100.0,0.254322,0.166509,0.087813
Eval 1,100.0,0.263614,0.154363,0.109251


In [7]:
enriched_train_features = enricher.transform(train_features, keep_input=True)
enriched_test_features = enricher.transform(test_features, keep_input=True)
enriched_train_features.head()


Try to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IPv4 to your training dataset
for search through all the available data sources.
See docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history


Column name,Status,Errors
date,All valid,-



Running search request, search_id=e8108be7-7e18-4b65-819f-fab5b5084e40
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Retrieving selected features from data sources...
Done
Try to add other keys like the COUNTRY, POSTAL_CODE, PHONE NUMBER, EMAIL/HEM, IPv4 to your training dataset
for search through all the available data sources.
See docs https://github.com/upgini/upgini#-total-239-countries-and-up-to-41-years-of-history


Column name,Status,Errors
date,All valid,-



Running search request, search_id=5b335bc6-2f6f-481a-a6c2-ea8f5291fec0
We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
Done

Retrieving selected features from data sources...
Done


Unnamed: 0,date,store,item,f_weather_date_weather_pca_0_d7e0a1fc,f_events_date_week_sin1_847b5db1,f_events_date_week_cos1_f6a8c1fc,f_weather_date_weather_umap_48_b39cd0c4,f_weather_date_weather_umap_24_2e14c9a6,f_weather_date_weather_umap_33_89bb7578,f_events_date_year_cos1_9014a856,...,f_events_date_italy_game_cnt_99570b80,f_financial_date_nasdaq_c568533e,f_financial_date_dow_jones_7d_to_7d_1y_shift_61f71e90,f_economic_date_cbpol_pca_3_27450634,f_financial_date_finance_umap_3_516aa6cd,f_economic_date_cbpol_umap_6_aa0352de,f_economic_date_cbpol_umap_1_7eb7a343,f_weather_date_weather_umap_34_c3ef5b4f,f_weather_date_weather_umap_45_d474bf8d,f_economic_date_cpi_umap_4_970cc061
0,2013-01-01,7,5,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,0,3019.51001,1.065267,-0.323471,6.598458,1.367325,4.815701,5.664261,4.923654,10.153208
1,2013-01-01,4,9,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,0,3019.51001,1.065267,-0.323471,6.598458,1.367325,4.815701,5.664261,4.923654,10.153208
2,2013-01-01,1,33,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,0,3019.51001,1.065267,-0.323471,6.598458,1.367325,4.815701,5.664261,4.923654,10.153208
3,2013-01-01,3,41,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,0,3019.51001,1.065267,-0.323471,6.598458,1.367325,4.815701,5.664261,4.923654,10.153208
4,2013-01-01,5,24,29.676683,0.781831,0.62349,4.540985,5.828106,4.644803,0.98522,...,0,3019.51001,1.065267,-0.323471,6.598458,1.367325,4.815701,5.664261,4.923654,10.153208


In [None]:
model.fit(train_features, train_target)
preds= model.predict(test_features)
eval_metric(test_target.values,preds,"SMAPE")

[18.295517483678655]

In [None]:
model.fit(enriched_train_features, train_target)
enriched_preds= model.predict(enriched_test_features)
eval_metric(test_target.values,enriched_preds,"SMAPE")

[13.724625526675501]