In [20]:
'''
%pip install pandas
%pip install scikit-learn
%pip install xgboost
'''

'\n%pip install pandas\n%pip install scikit-learn\n%pip install xgboost\n'

In [21]:
import pandas as pd
import numpy as np


from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor

## Data loading

In [22]:
df_bu_feat = pd.read_csv("data/bu_feat.csv.gz") 
df_train = pd.read_csv("data/train.csv.gz") 
df_test = pd.read_csv("data/test.csv.gz")

### Merging features

In [23]:
df_train_feat = pd.merge(df_train, df_bu_feat, how="left", on = "but_num_business_unit")
df_test_feat = pd.merge(df_test, df_bu_feat, how="left", on = "but_num_business_unit")

### Split train, val set

In [24]:
df_train_feat

Unnamed: 0,day_id,but_num_business_unit,dpt_num_department,turnover,but_postcode,but_latitude,but_longitude,but_region_idr_region,zod_idr_zone_dgr
0,2017-09-30,64,127,580.308443,16400,45.625172,0.111939,70,10
1,2017-09-30,119,127,1512.995918,74100,46.195037,6.254448,51,4
2,2017-09-30,4,88,668.593556,6600,43.600994,7.078160,55,10
3,2017-09-30,425,127,0.000000,59000,50.617921,3.084186,33,3
4,2017-09-30,513,73,0.000000,33610,44.717366,-0.733429,33,3
...,...,...,...,...,...,...,...,...,...
277714,2012-12-29,131,73,1.461821,69760,45.822363,4.767741,8,4
277715,2012-12-29,237,127,759.283046,13500,43.430995,5.047929,71,10
277716,2012-12-29,129,117,1716.399152,14124,49.154936,-0.287441,30,6
277717,2012-12-29,468,127,1307.357057,94320,48.757857,2.385381,75,6


In [25]:
# Train and val set

df_train_feat["day_id"] = pd.to_datetime(df_train_feat["day_id"])
df_train_feat["day_id_week"] = df_train_feat.day_id.dt.isocalendar().week
df_train_feat["day_id_month"] = df_train_feat["day_id"].dt.month
df_train_feat["day_id_year"] = df_train_feat["day_id"].dt.year

df_train = df_train_feat[(df_train_feat.day_id_year < 2017)]
df_val = df_train_feat[(df_train_feat.day_id_year == 2017)]

y_train = df_train.turnover
y_val = df_val.turnover


### Scikit pipeline

In [26]:
from sklearn.base import BaseEstimator, TransformerMixin

In [27]:
class CustomPreprocressing(BaseEstimator, TransformerMixin):
    """
    This class includes all the steps for the preprocessing
    """
    def __init__(self, cat_cols):
        """
        Initialize the class / Can be empty
        """
        self.cat_cols = cat_cols

    def fit(self, X, y=None):
        """
        This method is only created so that the pipeline containing this transformer does not raise an error
        """
        return self

    def transform(self, data):
        """
        Inputs :
          -- data : DataFrame, DataFrame contening all the data needed for the model
        Outputs :
          -- DataFrame, DataFrame prepared for modeling

        """
        data["day_id"] = pd.to_datetime(data["day_id"])
        data["day_id_week"] = data.day_id.dt.isocalendar().week
        data["day_id_month"] = data["day_id"].dt.month
        data["day_id_year"] = data["day_id"].dt.year
        data[self.cat_cols] = data[self.cat_cols].apply(lambda x: x.astype(str))
        return data

In [28]:

num_attrib = ["but_latitude","but_longitude", 'day_id_year']
cat_attrib = [
            "day_id_week",
            "day_id_month",
            "but_region_idr_region",
            "zod_idr_zone_dgr",
            "but_num_business_unit",
            "dpt_num_department",
        ]

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
])
cat_onehot_pipeline = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown="ignore")),
])
preparation_pipeline = ColumnTransformer([
    ("num",num_pipeline, num_attrib),
    ("cat_onehot", cat_onehot_pipeline, cat_attrib)
])

full_pipeline = Pipeline([
    ('preprocessing', CustomPreprocressing(cat_cols=cat_attrib )),
    ('preparation', preparation_pipeline),
    ('model', GradientBoostingRegressor())
])

In [43]:
#model_final = full_pipeline.fit(df_train, y_train)
#y_predict_val = model_final.predict(df_val)

#metric_mae = mean_absolute_error(y_val, y_predict_val)
#print(metric_mae)

In [44]:
#df_val['prediction'] = y_predict_val
#filter_ts = lambda x: (x.but_num_business_unit=="32") & (x.dpt_num_department=='73')
#display(df_val[filter_ts])

#display(df_train[filter_ts])

In [31]:
model_final = full_pipeline.fit(df_train_feat, df_train_feat.turnover.values)

In [33]:
y_pred = model_final.predict(df_test_feat)
df_test_feat['prediction'] = y_pred

##### Model Serialization

In [38]:
import joblib 
# Save the model
joblib.dump(model_final,"../turnover_forecasting_model.pkl")

['../turnover_forecasting_model.pkl']

##### Model Loading

In [39]:
model = joblib.load('../turnover_forecasting_model.pkl')

In [41]:
model

##### Testing Prediction

In [40]:
from pydantic import BaseModel
from datetime import datetime

class DonneesEntree(BaseModel):
    but_num_business_unit: int  # store's ID
    dpt_num_department: int  # department's ID
    but_postcode: int  # postal code
    but_latitude: float  # store's latitude
    but_longitude: float  # store's longitude
    but_region_idr_region: int  # region's number
    zod_idr_zone_dgr: int  # zone's number
    day_id : datetime # date

request={
    "but_num_business_unit":100,
    "dpt_num_department":112,
    "but_postcode":75000,
    "but_latitude":43.34,
    "but_longitude":34.83,
    "but_region_idr_region":23,
    "zod_idr_zone_dgr":10,
    "day_id":2017-11-25
}

donnees = DonneesEntree(**request)
donnees_df = pd.DataFrame([donnees.dict()])  # Conversion en DataFrame
'''
donnees_df["day_id"] = pd.to_datetime(donnees_df["day_id"])
donnees_df["day_id_week"] = donnees_df.day_id.dt.isocalendar().week
donnees_df["day_id_month"] = donnees_df["day_id"].dt.month
donnees_df["day_id_year"] = donnees_df["day_id"].dt.year
'''
int(model.predict(donnees_df)[0])

1334