### Import useful libraries

In [1]:
%run "Imports.ipynb"
%run "Helpers.ipynb"

### Load data

In [2]:
# Load the pickled frame holding per-zone, per-hour supply together with the
# `forecast` and `prophet_forecast` columns.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle("data_with_prophet.pkl")

In [3]:
# Preview the first rows to confirm the expected columns were loaded.
data.head()

Unnamed: 0,zone,timestamp,supply,forecast,date,hour,prophet_forecast
0,0_0_10_10,2019-01-04 14:00:00,1,0.0,2019-01-04,14:00:00,
1,0_0_10_10,2019-01-04 15:00:00,0,0.0,2019-01-04,15:00:00,
2,0_0_10_10,2019-01-04 16:00:00,0,0.0,2019-01-04,16:00:00,
3,0_0_10_10,2019-01-04 17:00:00,0,0.0,2019-01-04,17:00:00,
4,0_0_10_10,2019-01-04 18:00:00,0,0.0,2019-01-04,18:00:00,


In [4]:
# Check dtypes and non-null counts per column; in the recorded output
# `prophet_forecast` is populated for only a subset of the rows.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 951758 entries, 0 to 951757
Data columns (total 7 columns):
zone                951758 non-null object
timestamp           951758 non-null datetime64[ns]
supply              951758 non-null int64
forecast            951758 non-null float64
date                951758 non-null object
hour                951758 non-null object
prophet_forecast    84602 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 50.8+ MB


### Helpers

In [5]:
def divide_into_train_and_test(df, cut_date, timestamp):
    """Split ``df`` into a train part and a test part around ``cut_date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split.
    cut_date : str or datetime-like
        Boundary value compared against ``df[timestamp]``.
    timestamp : str
        Name of the column whose values are compared with ``cut_date``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train, test)`` where ``train`` holds the rows with
        ``df[timestamp] < cut_date`` and ``test`` the rows with
        ``df[timestamp] >= cut_date``. Rows for which neither comparison is
        true (e.g. a missing timestamp) end up in neither part.
    """
    # Return independent copies rather than filtered slices of ``df``: callers
    # add columns to the result in place, which on a slice triggers pandas'
    # SettingWithCopyWarning.
    train = df[df[timestamp] < cut_date].copy()
    test = df[df[timestamp] >= cut_date].copy()
    return train, test

In [6]:
def calculate_zones_mean_supply(train):
    """Return the mean ``supply`` of every zone in ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``zone`` column and a numeric ``supply`` column.

    Returns
    -------
    dict
        Maps each zone label to the mean of its ``supply`` values, with keys
        in order of first appearance. Rows whose ``zone`` is missing are
        ignored.
    """
    # One grouped pass instead of re-filtering the whole frame once per zone.
    # sort=False keeps zones in order of first appearance.
    return train.groupby('zone', sort=False)['supply'].mean().to_dict()

In [7]:
def calculate_zones_max_supply(train):
    """Return the maximum ``supply`` of every zone in ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``zone`` column and a numeric ``supply`` column.

    Returns
    -------
    dict
        Maps each zone label to the largest ``supply`` value seen for it, with
        keys in order of first appearance. Rows whose ``zone`` is missing are
        ignored.
    """
    # One grouped pass instead of re-filtering the whole frame once per zone.
    # sort=False keeps zones in order of first appearance.
    return train.groupby('zone', sort=False)['supply'].max().to_dict()

In [8]:
def calculate_zones_mean_improvement(train):
    """Return the mean ``prophet_improvement`` of every zone in ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``zone`` column and a numeric ``prophet_improvement``
        column. Despite the parameter name, any frame with those columns
        works; this notebook calls the function with the test split.

    Returns
    -------
    dict
        Maps each zone label to the mean of its ``prophet_improvement`` values
        (missing values are skipped; a zone with no valid value maps to NaN).
        Keys are in order of first appearance. Rows whose ``zone`` is missing
        are ignored.
    """
    # One grouped pass instead of re-filtering the whole frame once per zone.
    # sort=False keeps zones in order of first appearance.
    return train.groupby('zone', sort=False)['prophet_improvement'].mean().to_dict()

In [9]:
def calculate_zones_sum_improvement(train):
    """Return the summed ``prophet_improvement`` of every zone in ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame with a ``zone`` column and a numeric ``prophet_improvement``
        column. Despite the parameter name, any frame with those columns
        works; this notebook calls the function with the test split.

    Returns
    -------
    dict
        Maps each zone label to the sum of its ``prophet_improvement`` values
        (missing values are skipped; a zone with no valid value maps to 0).
        Keys are in order of first appearance. Rows whose ``zone`` is missing
        are ignored.
    """
    # One grouped pass instead of re-filtering the whole frame once per zone.
    # sort=False keeps zones in order of first appearance.
    return train.groupby('zone', sort=False)['prophet_improvement'].sum().to_dict()

### Divide into train and test

In [10]:
# Chronological split: rows before the cut date go to train, rows on or after it to test.
train, test = divide_into_train_and_test(
    df=data,
    cut_date='2019-12-31 00:00:00',
    timestamp='timestamp',
)

In [11]:
# `test` is a filtered slice of `data`; writing a column into it in place raises
# SettingWithCopyWarning. Rebinding `test` to the new frame returned by assign()
# avoids that, and the later column additions on `test` then act on a frame of
# its own.
test = test.assign(supply=test['supply'].astype(float))

In [12]:
# Absolute error of the `forecast` column against actual supply, rounded to whole units.
test['diff_naive_forecast'] = test['supply'].sub(test['forecast']).abs().round(0)

In [13]:
# Absolute error of the `prophet_forecast` column against actual supply, rounded to whole units.
test['diff_prophet_forecast'] = test['supply'].sub(test['prophet_forecast']).abs().round(0)

In [14]:
# Positive values mean the Prophet forecast had the smaller absolute error for that row.
test['prophet_improvement'] = test['diff_naive_forecast'].sub(test['diff_prophet_forecast'])

### Calculate zone metrics

In [15]:
# Per-zone mean supply, computed from the training rows only.
mean_supply_dict = calculate_zones_mean_supply(train)

In [16]:
# Attach the zone-level value to every row of the full frame; zones absent
# from the dict get NaN.
data['zone_mean_supply'] = data['zone'].map(mean_supply_dict)

In [17]:
# Per-zone maximum supply, computed from the training rows only.
max_supply_dict = calculate_zones_max_supply(train)

In [18]:
# Attach the zone-level value to every row of the full frame; zones absent
# from the dict get NaN.
data['zone_max_supply'] = data['zone'].map(max_supply_dict)

In [19]:
# Per-zone mean of `prophet_improvement`, computed from the TEST rows (the
# column only exists on `test`).
# NOTE(review): this is derived from the test split; if the resulting column is
# later used as a model feature, it leaks test information - confirm intended use.
mean_improvement_dict = calculate_zones_mean_improvement(test)

In [20]:
# Attach the zone-level value to every row of the full frame, including rows
# before the cut date; zones absent from the dict get NaN.
data['zones_mean_improvement'] = data['zone'].map(mean_improvement_dict)

In [21]:
# Per-zone sum of `prophet_improvement`, computed from the TEST rows (the
# column only exists on `test`).
# NOTE(review): this is derived from the test split; if the resulting column is
# later used as a model feature, it leaks test information - confirm intended use.
sum_improvement_dict = calculate_zones_sum_improvement(test)

In [22]:
# Attach the zone-level value to every row of the full frame, including rows
# before the cut date; zones absent from the dict get NaN.
data['zones_sum_improvement'] = data['zone'].map(sum_improvement_dict)

In [23]:
# Spot-check one random row with the new zone-level columns. No random_state is
# passed, so the row shown changes on every run.
data.sample()

Unnamed: 0,zone,timestamp,supply,forecast,date,hour,prophet_forecast,zone_mean_supply,zone_max_supply,zones_mean_improvement,zones_sum_improvement
321571,3_4_10_10,2019-03-27 06:00:00,7,4.0,2019-03-27,06:00:00,,8.31147,49,0.53588,463.0


In [24]:
# Split again so that both parts carry the zone-level columns just added to `data`.
# This rebinds `train` and `test`: the per-row diff_* / prophet_improvement
# columns that were added to the earlier `test` frame are not part of `data`
# and are therefore absent from the new frames.
train, test = divide_into_train_and_test(
    df=data,
    cut_date='2019-12-31 00:00:00',
    timestamp='timestamp',
)

In [25]:
# Confirm the re-split test frame starts at the cut date and carries the zone-level columns.
test.head()

Unnamed: 0,zone,timestamp,supply,forecast,date,hour,prophet_forecast,zone_mean_supply,zone_max_supply,zones_mean_improvement,zones_sum_improvement
8650,0_0_10_10,2019-12-31 00:00:00,0,0.0,2019-12-31,00:00:00,0.053345,0.026127,2,0.016905,13.0
8651,0_0_10_10,2019-12-31 01:00:00,0,0.0,2019-12-31,01:00:00,0.068624,0.026127,2,0.016905,13.0
8652,0_0_10_10,2019-12-31 02:00:00,0,0.0,2019-12-31,02:00:00,0.073942,0.026127,2,0.016905,13.0
8653,0_0_10_10,2019-12-31 03:00:00,0,0.0,2019-12-31,03:00:00,0.070567,0.026127,2,0.016905,13.0
8654,0_0_10_10,2019-12-31 04:00:00,0,0.0,2019-12-31,04:00:00,0.076647,0.026127,2,0.016905,13.0


In [26]:
# Export is currently disabled; uncomment the line below to write the frame
# with the zone-level columns to disk.
# data.to_pickle("data_with_metrics.pkl")