# BIP's submission function

A function whose goal is to return a dataframe in the exact format required for the predicted dataset to be delivered. 

It starts from a dataframe with predictions **NOT yet summed by month**.

The input dataframe should **at least** contain:

| StoreID |  D_Month | NumberOfSales | _NumberOfSales |
| --------| -------- | --------------| -------------- |  
|    1000 |        1 |   8540.000000 |    9776.962748 |  
|    1000 |        1 |  10364.000000 |    6018.866667 |  
|    1000 |        2 |   4676.000000 |    6234.658815 |  
|    1000 |        3 |   7675.446488 |    6513.012222 |  
|    1000 |        5 |   6267.000000 |    7242.277146 | 

Where: 
 - *_NumberOfSales* are **the predicted values**
 
 
### **NB: No check is done on the months. They can be any arbitrary month.**

Let's start from an arbitrary test set to simulate a submission of the predicted dataset

In [1]:
from import_man import *

# Columns kept for the submission walkthrough below.
submission_features = [
    'StoreID',
    'D_Month',
    'NumberOfSales',
    '_NumberOfSales',
]

# Load the test set that already carries the predictions (`_NumberOfSales`).
df = pd.read_csv(filepath_or_buffer='./dataset/test_m12_53_RFR_on_prep.csv')

# Preview the first rows.
df.head()

Unnamed: 0,StoreID,Date,IsHoliday,HasPromotions,NearestCompetitor,Region,NumberOfSales,Region_AreaKM2,Region_GDP,Region_PopulationK,...,p4,p5,p6,p7,p8,p9,p10,p11,p12,_NumberOfSales
0,1000,01/01/2018,0,0,326,7,8540,9643,17130,2770,...,-0.000163,-0.002012,0.002307,0.001049,-0.001226,0.000372,9.5e-05,0.000836,-0.00017,9908.5
1,1000,02/01/2018,0,0,326,7,10364,9643,17130,2770,...,-0.000491,-0.00207,0.002708,-0.000669,-0.001426,-0.000215,-0.00054,0.000367,-0.000163,6420.166667
2,1000,03/01/2018,0,0,326,7,4676,9643,17130,2770,...,-0.000582,-0.002125,0.003129,-0.000895,-0.001529,-0.000557,3.1e-05,-0.000107,-3.7e-05,4952.240598
3,1000,05/01/2018,0,0,326,7,6267,9643,17130,2770,...,-0.000389,0.002572,-0.000628,-0.000278,-0.001221,0.00054,0.000499,-8.7e-05,-0.000236,6538.379156
4,1000,06/01/2018,0,0,326,7,5953,9643,17130,2770,...,-0.000154,0.000616,0.003896,0.0002,-0.001384,-8.7e-05,-9.1e-05,8.5e-05,0.000273,7872.33931


In [2]:
# Keep only the columns listed in `submission_features` (all rows).
df = df.loc[:, submission_features]
df.head()

Unnamed: 0,StoreID,D_Month,NumberOfSales,_NumberOfSales
0,1000,1,8540,9908.5
1,1000,1,10364,6420.166667
2,1000,1,4676,4952.240598
3,1000,1,6267,6538.379156
4,1000,1,5953,7872.33931


In [3]:
# Collapse the rows to one per (store, month) pair by summing the remaining
# columns, i.e. both the true and the predicted number of sales.
df = (
    df
    .groupby(by=['StoreID', 'D_Month'])
    .sum()
)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,NumberOfSales,_NumberOfSales
StoreID,D_Month,Unnamed: 2_level_1,Unnamed: 3_level_1
1000,1,182917,201451.830901
1000,2,166161,184721.490504
1001,1,95745,83185.502132
1001,2,88423,72102.648063
1002,1,121995,134334.706754


In [4]:
# Turn the (StoreID, D_Month) group keys back into regular columns.
df = df.reset_index()
df.head()

Unnamed: 0,StoreID,D_Month,NumberOfSales,_NumberOfSales
0,1000,1,182917,201451.830901
1,1000,2,166161,184721.490504
2,1001,1,95745,83185.502132
3,1001,2,88423,72102.648063
4,1002,1,121995,134334.706754


In [5]:
# Map our internal column names onto the names required by the BIP submission:
# the true sales become `Target`, the predictions take the `NumberOfSales` name.
df = df.rename(
    index=str,
    columns={
        "D_Month": "Month",
        "NumberOfSales": "Target",
        "_NumberOfSales": "NumberOfSales",
    },
)
df.head()

Unnamed: 0,StoreID,Month,Target,NumberOfSales
0,1000,1,182917,201451.830901
1,1000,2,166161,184721.490504
2,1001,1,95745,83185.502132
3,1001,2,88423,72102.648063
4,1002,1,121995,134334.706754


In [6]:
# Persist the aggregated and renamed dataframe as a mock submission file.
# NOTE(review): `index` is left at its pandas default, so the row labels are
# written as an extra first column - confirm this is wanted for a submission.
df.to_csv('./dataset/submission_fake.csv')

# The `apply_BIP_submission_format` function

In [7]:
def apply_BIP_submission_format(df, real_submit=False):
    """
    Aggregate predictions by store and month and rename the columns to the
    BIP submission format.

    The input must hold predictions **NOT yet summed by month**: the rows are
    grouped by ``(StoreID, D_Month)`` and the value columns are summed. No check
    is done on the month values themselves.

    If real_submit is set to False (default), the function requires (and also
    includes in the returned dataframe) the *true* NumberOfSales.

    Required attributes (real_submit=False):
        ['StoreID', 'D_Month', 'NumberOfSales', '_NumberOfSales']

    Required attributes (real_submit=True):
        ['StoreID', 'D_Month', '_NumberOfSales']

    Returned columns (real_submit=False):
        ['StoreID', 'Month', 'Target', 'NumberOfSales']

    Returned columns (real_submit=True):
        ['StoreID', 'Month', 'NumberOfSales']

    ``Target`` is the summed true ``NumberOfSales``; the returned
    ``NumberOfSales`` is the summed prediction (``_NumberOfSales``).

    :param df: The data frame with the predictions. It is left unmodified.
    :param real_submit: Whether the dataframe returned is for the real submit or not.
    :return: A new dataframe in the submit format, one row per (StoreID, Month).
    :raises KeyError: If one or more required attributes are missing from ``df``.
    """
    key_columns = ['StoreID', 'D_Month']
    value_columns = ['_NumberOfSales']

    # name conversion from our standard to BIP submission standard
    columns_renamings = {
        'D_Month': 'Month',
        '_NumberOfSales': 'NumberOfSales',
    }

    # The true sales are only available (and only wanted) outside a real submit.
    if not real_submit:
        value_columns.insert(0, 'NumberOfSales')
        columns_renamings['NumberOfSales'] = 'Target'

    required_columns = key_columns + value_columns

    # Explicit check, so the caller is told exactly which columns are missing.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            'apply_BIP_submission_format: missing required column(s) {}'.format(
                missing_columns
            )
        )

    # None of the steps below mutates `df`, so no defensive copy of the
    # (possibly very wide) input dataframe is needed.
    summed = (
        df[required_columns]
        .groupby(key_columns)
        .sum()
        .reset_index()
    )

    # `index=str` is kept for backward compatibility: the row labels of the
    # returned dataframe are the strings '0', '1', ... rather than integers.
    return summed.rename(index=str, columns=columns_renamings)

### Function test

#### real_submit=False

In [8]:
# Reload the un-aggregated test set: `df` was overwritten by the steps above,
# and the function expects predictions not yet summed by month.
df = pd.read_csv(filepath_or_buffer='./dataset/test_m12_53_RFR_on_prep.csv')
df.head()

Unnamed: 0,StoreID,Date,IsHoliday,HasPromotions,NearestCompetitor,Region,NumberOfSales,Region_AreaKM2,Region_GDP,Region_PopulationK,...,p4,p5,p6,p7,p8,p9,p10,p11,p12,_NumberOfSales
0,1000,01/01/2018,0,0,326,7,8540,9643,17130,2770,...,-0.000163,-0.002012,0.002307,0.001049,-0.001226,0.000372,9.5e-05,0.000836,-0.00017,9908.5
1,1000,02/01/2018,0,0,326,7,10364,9643,17130,2770,...,-0.000491,-0.00207,0.002708,-0.000669,-0.001426,-0.000215,-0.00054,0.000367,-0.000163,6420.166667
2,1000,03/01/2018,0,0,326,7,4676,9643,17130,2770,...,-0.000582,-0.002125,0.003129,-0.000895,-0.001529,-0.000557,3.1e-05,-0.000107,-3.7e-05,4952.240598
3,1000,05/01/2018,0,0,326,7,6267,9643,17130,2770,...,-0.000389,0.002572,-0.000628,-0.000278,-0.001221,0.00054,0.000499,-8.7e-05,-0.000236,6538.379156
4,1000,06/01/2018,0,0,326,7,5953,9643,17130,2770,...,-0.000154,0.000616,0.003896,0.0002,-0.001384,-8.7e-05,-9.1e-05,8.5e-05,0.000273,7872.33931


In [9]:
# Default mode: the true sales are kept alongside the predictions as `Target`.
df = apply_BIP_submission_format(df, real_submit=False)
df.head()

Unnamed: 0,StoreID,Month,Target,NumberOfSales
0,1000,1,182917,201451.830901
1,1000,2,166161,184721.490504
2,1001,1,95745,83185.502132
3,1001,2,88423,72102.648063
4,1002,1,121995,134334.706754


#### real_submit=True

In [10]:
# Start again from the un-aggregated test set for the second scenario.
df = pd.read_csv(filepath_or_buffer='./dataset/test_m12_53_RFR_on_prep.csv')
df.head()

Unnamed: 0,StoreID,Date,IsHoliday,HasPromotions,NearestCompetitor,Region,NumberOfSales,Region_AreaKM2,Region_GDP,Region_PopulationK,...,p4,p5,p6,p7,p8,p9,p10,p11,p12,_NumberOfSales
0,1000,01/01/2018,0,0,326,7,8540,9643,17130,2770,...,-0.000163,-0.002012,0.002307,0.001049,-0.001226,0.000372,9.5e-05,0.000836,-0.00017,9908.5
1,1000,02/01/2018,0,0,326,7,10364,9643,17130,2770,...,-0.000491,-0.00207,0.002708,-0.000669,-0.001426,-0.000215,-0.00054,0.000367,-0.000163,6420.166667
2,1000,03/01/2018,0,0,326,7,4676,9643,17130,2770,...,-0.000582,-0.002125,0.003129,-0.000895,-0.001529,-0.000557,3.1e-05,-0.000107,-3.7e-05,4952.240598
3,1000,05/01/2018,0,0,326,7,6267,9643,17130,2770,...,-0.000389,0.002572,-0.000628,-0.000278,-0.001221,0.00054,0.000499,-8.7e-05,-0.000236,6538.379156
4,1000,06/01/2018,0,0,326,7,5953,9643,17130,2770,...,-0.000154,0.000616,0.003896,0.0002,-0.001384,-8.7e-05,-9.1e-05,8.5e-05,0.000273,7872.33931


In [11]:
# Real-submit mode: only the summed predictions are returned (no `Target`).
df = apply_BIP_submission_format(df=df, real_submit=True)
df.head()

Unnamed: 0,StoreID,Month,NumberOfSales
0,1000,1,201451.830901
1,1000,2,184721.490504
2,1001,1,83185.502132
3,1001,2,72102.648063
4,1002,1,134334.706754
