In [19]:
import pandas as pd
import os

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce

pd.options.display.max_columns = 1999
pd.options.display.max_rows = 1999

In [2]:
def fill_with_pred(target, filler):
    for i, val in enumerate(target):
        if val != val:
            target.iloc[i] = filler[i]
    return target

def custom_metric(actuals, forecast, avg_volume):
    """
    This function aims to compute the Custom Accuracy Metric
    for the Novartis Datathon, 3rd edition.

    Given the actuals followed by the forecast and the avg_volume
    of the brand, it will compute the metric score.

    Keyword parameters:
        actuals (float vector): Real value of Y
        forecast (float vector): Volume forecast
        avg_volume (float): Average monthly volume of the 12 months
                            prior to the generic entry.

    Returns:
        custom_metric: Uncertainty Metric score (%)
    """

    # Compute the first part of the equation
    # (custom MAPE with Average volume)
    custom_mape = sum(abs(actuals - forecast)) / (24 * avg_volume)

    # Compute the second part of the equation
    # (custom 6-first-months MAPE with Average volume)
    six_month_mape = \
        abs(sum(actuals[:6]) - sum(forecast[:6])) / (6 * avg_volume)

    # Compute the third part of the equation
    # (custom 6-months MAPE with Average volume)
    twelve_month_mape = \
        abs(sum(actuals[6:12]) - sum(forecast[6:12])) / (6 * avg_volume)

    # Compute the fourth part of the equation
    # (custom 12-months MAPE with Average volume)
    last_month_mape = \
        abs(sum(actuals[12:]) - sum(forecast[12:])) / (12 * avg_volume)

    # Compute the custom metric
    custom_metric = 0.5 * custom_mape + 0.3 * six_month_mape + \
        0.1 * (twelve_month_mape + last_month_mape)

    return custom_metric * 100

In [3]:
generics_count = pd.read_csv('data/gx_num_generics.csv')
package = pd.read_csv('data/gx_package.csv')
gx_volume = pd.read_csv('data/gx_volume.csv')
data_merged = pd.read_csv('data/dt_merged_w.csv')
submission_template = pd.read_csv('data/submission_template.csv', index_col=0)

In [4]:
data_test = data_merged[data_merged['test']]
data_train = data_merged[data_merged['test'] == False]

In [5]:
# fill NA for channels:
data_train[['A', 'B', 'C', 'D']] = data_train[['A', 'B', 'C', 'D']].fillna(0)
data_test[['A', 'B', 'C', 'D']] = data_test[['A', 'B', 'C', 'D']].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [6]:
# define X, X_test, y_train, (y_test)
X = data_train.iloc[:, :9]
y_train = data_train.iloc[:, -24]
X_test = data_test.iloc[:, :9]
y_test = data_test.iloc[:, -24]

# target encoding
encoder = ce.TargetEncoder(cols=['brand'])
encoder.fit(X, y_train)
X = encoder.transform(X)
X_test = encoder.transform(X_test)

# get dummies
X = pd.get_dummies(X)
X_test = pd.get_dummies(X_test)

In [7]:
# force same features in test and train:
col_to_use = X.columns[X.columns.isin(X_test.columns)]
len(col_to_use)

X = pd.concat([X[col_to_use]], axis=1)
X_test = X_test[col_to_use]

print(X_test.shape)
print(y_train.shape)
print(X.shape)

(191, 38)
(887,)
(887, 38)


## iterative prediction:

In [14]:
for i in range(24):
    print(i)
    
    # fit model:
    regressor = DecisionTreeRegressor(random_state=0, criterion='mae')
    regressor.fit(X, y_train)
    
    # predict:
    y_pred_in_sample = regressor.predict(X)
    y_pred_out_of_sample = regressor.predict(X_test) 
    
    # fill nans of y_train with i.s. predictions if need be
    # fill nans of y_test with o.o.s. predictions: (use predictions only when true values not known!!)
    y_train = data_train.iloc[:, -24+i]
    y_train = fill_with_pred(y_train.copy(), y_pred_in_sample)
    y_test = data_test.iloc[:, -24+i]
    y_test = fill_with_pred(y_test.copy(), y_pred_out_of_sample)    

    X = pd.concat([X, y_train], axis=1)
    X_test = pd.concat([X_test, y_test], axis=1)

    # fill missing values y:
    #y_pred_in_sample = y_pred_in_sample.fillna(0)
#    y_test = data_merged.loc[list(y_test.index.values)].iloc[:,-23+i]
    
# kick out NAs for mae computation
#y_pred_temp = y_pred[~y_test.reset_index(drop=True).isnull()]
#y_test_temp = y_test[~y_test.isnull()]

#mae_list.append(mean_absolute_error(y_test_temp, y_pred_temp))
    

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23


In [20]:
#X_test

In [17]:
X_test.to_csv('predictions.csv', index=False)

In [18]:
#plt.plot(X_test.iloc[1, -23:])