In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split, learning_curve
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, ElasticNet

# Load Data

In [2]:
data_path = 'C:/Users/ticom/VLabs/Data'
df = pd.read_csv(data_path+'/sales_20_21_train.csv', header=0, parse_dates=['DT_VENDA'])
# df = pd.read_csv('https://raw.githubusercontent.com/marcos-mansur/vlabs-challenge/main/Data/sales_20_21_train.csv', header=0, parse_dates=['DT_VENDA'])
df_sub = pd.read_csv(data_path+'/sample_submission.csv',header=0)

# Functions

In [23]:
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate plot: learning curve

    """
    if axes is None:
        _, axes = plt.subplots(1, figsize=(20, 10))

    axes.set_title(title)
    if ylim is not None:
        axes.set_ylim(*ylim)
    axes.set_xlabel("Training examples")
    axes.set_ylabel("Score")

    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes, scoring='neg_root_mean_squared_error',
                       return_times=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    axes.grid()
    # intervalos de spread 
    axes.fill_between(train_sizes, train_scores_mean - train_scores_std,
                         train_scores_mean + train_scores_std, alpha=0.1,
                         color="r")
    axes.fill_between(train_sizes, test_scores_mean - test_scores_std,
                         test_scores_mean + test_scores_std, alpha=0.1,
                         color="g")
    # curvas
    axes.plot(train_sizes, train_scores_mean, 'o-', color="r",
                 label="Training score")
    axes.plot(train_sizes, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    axes.legend(loc="best")
    return plt

In [3]:
def split_per_dates(df,train_start,target_start, sub_start):
    # target do treinamento
    df_target = df[df.DT_VENDA > target_start]
    y_train = df_target.groupby('ID_CLIENTE')['VALOR'].sum()
    # dados de treinamento
    df_train = df[(df['DT_VENDA']>train_start) & (df['DT_VENDA']<target_start)].copy()
    # dados para previsão
    df_test = df[df['DT_VENDA']>sub_start]
    return df_train, y_train, df_test

# Preprocessing

A função abaixo recebe o df cru e retorna o df final do modelo. Feature engineer e outras mudanças devem ser feitas preferencialmente aqui porque ao fim dessa transformação perdemos os dados abertor por compra.

In [4]:
def treated_data(df_pp):
    max_date = df_pp['DT_VENDA'].max()
    
    #agrupar por cliente somando o valor
    cust_revenue = df_pp.groupby(['ID_CLIENTE'])['VALOR'].sum().copy()
    # transformar em df
    cust_rev = pd.DataFrame(cust_revenue)
    # feature com número de compras feitas no período de teste
    cust_rev['Frequency'] = df_pp.groupby(['ID_CLIENTE'])['ID_VENDA'].count()
    
    # std entre os valores das compras de cada cliente
    #cust_rev['std entre compras'] = df_pp.groupby('ID_CLIENTE')['VALOR'].std()
    # diferença entre a data da primeira e ultima compra
    #cust_rev['periodo_compras'] = df_pp.groupby(['ID_CLIENTE'])['DT_VENDA'].max() - df_pp.groupby(['ID_CLIENTE'])['DT_VENDA'].min()
    #cust_rev['periodo_compras'] = cust_rev['periodo_compras'].apply(lambda x: x.days)

    # meadia entre os valores das compras de cada cliente
    #cust_rev['valor medio'] = df_pp.groupby('ID_CLIENTE')['VALOR'].mean()
    
    
    # data da ultima compra
    cust_rev['Recency'] = max_date - df_pp.groupby(['ID_CLIENTE'])['DT_VENDA'].max()
    cust_rev['Recency'] = cust_rev['Recency'].apply(lambda x: x.days)
    # soma do VALOR dos ultimos 3 meses de cada cliente
    cust_rev['valor_3m'] = df_pp[
            df_pp['DT_VENDA']>(df_pp['DT_VENDA'].max() - delta90)].groupby(['ID_CLIENTE'])['VALOR'].sum()
    
    return cust_rev.fillna(0)

In [6]:
# intervalo de 90 dias
delta90 = pd.to_timedelta(90,unit='d')
# data mais recente do dataset
max_date = df['DT_VENDA'].max()
# data de início do target
target_start = max_date - delta90
# data de início dos dados de treino
train_start = target_start - 2*delta90
# data de início dos dados de previsão (submissão)
sub_start = max_date - 2*delta90

In [7]:
df_train, target_train, df_test = split_per_dates(df,train_start,target_start,sub_start)

In [8]:
df2_train = treated_data(df_train)
df2_test = treated_data(df_test)

In [18]:
# juntando o dataset com o target para alinhar o index
df3_train = df2_train.join(other=target_train, on='ID_CLIENTE', lsuffix='_sum', rsuffix='_TARGET')
x_train_total = df3_train.drop('VALOR_TARGET',axis=1)
y_train_total = df3_train.VALOR_TARGET.fillna(0)

In [19]:
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train_total, 
                                                                            y_train_total, 
                                                                            random_state=0,
                                                                            test_size=0.3)

# Model

In [None]:
en = ElasticNet(random_state=0)
en.fit(x_train_split, y_train_split)
en_pred = en.predict(x_test_split)

In [None]:
score = mean_squared_error(y_test_split,en_pred, squared=False)
score

In [None]:
title = "Learning Curves - ElasticNet"
plot_learning_curve(en, title, x_train_total, y_train_total, cv=5, n_jobs=-1)

# Submission

In [14]:
en = ElasticNet(random_state=0)
en.fit(x_train_total, y_train_total)
en_pred = en.predict(df2_test)

en_pred_df = pd.DataFrame(en_pred, index=df2_test.index).reset_index()
en_pred_df.columns = ['ID_CLIENTE', 'VALOR']
sub5 = pd.DataFrame(df_sub['ID_CLIENTE']).set_index('ID_CLIENTE').join(en_pred_df.set_index('ID_CLIENTE'),
                                           on='ID_CLIENTE')
#sub5.reset_index().fillna(0).to_csv('C:/Users/ticom/VLabs/Submissions/sub5.csv', index=False)

In [20]:
sub5

Unnamed: 0_level_0,VALOR
ID_CLIENTE,Unnamed: 1_level_1
4,471.272490
9,77.601815
12,278.050066
15,157.540902
19,55.723262
...,...
384409,67.292199
384411,159.056544
384415,42.367053
384418,163.083889
