In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn import metrics   #Additional scklearn functions
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow import keras
import tensorflow as tf
from datetime import datetime
from datetime import timedelta
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import seaborn as sns
from category_encoders import TargetEncoder
sns.set()
pd.set_option('display.max_rows', 90)
pd.set_option('display.max_columns', 50)

# Data preparation

In [97]:
binary_cols = ['Pricing, Delivery_Terms_Quote_Appr', 'Pricing, Delivery_Terms_Approved', 
           'Bureaucratic_Code_0_Approval','Bureaucratic_Code_0_Approved']

categorical_cols = ['Region','Territory','Bureaucratic_Code', 
                'Source ','Billing_Country','Account_Name',
                'Opportunity_Name', 'Account_Owner',
               'Opportunity_Owner','Account_Type', 'Opportunity_Type', 
                'Quote_Type', 'Delivery_Terms', 'Brand', 'Product_Type', 
                'Size','Product_Category_B', 'Currency', 'Last_Modified_By',
                'Product_Family', 'Product_Name', 'ASP_Currency',
                'ASP_(converted)_Currency','Delivery_Quarter',
                'Total_Taxable_Amount_Currency',
                'Prod_Category_A', 'Total_Amount_Currency'
               ]
float_cols = ['ASP', 'ASP_(converted)', 
              'TRF', 'Total_Amount', 
              'Total_Taxable_Amount', 'ID']


datetime_cols = ['Account_Created_Date', 'Opportunity_Created_Date',
             'Last_Activity','Quote_Expiry_Date', 'Last_Modified_Date',
             'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date',
             'Actual_Delivery_Date', 
            ]
other_cols = ['Month', 'Delivery_Year', 'Price']

target_col = ['Stage']
    
######### debe ser eliminada ya que aumentara el desempeño del predictor ahora pero en
######### un caso real nunca se tendra el valor de esta variable ya que parece ser seteado
######### una vez que la oportunidad resulta en exito.
target_leakage = ['Sales_Contract_No']

def discard_not_closed(df):
    return df.loc[df['Stage'].apply(lambda x: x in ['Closed Won', 'Closed Lost'])]

def process_data(df):

    # Tipos de variables segun la decr del dataset
    
    # Saco algunas columnas
    # demasiados categorias
    many_cats = ['Opportunity_Name']
    # por gran mayoria de nones:
    nones_may = ['Product_Type', 'Product_Category_B','Currency', 'Size', 'Brand']
    # por similitud a ASP_Currency
    sim_vals = ['Total_Taxable_Amount_Currency'] 
    # posee un solo valor o casi
    one_val = ['Prod_Category_A', 'ASP_(converted)_Currency','Quote_Type', 'Submitted_for_Approval']
    # Id columns
    id_cols = ['ID']
    #datetime nones
    none_date = ['Last_Activity','Actual_Delivery_Date']

    
    
    global categorical_cols, float_cols, datetime_cols
    categorical_cols = list(set(categorical_cols) - set(nones_may + sim_vals + one_val  + many_cats))
    float_cols = list(set(float_cols) - set(id_cols + target_leakage))
    datetime_cols = list(set(datetime_cols) - set(none_date))

    df = df.loc[:, categorical_cols + binary_cols + float_cols + datetime_cols]
    
    # Completo Nones
    df.loc[df['Territory']=='None', 'Territory'] = df.loc[df['Territory']=='None', 'Region'] 

    # Completo NaN
    df['ASP_(converted)'].fillna(0, inplace=True)
    df['ASP'].fillna(0, inplace=True)
    df['Total_Amount'].fillna(0, inplace=True)

    # Convierto los tipos de variables segun la descr. del df
    df[binary_cols] = df[binary_cols].astype('bool')
    df[categorical_cols] = df[categorical_cols].astype('category')
    df[float_cols] = df[float_cols].astype('float64')
    #df[target_leakage] = df[target_leakage].astype('int')
    for col in datetime_cols:
        df[col] = pd.to_datetime(df[col])
    
    #completo end delivery date con la suma entre Planned_Delivery_End_Date y el promedio de su diferencia con Planned_Delivery_End_Date
    diff_delivery = df['Planned_Delivery_End_Date'] - df['Planned_Delivery_Start_Date']
    start_plus_mean = df.loc[df['Planned_Delivery_End_Date'].isnull(),'Planned_Delivery_Start_Date'] + diff_delivery.mean()
    df.loc[df['Planned_Delivery_End_Date'].isnull(), 'Planned_Delivery_End_Date'] = start_plus_mean
    
    
    #preparo los grupos segun la oportunidad y
    # un grupo con la primer columna de cada grupo
    groups = df.groupby(df.index)
    df_first = df.reset_index().drop_duplicates(subset='Opportunity_ID', keep='first').set_index('Opportunity_ID')
    df_count = groups.size().to_frame('nb_products')
    df_count['only_one_product'] = df_count['nb_products'] == 1 
    
    
    groups_float = groups[df.select_dtypes(include=['float','int']).columns]
    df_float = groups_float.agg(['mean','std','min','max','sum']).fillna(0)
    
    def sin_cos_date(df, col, total):
        df[col + '_sin'] = np.sin(2 * np.pi * df[col] / total)
        df[col + '_cos'] = np.cos(2 * np.pi * df[col] / total)
        df.drop(col, axis=1, inplace=True)
        return df
    
    datetime_cols = df.select_dtypes(include='datetime').columns
    df_datetime = df_first[datetime_cols]

    #datetime vars engineer
    # la diff entre todo los end y start de delivery es siempre la misma, por lo que me puedo quedar con la diff de cualquiera en la oportunidad
    # con respecto al start me puedo quedar con el maximo y el minimo
    df_datetime.loc[:,'diff_minmax_Planned_Delivery_Start_Date'] = (groups['Planned_Delivery_Start_Date'].max() - groups['Planned_Delivery_Start_Date'].min()).dt.days
    df_datetime.loc[:,'diff_delivery'] = (df_datetime.loc[:,'Planned_Delivery_End_Date'] - df_datetime.loc[:,'Planned_Delivery_Start_Date']).dt.days
    #cuanto tiempo paso desde la oportunidad se creo hasta el comienzo del delivery (me quedo con el mean pq cada producto tiene su propio start_delivery)
    #df_datetime['diff_created_delivery'] = groups.apply(lambda x : (x.Planned_Delivery_Start_Date - x.Opportunity_Created_Date).mean().days)
    df_datetime.loc[:,'diff_account_opportunity'] = (df_datetime.loc[:,'Opportunity_Created_Date'] - df_datetime.loc[:,'Account_Created_Date']).dt.days
    df_datetime.loc[:,'is_account_first'] = df_datetime.loc[:,'Account_Created_Date'] > df_datetime.loc[:,'Opportunity_Created_Date']
    df_datetime.loc[:,'has_quote_exp'] = df_datetime.loc[:,'Account_Created_Date'] > df_datetime.loc[:,'Opportunity_Created_Date']
    
    df_datetime.loc[:, 'has_Quote_Expiry_Date'] = ~ df_datetime.loc[:, 'Quote_Expiry_Date'].isnull()
    df_datetime.drop('Quote_Expiry_Date', axis=1, inplace=True)
    
    for col in df_datetime.select_dtypes(include='datetime').columns:
        df_datetime.loc[:,col + '_weekofyear'] = df_datetime[col].dt.weekofyear
        #df_datetime = sin_cos_date(df_datetime, col + '_weekofyear', 52)
        df_datetime.loc[:,col + '_year'] = df_datetime[col].dt.year
        df_datetime.drop(col, axis=1, inplace=True)
    
    
    #if with_target_leakage:
    #    df_target_leackage = (df_first.Sales_Contract_No == 'None').to_frame('Empty_Contract')
    
    df_cat_news = df_first['Source '].isnull().to_frame('Empty_Source')
    df_cat_first = df_first.select_dtypes(include='category')
    df_cat_first.pop('Source ')
    
    X = pd.DataFrame(index= df_first.index)
    X = X.join([df_count, df_float, df_datetime, df_cat_news, df_cat_first])
    #[df_count, df_float, df_datetime, df_cat_news, df_cat_first]
    return X

def create_pipe(clf):
    
    numerical_transformer = MinMaxScaler()
    categorical_transformer = TargetEncoder()

    numerical_cols = X.select_dtypes(include='float').columns
    categorical_cols = X.select_dtypes(include='category').columns

    preprocessor_cat = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_cols)
        ],
        remainder='passthrough'
    )
    
    preprocessor_num = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, X.columns),
        ],
    )

    clf = XGBClassifier(learning_rate =0.1, 
                        n_estimators=1000, 
                        max_depth=3,
                        min_child_weight=1, 
                        gamma=0, 
                        subsample=1, 
                        colsample_bytree=0.6,
                        objective= 'binary:logistic', 
                        nthread=4,
                        reg_alpha=.01,
                        seed=27)

    pipe = Pipeline(steps=[('preprocessor_cat', preprocessor_cat),
                           ('preprocessor_num', preprocessor_num),
                           ('model', clf)])
    return pipe

In [99]:
df_train = pd.read_csv('CompetenciaECI/Entrenamieto_ECI_2020.csv', index_col = 'Opportunity_ID')
df_val = pd.read_csv('CompetenciaECI/Validacion_ECI_2020.csv', index_col = 'Opportunity_ID')
df_train = discard_not_closed(df_train)
df_y = df_train.pop('Stage')
y = df_y.groupby(df_y.index).first()
y = (y=='Closed Won') * 1 
X = process_data(df_train)
X_val = process_data(df_val)
X, X_val = X.align(X_val, axis=1)

In [94]:
X = pd.DataFrame(preprocessor.fit_transform(X,y), columns=X.columns, index=X.index)
X_val = pd.DataFrame(pxreprocessor.transform(X_val), columns=X.columns, index=X_val.index)

# Clasificadores

## Tunning de XGBoost 

* Seteo de cantidad de iteraciones

In [101]:
def feat_imp(pipe):
    feat_imp = pd.Series(pipe.named_steps['model'].get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances', figsize=(13,4))
    plt.ylabel('Feature Importance Score')

In [103]:
#Choose all predictors except target & IDcols
target = 'Stage'
train = pd.concat([X,y.to_frame(target)], axis=1)
predictors = [x for x in train.columns if x not in [target]]
xgb1 = XGBClassifier(
    learning_rate =0.1,
    n_estimators=1000,
    max_depth=5,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective= 'binary:logistic',
    seed=27)
pipe = create_pipe(xgb1)


In [None]:
def cv_xgb(clf):
    

* Tuneo de Profundidad del arbol y ...

In [102]:
cv_xgb(XGBClassifier(learning_rate =0.1, 
              n_estimators=140, 
              max_depth=5,
              min_child_weight=1, 
              gamma=0, 
              subsample=0.8, 
              colsample_bytree=0.8,
              objective= 'binary:logistic', 
              nthread=4, 
              seed=27),
       param_test1 = { 
           'max_depth':range(3,10,1), 
           'min_child_weight':range(1,6,1)
       }
      )

NameError: name 'cv_xgb' is not defined

* Tuneo de gamma

In [None]:
param_test2 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
cv_xgb(XGBClassifier(learning_rate =0.1, 
              n_estimators=140, 
              max_depth=5,
              min_child_weight=1, 
              gamma=0, 
              subsample=0.8, 
              colsample_bytree=0.8,
              objective= 'binary:logistic', 
              nthread=4, 
              seed=27),
       param_test1 = { 
           'max_depth':range(3,10,1), 
           'min_child_weight':range(1,6,1)
       }
      )

* Tuneo de porcion del datset de entrenamiento usado para entrenar cada arbol y porcentaje de columnas

In [None]:
param_test3 = {
 'subsample':[i/10.0 for i in range(6,11)],
 'colsample_bytree':[i/10.0 for i in range(6,11)]
}
cv_xgb(XGBClassifier(learning_rate =0.1, 
              n_estimators=140, 
              max_depth=5,
              min_child_weight=1, 
              gamma=0, 
              subsample=0.8, 
              colsample_bytree=0.8,
              objective= 'binary:logistic', 
              nthread=4, 
              seed=27),
       param_test1 = { 
           'max_depth':range(3,10,1), 
           'min_child_weight':range(1,6,1)
       }
      )

* Tuneo de alpha para regularizacion L2

In [None]:
param_test4 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
cv_xgb(XGBClassifier(learning_rate =0.1, 
              n_estimators=140, 
              max_depth=5,
              min_child_weight=1, 
              gamma=0, 
              subsample=0.8, 
              colsample_bytree=0.8,
              objective= 'binary:logistic', 
              nthread=4, 
              seed=27),
       param_test1 = { 
           'max_depth':range(3,10,1), 
           'min_child_weight':range(1,6,1)
       }
      )

# Pipeline

In [None]:
pipe.fit(X_train,y_train)
print( log_loss(y_test, pipe.predict_proba(X_test)[:,1]))

In [None]:
X_test.info()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
clf = XGBClassifier(learning_rate =0.1, 
                    n_estimators=1000, 
                    max_depth=3,
                    min_child_weight=1, 
                    gamma=0, 
                    subsample=1, 
                    colsample_bytree=0.6,
                    objective= 'binary:logistic', 
                    nthread=4,
                    reg_alpha=.01,
                    seed=27)
clf.fit(X_train,y_train, 
        eval_set=[(X_test,y_test)], 
        eval_metric='logloss', 
        early_stopping_rounds=20,
       verbose=False)
print( log_loss(y, clf.predict_proba(X)[:,1]))

# XGB sin tuning

In [None]:
### fit model no training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=7)
clf = xgb.XGBClassifier(gamma=3.5, 
                        n_estimators=1000, 
                        silent=1, 
                        eta=.1,
                        subsample=.8,
                        colsample_bytree=.8,
                        eval_metric='logloss')
clf.fit(X_train,y_train)
#print(clf.score(X_train,y_train))
print( log_loss(y_train, clf.predict_proba(X_train)[:,1]))
print( log_loss(y_test, clf.predict_proba(X_test)[:,1]))

In [None]:
xgb.plot_importance(clf, max_num_features=20, importance_type='weight')

In [None]:
scores = -1 * cross_val_score(xgb.XGBClassifier(), X, y,
                              cv=10,
                              scoring='neg_log_loss')
print(f'mean: {scores.mean()} std: {scores.std()}')

# Logistic Classifier

In [None]:
clf = LogisticRegression(max_iter=400)
scores = -1 * cross_val_score(clf, X, y,
                              cv=10,
                              scoring='neg_log_loss')
print(f'mean: {scores.mean()} std: {scores.std()}')

# Prepare results to upload

In [None]:
clf.fit(X,y)
y_pred = clf.predict_proba(X_val)[:,1]
results = pd.DataFrame(data = {'Opportunity_ID': X_val.index, 'score': y_pred})   
results.to_csv(f'results/result{datetime.now()}.csv', index=False, header=False)