In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn import metrics   #Additional scklearn functions
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer, LabelEncoder
from tensorflow import keras
import tensorflow as tf
from datetime import datetime
from datetime import timedelta
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
import seaborn as sns
from category_encoders import TargetEncoder
from category_encoders.sum_coding import SumEncoder
sns.set()
pd.set_option('display.max_rows', 90)
pd.set_option('display.max_columns', 50)

# Data preparation

In [106]:


def discard_not_closed(df):
    return df.loc[df['Stage'].apply(lambda x: x in ['Closed Won', 'Closed Lost'])]

def process_data(df, with_leackage=False):
    
    binary_cols = ['Pricing, Delivery_Terms_Quote_Appr', 'Pricing, Delivery_Terms_Approved', 
               'Bureaucratic_Code_0_Approval','Bureaucratic_Code_0_Approved']

    categorical_cols = ['Region','Territory','Bureaucratic_Code', 
                    'Source ','Billing_Country','Account_Name',
                    'Opportunity_Name', 'Account_Owner',
                   'Opportunity_Owner','Account_Type', 'Opportunity_Type', 
                    'Quote_Type', 'Delivery_Terms', 'Brand', 'Product_Type', 
                    'Size','Product_Category_B', 'Currency', 'Last_Modified_By',
                    'Product_Family', 'Product_Name', 'ASP_Currency',
                    'ASP_(converted)_Currency','Delivery_Quarter',
                    'Total_Taxable_Amount_Currency',
                    'Prod_Category_A', 'Total_Amount_Currency'
                   ]
    float_cols = ['ASP', 'ASP_(converted)', 
                  'TRF', 'Total_Amount', 
                  'Total_Taxable_Amount', 'ID']


    datetime_cols = ['Account_Created_Date', 'Opportunity_Created_Date',
                 'Last_Activity','Quote_Expiry_Date', 'Last_Modified_Date',
                 'Planned_Delivery_Start_Date', 'Planned_Delivery_End_Date',
                 'Actual_Delivery_Date', 
                ]
    other_cols = ['Month', 'Delivery_Year', 'Price']

    target_col = ['Stage']

    ######### debe ser eliminada ya que aumentara el desempeño del predictor ahora pero en
    ######### un caso real nunca se tendra el valor de esta variable ya que parece ser seteado
    ######### una vez que la oportunidad resulta en exito.
    target_leakage = ['Sales_Contract_No']
    # Tipos de variables segun la decr del dataset
    
    # Saco algunas columnas
    # demasiados categorias
    many_cats = ['Opportunity_Name']
    # por gran mayoria de nones:
    nones_may = ['Product_Type', 'Product_Category_B','Currency', 'Size', 'Brand']
    # por similitud a ASP_Currency
    sim_vals = ['Total_Taxable_Amount_Currency'] 
    # posee un solo valor o casi
    one_val = ['Prod_Category_A', 'ASP_(converted)_Currency','Quote_Type', 'Submitted_for_Approval']
    # Id columns
    id_cols = ['ID']
    #datetime nones
    none_date = ['Last_Activity','Actual_Delivery_Date']

    
    def difference_list(l1,l2):
        return [x for x in l1 if x not in l2]

    categorical_cols = difference_list(categorical_cols, nones_may + sim_vals + one_val + many_cats)
    float_cols = difference_list(float_cols, id_cols + target_leakage)
    datetime_cols = difference_list(datetime_cols, none_date)
    
    if with_leackage:
        categorical_cols += target_leakage
    
    df = df.loc[:, categorical_cols + binary_cols + float_cols + datetime_cols]
    # Completo Nones
    df.loc[df['Territory']=='None', 'Territory'] = df.loc[df['Territory']=='None', 'Region'] 

    # Completo NaN
    df['ASP_(converted)'].fillna(0, inplace=True)
    df['ASP'].fillna(0, inplace=True)
    df['Total_Amount'].fillna(0, inplace=True)

    # Convierto los tipos de variables segun la descr. del df
    df.loc[:,binary_cols] = df.loc[:,binary_cols].astype('bool')
    df.loc[:,categorical_cols] = df.loc[:,categorical_cols].astype('object')
    df.loc[:,float_cols] = df.loc[:,float_cols].astype('float64')
    #df[target_leakage] = df[target_leakage].astype('int')
    for col in datetime_cols:
        df.loc[:,col] = pd.to_datetime(df.loc[:,col])
    
    #completo end delivery date con la suma entre Planned_Delivery_End_Date y el promedio de su diferencia con Planned_Delivery_End_Date
    diff_delivery = df['Planned_Delivery_End_Date'] - df['Planned_Delivery_Start_Date']
    start_plus_mean = df.loc[df['Planned_Delivery_End_Date'].isnull(),'Planned_Delivery_Start_Date'] + diff_delivery.mean()
    df.loc[df['Planned_Delivery_End_Date'].isnull(), 'Planned_Delivery_End_Date'] = start_plus_mean
    
    
    #preparo los grupos segun la oportunidad y
    # un grupo con la primer columna de cada grupo
    groups = df.groupby(df.index)
    df_first = df.reset_index().drop_duplicates(subset='Opportunity_ID', keep='first').set_index('Opportunity_ID')
    
    df_count = groups.size().to_frame('nb_products')
    df_count.loc[:, 'only_one_product'] = df_count['nb_products'] == 1 
    
    
    groups_float = groups[df.select_dtypes(include=['float','int']).columns]
    df_float = groups_float.agg(['mean','std','min','max','sum']).fillna(0)
    
    def sin_cos_date(df, col, total):
        df[col + '_sin'] = np.sin(2 * np.pi * df[col] / total)
        df[col + '_cos'] = np.cos(2 * np.pi * df[col] / total)
        df.drop(col, axis=1, inplace=True)
        return df
    
    datetime_cols = df.select_dtypes(include='datetime').columns
    df_datetime = df_first.loc[:, datetime_cols]
    
    #datetime vars engineer
    # la diff entre todo los end y start de delivery es siempre la misma, por lo que me puedo quedar con la diff de cualquiera en la oportunidad
    # con respecto al start me puedo quedar con el maximo y el minimo
    df_datetime.loc[:,'diff_minmax_Planned_Delivery_Start_Date'] = (groups['Planned_Delivery_Start_Date'].max() - groups['Planned_Delivery_Start_Date'].min()).dt.days
    df_datetime.loc[:,'diff_delivery'] = (df_datetime.loc[:,'Planned_Delivery_End_Date'] - df_datetime.loc[:,'Planned_Delivery_Start_Date']).dt.days
    #cuanto tiempo paso desde la oportunidad se creo hasta el comienzo del delivery (me quedo con el mean pq cada producto tiene su propio start_delivery)
    #df_datetime['diff_created_delivery'] = groups.apply(lambda x : (x.Planned_Delivery_Start_Date - x.Opportunity_Created_Date).mean().days)
    df_datetime.loc[:,'diff_account_opportunity'] = (df_datetime.loc[:,'Opportunity_Created_Date'] - df_datetime.loc[:,'Account_Created_Date']).dt.days
    df_datetime.loc[:,'is_account_first'] = df_datetime.loc[:,'Account_Created_Date'] > df_datetime.loc[:,'Opportunity_Created_Date']
    df_datetime.loc[:,'has_quote_exp'] = df_datetime.loc[:,'Account_Created_Date'] > df_datetime.loc[:,'Opportunity_Created_Date']
    
    df_datetime.loc[:, 'has_Quote_Expiry_Date'] = ~ df_datetime.loc[:, 'Quote_Expiry_Date'].isnull()
    df_datetime.drop('Quote_Expiry_Date', axis=1, inplace=True)
    
    # Cantidad de oportunidades creadas los 7 y 30 dias antes respectivamente
    Opportunity_Created_Date = pd.Series(df_datetime.index, index=df_datetime.Opportunity_Created_Date).sort_index()
    count_7d = Opportunity_Created_Date.rolling('7d').count() - 1
    count_7d.index = Opportunity_Created_Date.values
    count_7d = count_7d.reindex(df_datetime.index)
    df_datetime.loc[:,'count_7d_oppotunity'] = count_7d
    
    Opportunity_Created_Date = pd.Series(df_datetime.index, index=df_datetime.Opportunity_Created_Date).sort_index()
    count_30d = Opportunity_Created_Date.rolling('7d').count() - 1
    count_30d.index = Opportunity_Created_Date.values
    count_30d = count_30d.reindex(df_datetime.index)
    df_datetime.loc[:,'count_7d_oppotunity'] = count_30d
    
    for col in df_datetime.select_dtypes(include='datetime').columns:
        df_datetime.loc[:,col + '_weekofyear'] = df_datetime.loc[:,col].dt.weekofyear
        #df_datetime = sin_cos_date(df_datetime, col + '_weekofyear', 52)
        df_datetime.loc[:,col + '_year'] = df_datetime.loc[:,col].dt.year
        df_datetime.drop(col, axis=1, inplace=True)
    
    
    if with_leackage:
        df_target_leackage = (df_first.Sales_Contract_No == 'None').to_frame('Empty_Contract')
    
    df_cat_first = df_first.select_dtypes(include='object')
    df_cat_first.loc[:, 'Empty_Source'] = df_cat_first.loc[:,'Source '].isnull()
    df_cat_first.drop('Source ', axis=1, inplace=True)
    
    X = pd.concat([df_count, df_float, df_datetime, df_cat_first], axis=1)
    
    if with_leackage:
        X = X.join(df_target_leackage)
    #X = pd.DataFrame(index= df_first.index)
    #X = X.join([df_count, df_float, df_datetime, df_cat_first])
    return X

def create_pipe(clf):
    
    numerical_transformer = MinMaxScaler()
    categorical_transformer = TargetEncoder()
    
    num_cols = X.select_dtypes(include=['float', 'int']).columns
    cat_cols = X.select_dtypes(include='object').columns
    
    preprocessor = make_column_transformer(
        (numerical_transformer, num_cols),
        (categorical_transformer, cat_cols),
        remainder='drop'
    )
    
    pipe = make_pipeline(preprocessor, FunctionTransformer(lambda x: x.astype('float64')), clf)
        
    return pipe

In [107]:
df_train = pd.read_csv('CompetenciaECI/Entrenamieto_ECI_2020.csv', index_col = 'Opportunity_ID')
df_val = pd.read_csv('CompetenciaECI/Validacion_ECI_2020.csv', index_col = 'Opportunity_ID')
df_train = discard_not_closed(df_train)
df_y = df_train.pop('Stage')
y = df_y.groupby(df_y.index).first()
y = (y=='Closed Won') * 1 
X = process_data(df_train, with_leackage=True)
X_val = process_data(df_val, with_leackage=True)
#X, X_val = X.align(X_val, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [111]:
pipe = Pipeline(create_pipe(XGBClassifier).steps[:-1])
r = pipe.fit_transform(X,y)

In [109]:
r = pipe.transform(X_val)

In [112]:
np.isnan(r).sum(), (r == None).sum()


(0, 0)

In [100]:
a = np.array([1.0,2,3], dtype=object)
np.array(list(a)).dtype

dtype('float64')

# Clasificadores

## Tunning de XGBoost 

* Seteo de cantidad de iteraciones

In [61]:
from sklearn.base import BaseEstimator
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, XGBClassifier

class XGBoostWithEarlyStop(BaseEstimator):
    def __init__(self, early_stopping_rounds=50, test_size=0.2, 
                 eval_metric='auc', **estimator_params):
        self.early_stopping_rounds = early_stopping_rounds
        self.test_size = test_size
        self.eval_metric = eval_metric
        if self.estimator is not None:
            self.set_params(**estimator_params)

    def set_params(self, **params):
        return self.estimator.set_params(**params)

    def get_params(self, **params):
        return self.estimator.get_params()

    def fit(self, X, y):
        x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=self.test_size)
        self.estimator.fit(x_train, y_train, 
                           early_stopping_rounds=self.early_stopping_rounds, 
                           eval_metric=self.eval_metric, eval_set=[(x_val, y_val)])
        return self

    def predict(self, X):
        return self.estimator.predict(X)
    
    def predict_proba(self, X):
        return self.estimator.predict_proba(X)

class XGBoostClassifierWithEarlyStop(XGBoostWithEarlyStop):
    def __init__(self, *args, **kwargs):
        self.estimator = XGBClassifier(learning_rate =0.1, 
                                       n_estimators=1000,
                                       objective= 'binary:logistic',
                                       seed=4)
        super(XGBoostClassifierWithEarlyStop, self).__init__(*args, **kwargs)

In [62]:
def feat_imp(pipe):
    feat_imp = pd.Series(pipe.named_steps['clf'].get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances', figsize=(13,4))
    plt.ylabel('Feature Importance Score')

In [63]:
def cv_xgb(clf, param_grid):
    for k in list(param_grid.keys()):
        param_grid[f'clf__{k}'] = param_grid.pop(k)
    pipe = create_pipe(clf)
    gsearch = GridSearchCV(estimator = pipe,
                           scoring='neg_log_loss',
                           param_grid=param_grid,
                           n_jobs=4,
                           cv=5)
    
    gsearch.fit(X_train, y_train)
    print(gsearch.cv_results_, gsearch.best_params_, gsearch.best_score_)
    feat_imp(pipe)    


In [64]:
pipe = create_pipe(XGBoostClassifierWithEarlyStop())
pipe.fit(X,y)
# 370

[0]	validation_0-auc:0.98287
Will train until validation_0-auc hasn't improved in 50 rounds.
[1]	validation_0-auc:0.98253
[2]	validation_0-auc:0.98266
[3]	validation_0-auc:0.98343
[4]	validation_0-auc:0.98398
[5]	validation_0-auc:0.98478
[6]	validation_0-auc:0.98576
[7]	validation_0-auc:0.98638
[8]	validation_0-auc:0.98618
[9]	validation_0-auc:0.98671
[10]	validation_0-auc:0.98684
[11]	validation_0-auc:0.98691
[12]	validation_0-auc:0.98689
[13]	validation_0-auc:0.98660
[14]	validation_0-auc:0.98703
[15]	validation_0-auc:0.98620
[16]	validation_0-auc:0.98654
[17]	validation_0-auc:0.98654
[18]	validation_0-auc:0.98671
[19]	validation_0-auc:0.98648
[20]	validation_0-auc:0.98666
[21]	validation_0-auc:0.98654
[22]	validation_0-auc:0.98723
[23]	validation_0-auc:0.98723
[24]	validation_0-auc:0.98774
[25]	validation_0-auc:0.98769
[26]	validation_0-auc:0.98784
[27]	validation_0-auc:0.98803
[28]	validation_0-auc:0.98808
[29]	validation_0-auc:0.98803
[30]	validation_0-auc:0.98803
[31]	validation_

Pipeline(memory=None,
         steps=[('preprocessor',
                 Pipeline(memory=None,
                          steps=[('columntransformer',
                                  ColumnTransformer(n_jobs=None,
                                                    remainder='drop',
                                                    sparse_threshold=0.3,
                                                    transformer_weights=None,
                                                    transformers=[('minmaxscaler',
                                                                   MinMaxScaler(copy=True,
                                                                                feature_range=(0,
                                                                                               1)),
                                                                   Index([                            'nb_products',
                                 ('ASP', 'mean'),
                         

In [56]:
print( log_loss(y, pipe.predict_proba(X)[:,1]))

0.058641516936489044


* Tuneo de Profundidad del arbol y ...

In [None]:
clf = cv_xgb(XGBClassifier(learning_rate =0.1, 
              n_estimators=370, 
              max_depth=5,
              min_child_weight=1, 
              gamma=0, 
              subsample=0.8, 
              colsample_bytree=0.8,
              objective= 'binary:logistic', 
              nthread=4, 
              seed=27),
       { 
           'max_depth':range(3,4,2), 
           'min_child_weight':range(1,2,1)
       }
      )


* Tuneo de gamma

In [None]:
cv_xgb(XGBClassifier(learning_rate =0.1, 
              n_estimators=370, 
              max_depth=4,
              min_child_weight=4, 
              gamma=0, 
              subsample=0.8, 
              colsample_bytree=0.8,
              objective= 'binary:logistic', 
              nthread=4, 
              seed=27),
       { 
           'gamma':[i/10.0 for i in range(0,5)]
       }
      )

* Tuneo de porcion del datset de entrenamiento usado para entrenar cada arbol y porcentaje de columnas

In [None]:
cv_xgb(XGBClassifier(learning_rate =0.1, 
              n_estimators=370, 
              max_depth=4,
              min_child_weight=4, 
              gamma=0, 
              subsample=0.8, 
              colsample_bytree=0.8,
              objective= 'binary:logistic', 
              nthread=4, 
              seed=27),
       {
 'subsample':[i/10.0 for i in range(6,11)],
 'colsample_bytree':[i/10.0 for i in range(6,11)]
}
      )

* Tuneo de alpha para regularizacion L2

In [None]:
cv_xgb(XGBClassifier(learning_rate =0.1, 
              n_estimators=370, 
              max_depth=4,
              min_child_weight=4, 
              gamma=0, 
              subsample=0.7, 
              colsample_bytree=0.8,
              objective= 'binary:logistic', 
              nthread=4, 
              seed=27),
             {'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]}
      )

# Pipeline

In [None]:
X_train

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)
clf = XGBClassifier(learning_rate =0.1, 
              n_estimators=370, 
              max_depth=4,
              min_child_weight=4, 
              gamma=0, 
              subsample=0.7, 
              colsample_bytree=0.8,
              objective= 'binary:logistic', 
              nthread=4, 
              seed=27)
#clf = LogisticRegression(max_iter=1000)
pipe = create_pipe(clf)
pipe.fit(X,y)
#, eval_set= [(X_test, y_test)],early_stopping_rounds= 50)
print( log_loss(y_train, clf.predict_proba(X_train)[:,1]))
( log_loss(y, pipe.predict_proba(X)[:,1]))

# XGB sin tuning

In [None]:
### fit clf no training data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=7)
clf = XGBClassifier(gamma=.1, 
                        n_estimators=200, 
                        subsample=.9,
                        colsample_bytree=.9,
                        eval_metric='logloss')
clf.fit(X_train,y_train)
#print(clf.score(X_train,y_train))
print( log_loss(y_train, clf.predict_proba(X_train)[:,1]))
print( log_loss(y_test, clf.predict_proba(X_test)[:,1]))

In [None]:
xgb.plot_importance(clf, max_num_features=20, importance_type='weight')

In [None]:
scores = -1 * cross_val_score(xgb.XGBClassifier(), X, y,
                              cv=10,
                              scoring='neg_log_loss')
print(f'mean: {scores.mean()} std: {scores.std()}')

# Logistic Classifier

In [None]:
clf = LogisticRegression(max_iter=400)
scores = -1 * cross_val_score(clf, X, y,
                              cv=10,
                              scoring='neg_log_loss')
print(f'mean: {scores.mean()} std: {scores.std()}')

# Prepare results to upload

In [65]:
y_pred = pipe.predict_proba(X_val)[:,1]
results = pd.DataFrame(data = {'Opportunity_ID': X_val.index, 'score': y_pred})   
results.to_csv(f'results/result{datetime.now()}.csv', index=False, header=False)