## FINAL MODEL 

Polaczenie LightBgm z Catboost za pomoca soft VoteClassifiera

In [37]:
from ColumnTransformers import * 
from AdvModels import *
from sklearn.metrics import recall_score, f1_score,precision_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns 


In [2]:

class CatBoostWithCatFeatures(CatBoostClassifier):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.cat_features_indices = None
    
    def fit(self, X, y=None, **fit_params):
        X_mod=self._preprocess(X)
        X_mod[:, self.cat_features_indices] = X_mod[:, self.cat_features_indices].astype(str)
        
        super().fit(X_mod, y, cat_features=self.cat_features_indices, **fit_params)
        return self

    def _preprocess(self, X):
        if self.cat_features_indices is None:
            self.cat_features_indices = GetCategorical(X)
        X_mod = X.copy()
        X_mod[:, self.cat_features_indices] = X_mod[:, self.cat_features_indices].astype(str)
        return X_mod

    def predict(self, X, **kwargs):
        X_mod = self._preprocess(X)
        return super().predict(X_mod, **kwargs)

    def predict_proba(self, X, **kwargs):
        X_mod = self._preprocess(X)
        return super().predict_proba(X_mod, **kwargs)
  
def CatBoostPipeline(X):
    catboost_pipeline = Pipeline([
    ("preprocessor", CatBoostTransformer(
        Numerical=['Transaction.Amount', 'Customer.Age','Account.Age.Days','Quantity']
        )),
    ("model", CatBoostWithCatFeatures(
        grow_policy="SymmetricTree",
        rsm=0.8,
        depth=9,
        auto_class_weights="Balanced",
        learning_rate=0.01,
        l2_leaf_reg=2,
        iterations=1500,
        border_count=256,
        verbose=0 ))])
    return catboost_pipeline

def LGBMClassifierPipeline(): 
    model=LGBMClassifier(
        colsample_bytree=0.8,
        is_unbalance=True,
        learning_rate=0.01,
        max_depth=2,
        min_split_gain=0.1,
        n_estimators=600,
        reg_lambda=1,
        subsample=1,
        verbosity=-1
    )
    result=PipelineModel(model,n=18)
    return result

def create_voting_classifier(X):
    
    catboost_pipeline = CatBoostPipeline(X)
    lgbm_pipeline = LGBMClassifierPipeline() 
    print(type(lgbm_pipeline))
    voting_ensemble = VotingClassifier(
        estimators=[
            ('catboost', catboost_pipeline),
            ('lgbm',     lgbm_pipeline)
        ],
        voting='soft'
    )
    return voting_ensemble

In [48]:
def lgbmImportanceGetter(pipeline): 
    
    feature_names = pipeline.named_steps["preprocessor"].get_feature_names_out()

    selector = pipeline.named_steps["featureselection"]
    support_mask = selector.support_
    selected_features = feature_names[support_mask]

    model = pipeline.named_steps["model"]
    importances = model.feature_importances_  
    feature_importance_df = pd.DataFrame({
    "Model":"LGBM",
    "Feature": selected_features,
    "Importance": importances
    }).sort_values(by="Importance", ascending=False)
    return feature_importance_df

def catImportanceGetter(pipeline): 
    model = pipeline.named_steps["model"]
    importances = model.feature_importances_
    preprocessor = pipeline.named_steps["preprocessor"]
    feature_names = preprocessor.get_feature_names_out()
    feature_importance_df = pd.DataFrame({
    "Model":"CatBoost",
    "Feature": feature_names,
    "Importance": importances
    }).sort_values(by="Importance", ascending=False)
    return  feature_importance_df
    
def FinalImportance(X_train,y_train): 
    Lgbm=LGBMClassifierPipeline() 
    Cat=CatBoostPipeline(X_train) 
    Lgbm.fit(X_train,y_train)
    Cat.fit(X_train,y_train) 
    importanceslgbm=lgbmImportanceGetter(Lgbm) 
    #print(importanceslgbm) 
    importancesCat=catImportanceGetter(Cat)
    #print(importancesCat)
    return importancesCat,importanceslgbm

def ImportanceDataFrame(importancesCat,importanceslgbm): 
    df=pd.concat([importanceslgbm,importancesCat],axis=0).reset_index(drop=True)
    df['dense_rank'] = df.groupby('Model')['Importance'] \
                     .rank(method='first', ascending=False)
    pivot = df.pivot(index='dense_rank', columns='Model', values='Feature')
    return pivot


def MiniPlotFunction(y_scores,y_test):
    fpr, tpr, thresholds = roc_curve(y_test, y_scores)
    roc_auc = auc(fpr, tpr)
    aucPlot(fpr, tpr,roc_auc)
    RecallTresholdPlot(y_scores,y_test)
    
def TotalImportance(): 
    LgbmImportance,CatImportance=FinalImportance(X_train,y_train)
    ImportanceDataFrame=ImportanceDataFrame(LgbmImportance,CatImportance) 
    ImportanceDataFrame
    return ImportanceDataFrame,LgbmImportance,CatImportance


def RecallTresholdPlot(y_predict_proba,y_final): 
    tresholds=np.arange(0,1,0.01)
    RecallVector=np.zeros(100)
    F1Vector=np.zeros(100)
    PrecisionVector=np.zeros(100)
    AccuracyVector=np.zeros(100)
    for i in range(len(tresholds)): 
        y_pred=np.array(y_predict_proba>tresholds[i])
        RecallVector[i]=recall_score(y_final,y_pred)
        F1Vector[i]=f1_score(y_final,y_pred)
        PrecisionVector[i]=precision_score(y_final,y_pred)
        AccuracyVector[i]=accuracy_score(y_final,y_pred)
   
    
    fig, axs = plt.subplots(2, 2,figsize=(14, 8))
    axs[0,0].plot(tresholds,RecallVector) 
    axs[0,0].set_title("RECALL") 
    axs[0,0].grid() 
    
    axs[1,0].plot(tresholds,F1Vector) 
    axs[1,0].set_title("F1") 
    axs[1,0].grid() 
    
    axs[0,1].plot(tresholds,PrecisionVector) 
    axs[0,1].set_title("PRECISION")
    axs[0,1].grid() 
    
    axs[1,1].plot(tresholds,AccuracyVector) 
    axs[1,1].set_title("ACCURACY")
    axs[1,1].grid() 
    

    plt.show()
    
    

In [19]:
def FinalModel(X_train,X_test,y_train,y_test,threshold=0.5): 
    finalModel=create_voting_classifier(X_train)
    set_config(display='diagram')
    display(finalModel)
    finalModel.fit(X_train,y_train)
    y_scores=finalModel.predict_proba(X_test)[:, 1]  
    return y_scores,finalModel

In [43]:
X_train,y_train=getTestData()
X_test,y_test=getTrainingData()
y_scores,FinalModel=FinalModel(X_train,X_test,y_train,y_test)


<class 'sklearn.pipeline.Pipeline'>


KeyboardInterrupt: 

## OFFICIAL TEST

In [5]:
X_train,y_train=getTestData()
X_test,y_test=getTrainingData()
X_TRAIN = pd.concat([X_train, X_test]).reset_index(drop=True)
Y_TRAIN = pd.concat([y_train, y_test]).reset_index(drop=True)

In [None]:
X_TEST,Y_TEST=getValidationData()
FinalModel(X_TRAIN,X_TEST,Y_TRAIN,Y_TEST,threshold=0.5)