In [1]:
import logging
from typing import Dict, List

import numpy as np
import pandas as pd
import yaml
from scipy.stats.mstats import normaltest
from scipy.stats import boxcox
from sklearn.feature_selection import f_classif
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, FunctionTransformer

In [2]:
def remove_unnecessary_columns(
    df: pd.DataFrame,
    delete_columns: Dict
) -> pd.DataFrame:
    """
    Removes unnecessary columns from a DataFrame based on the
    specified criteria.

    The frame is modified in place and also returned.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - delete_columns (Dict): A dictionary containing information
      about columns to delete.
        - 'selected_columns' (List[str]): List of column names to be deleted.
        - 'threshold' (float): Threshold for missing data. Columns with
          missing data exceeding this threshold will be deleted.

    Returns:
    - pd.DataFrame: The DataFrame with unnecessary columns removed.

    Raises:
    - KeyError: If a name in 'selected_columns' is not a column of df.
    """
    logger = logging.getLogger(__name__)

    df.drop(columns=delete_columns['selected_columns'], inplace=True)

    # Minimum number of non-null values a column needs in order to be kept.
    missing_th = int((1 - delete_columns['threshold']) * len(df)) + 1

    missing_data_cols = [
        col for col in df.columns.tolist() if df[col].count() < missing_th
    ]
    if missing_data_cols:
        df.drop(columns=missing_data_cols, inplace=True)
        # Lazy %-style formatting: an extra argument without a placeholder
        # makes the logging module fail while formatting the record.
        logger.info("Incomplete deleted columns: %s", missing_data_cols)
    else:
        logger.info("There are not deleted columns")

    return df


def remove_incomplete_rows(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Drops every row that has a missing value in any column.

    The frame is modified in place and also returned. The names of the
    columns that contained missing values are printed.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.

    Returns:
    - pd.DataFrame: The same DataFrame without incomplete rows.
    """
    columns_with_nulls = [
        name for name in df.columns.tolist() if df[name].isnull().any()
    ]
    for name in columns_with_nulls:
        # Recomputed per column: earlier drops may already have removed
        # some of the rows that were null in this column.
        rows_to_drop = df.index[df[name].isnull()].tolist()
        df.drop(rows_to_drop, axis=0, inplace=True)
    print("Incomplete deleted rows: ", columns_with_nulls)
    return df



def treat_skewed_columns(
    df: pd.DataFrame,
    skewed_columns: Dict
) -> pd.DataFrame:
    """
    Treats skewed numerical columns in a DataFrame using specified
    transformation methods.

    Only float64 and int64 columns are considered. The frame is modified
    in place and also returned.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - skewed_columns (Dict): A dictionary containing information about columns
      and transformation methods.
        - 'exclude_columns' (List[str]): List of column names to be excluded
          from skewness treatment. Names that are not numerical columns of
          df are ignored.
        - 'method' (str): Transformation method. Options: "boxcox" or "log".
          Any other value leaves the data untouched.

    Returns:
    - pd.DataFrame: The DataFrame with treated skewed columns.
    """
    float_cols = df.columns[df.dtypes == np.float64].tolist()
    int_cols = df.columns[df.dtypes == np.int64].tolist()

    # Filter instead of list.remove(): an excluded name that is not a
    # numerical column (or not a column at all) must not raise.
    excluded = set(skewed_columns['exclude_columns'])
    numerical_cols = [
        col for col in float_cols + int_cols if col not in excluded
    ]

    transformed_columns = []
    method = skewed_columns['method']
    if method == "boxcox":
        for col in numerical_cols:
            # Assign back rather than using a chained in-place fillna.
            # NOTE(review): scipy's boxcox requires strictly positive data,
            # so a column containing 0 (including filled NaNs) will raise.
            df[col] = df[col].fillna(0)
            boxcox_current, _lam = boxcox(df[col])
            df[col] = boxcox_current
            transformed_columns.append(col)
    elif method == "log":
        for col in numerical_cols:
            p_value = normaltest(df[col].values)[1]
            # NOTE(review): normaltest's null hypothesis is normality, so
            # p > 0.05 means normality was NOT rejected. This condition
            # therefore transforms columns that look normal - confirm
            # whether `p_value < 0.05` was intended.
            if p_value > 0.05:
                if df[col].min() >= 0:
                    df[col] = np.log(df[col] + 1)
                else:
                    # Shift so the minimum maps to log(1) = 0.
                    df[col] = np.log(df[col] - df[col].min() + 1)
                transformed_columns.append(col)

    if transformed_columns:
        print("Transformed columns: ", transformed_columns)
    else:
        print("There are not transformed columns")

    return df



def _get_anova_fvalue(
    x: pd.DataFrame,
    y: pd.Series
) -> pd.DataFrame:
    """
    Ranks the features of x by their ANOVA F-score against y.

    Returns:
    - pd.DataFrame: Columns 'features' and 'score', sorted by descending
      score.
    """
    # The larger the F-score, the more the feature's mean differs between
    # attrition classes 0 and 1, i.e. the more the variable matters for the
    # analysis. f_classif also returns the p-values, which are not used here.
    f_statistics, _p_values = f_classif(x, y)
    ranking = pd.DataFrame({'features': x.columns, 'score': f_statistics})
    return ranking.sort_values('score', ascending=False)


def _get_correlations(
    data: pd.DataFrame,
    threshold: float
) -> pd.DataFrame:
    xcorr = data.corr().abs()
    xcorr = xcorr[xcorr > threshold].fillna(0)
    column1 = []
    column2 = []
    for idx in list(xcorr.index):
        for col in list(xcorr.columns):
            # la matriz es diagonal
            if idx == col:
                break
            if (xcorr.loc[idx, col] != 0):
                column1 = column1 + [idx]
                column2 = column2 + [col]
    df_fcorr = pd.DataFrame({'column1': column1, 'column2': column2})
    return df_fcorr


def _remove_columns_by_correlation(
    x: pd.DataFrame,
    df_most_correlated_cols: pd.DataFrame,
    df_anova_fscores: pd.DataFrame
) -> pd.DataFrame:
    for idx in df_most_correlated_cols.index:
        column1 = df_most_correlated_cols.loc[idx, 'column1']
        column2 = df_most_correlated_cols.loc[idx, 'column2']
        score_column1 = df_anova_fscores.loc[
            df_anova_fscores['features'] == column1, 'score'
        ].ravel()
        score_column2 = df_anova_fscores.loc[
            df_anova_fscores['features'] == column2, 'score'
        ].ravel()
        if score_column1 > score_column2:
            df_most_correlated_cols.loc[idx, 'drop'] = column2
        else:
            df_most_correlated_cols.loc[idx, 'drop'] = column1
    drop_features = list(df_most_correlated_cols['drop'].unique())
    print("removed by correlation: ", drop_features)
    df_removed_columns = x.drop(columns=drop_features, axis=1)
    return df_removed_columns


def _remove_columns_by_fvalue(
    df_clean1: pd.DataFrame,
    df_anova_fscores: pd.DataFrame,
    threshold: float
) -> pd.DataFrame:
    df_anova_fscores = df_anova_fscores[df_anova_fscores['score'] > threshold]
    df_removed_columns = df_clean1[df_anova_fscores['features']]
    return df_removed_columns


def feature_selection_correlation_anova(
    df_encoded_data: pd.DataFrame,
    target: str, threshold: Dict
) -> pd.DataFrame:
    """
    Selects features using pairwise correlation and ANOVA F-values.

    Highly correlated features are de-duplicated first (keeping the one with
    the higher F-score); the remaining features are then filtered by
    F-score. The target column is appended to the result.

    Parameters:
    - df_encoded_data (pd.DataFrame): The input DataFrame with encoded
      features.
    - target (str): The name of the target variable.
    - threshold (Dict): A dictionary containing threshold values for
      feature selection.
        - 'corr_threshold' (float): Threshold for correlation coefficient.
        - 'fvalue_threshold' (float): Threshold for ANOVA F-value.

    Returns:
    - pd.DataFrame: The selected features plus the target column.
    """
    features = df_encoded_data.drop(columns=[target])
    labels = df_encoded_data[target]

    fscores = _get_anova_fvalue(features, labels)
    correlated_pairs = _get_correlations(
        features, threshold['corr_threshold']
    )

    decorrelated = _remove_columns_by_correlation(
        features, correlated_pairs, fscores
    )
    df_model_input = _remove_columns_by_fvalue(
        decorrelated, fscores, threshold['fvalue_threshold']
    )

    df_model_input[target] = labels
    return df_model_input

In [3]:
from typing import Dict, Tuple, Any

import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    classification_report, accuracy_score, precision_score,
    recall_score, make_scorer, confusion_matrix
)
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns


def split_data(
    df: pd.DataFrame,
    target_variable: str,
    model_options_lg: Dict
) -> Tuple:
    """
    Splits a DataFrame into stratified train and test sets.

    Parameters:
    - df (pd.DataFrame): The input DataFrame, including the target column.
    - target_variable (str): The name of the target column.
    - model_options_lg (Dict): Split options.
        - 'test_size': Passed to StratifiedShuffleSplit as test_size.
        - 'random_state': Passed to StratifiedShuffleSplit as random_state.

    Returns:
    - Tuple: (x_train, y_train, x_test, y_test).
    """
    y = df[target_variable]
    x = df.drop(columns=[target_variable])

    strat_shuf_split = StratifiedShuffleSplit(
        n_splits=1, test_size=model_options_lg['test_size'],
        random_state=model_options_lg['random_state']
    )

    train_idx, test_idx = next(strat_shuf_split.split(x, y))

    # split() yields positional indices, so select with .iloc. Using .loc
    # only works when df has a default 0..n-1 index and otherwise picks the
    # wrong rows or raises a KeyError.
    x_train = x.iloc[train_idx]
    y_train = y.iloc[train_idx]
    x_test = x.iloc[test_idx]
    y_test = y.iloc[test_idx]

    return x_train, y_train, x_test, y_test


def train_model(
    x_train: pd.DataFrame,
    y_train: pd.Series,
    model_options_lg: Dict
) -> Any:
    """
    Tunes a scaler + classifier pipeline with a stratified grid search.

    NOTE: a later cell of this notebook defines another `train_model` with a
    different signature; once that cell runs, it replaces this function.

    Parameters:
    - x_train (pd.DataFrame): Training features.
    - y_train (pd.Series): Training target.
    - model_options_lg (Dict): Training options.
        - 'model' (str): "LogisticRegression", "SVC" or "RandomForest".
        - 'random_state': Seed for the stratified k-fold shuffling.
        - 'n_splits': Number of cross-validation folds.

    Returns:
    - GridSearchCV: The fitted search, refit on the macro F1 score.

    Raises:
    - ValueError: If 'model' is not one of the supported names.
    """
    model_name = model_options_lg['model']

    # Classifiers are imported here because the notebook only imports
    # LogisticRegression and RandomForestClassifier in a later cell.
    if model_name == "LogisticRegression":
        from sklearn.linear_model import LogisticRegression

        step_name = "logistic_regression"
        classifier = LogisticRegression()
        params = {
            'logistic_regression__penalty': ['l1', 'l2'],
            'logistic_regression__C': [4, 6, 10],
            'logistic_regression__solver': ['liblinear']
        }
    elif model_name == "SVC":
        step_name = "svc_classifier"
        classifier = SVC()
        params = {
            'svc_classifier__C': [2, 4, 6],
            'svc_classifier__kernel': ['rbf', 'sigmoid']
        }
    elif model_name == "RandomForest":
        from sklearn.ensemble import RandomForestClassifier

        step_name = "RF_classifier"
        classifier = RandomForestClassifier()
        params = {
            'RF_classifier__n_estimators': [350, 400, 450],
            'RF_classifier__max_depth': [None, 20],
            'RF_classifier__warm_start': [True]
        }
    else:
        # Fail with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            f"Unsupported model {model_name!r}; expected one of "
            "'LogisticRegression', 'SVC', 'RandomForest'"
        )

    estimator = Pipeline([
        ("scaler", StandardScaler()),
        (step_name, classifier)
    ])

    skf = StratifiedKFold(shuffle=True,
                          random_state=model_options_lg['random_state'],
                          n_splits=model_options_lg['n_splits'])

    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='macro'),
        'recall': make_scorer(recall_score, average='macro'),
        'f1': make_scorer(f1_score, average='macro')
    }

    grid = GridSearchCV(
        estimator, params, scoring=scoring, refit='f1', cv=skf, n_jobs=-1
    )
    grid.fit(x_train, y_train)

    return grid


def evaluate_model(
    model: Any,
    x_test: pd.DataFrame,
    y_test: pd.Series
):
    """
    Prints the grid-search results and test-set metrics of a fitted search,
    and draws the classification report as a heatmap.

    Parameters:
    - model (Any): A fitted search object exposing best_score_, best_params_,
      cv_results_ and predict.
    - x_test (pd.DataFrame): Test features.
    - y_test (pd.Series): Test target.

    Returns:
    - The matplotlib.pyplot module, holding the drawn heatmap.
    """
    best_score = model.best_score_
    best_params = model.best_params_
    print("Best score: ", best_score)
    print("Best params: ", best_params)

    predictions = model.predict(x_test)
    print(classification_report(y_test, predictions))
    print(model.cv_results_['mean_test_f1'])

    report_dict = classification_report(
        y_test, predictions, output_dict=True
    )
    # Drop the last metric row before transposing so that the labels become
    # the heatmap rows.
    report_frame = pd.DataFrame(report_dict).iloc[:-1, :].T
    sns.heatmap(report_frame, annot=True)

    return plt

In [4]:
def load_yaml_file(file_path):
    """
    Loads a YAML file and returns its parsed content.

    Parameters:
    - file_path: Path of the YAML file to read.

    Returns:
    - The Python object produced by yaml.safe_load.
    """
    # Read as UTF-8 explicitly; otherwise the platform default encoding is
    # used, which can mis-decode non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as file:
        data = yaml.safe_load(file)
    return data

# Load the data-processing parameters. The path is resolved from the project
# root (the parent of the notebook's working directory, as the model-path
# cell also assumes) instead of a user-specific absolute path.
from pathlib import Path

yaml_file_path = (
    Path.cwd().parent / "conf" / "base" / "parameters" / "data_processing.yml"
)
yaml_data = load_yaml_file(yaml_file_path)


In [5]:
yaml_data

{'target_variable': 'Attrition_Flag',
 'delete_columns': {'threshold': 0.05, 'selected_columns': ['CLIENTNUM']},
 'reduce_options_columns': {'threshold': 0.03, 'exclude': ['Attrition_Flag']},
 'outliers_columns': ['Customer_Age',
  'Dependent_count',
  'Months_on_book',
  'Total_Relationship_Count',
  'Months_Inactive_12_mon',
  'Contacts_Count_12_mon',
  'Credit_Limit',
  'Total_Revolving_Bal',
  'Avg_Open_To_Buy',
  'Total_Amt_Chng_Q4_Q1',
  'Total_Trans_Amt',
  'Total_Trans_Ct',
  'Total_Ct_Chng_Q4_Q1',
  'Avg_Utilization_Ratio'],
 'skewed_columns': {'method': 'log', 'exclude_columns': []},
 'feature_selection': {'corr_threshold': 0.9, 'fvalue_threshold': 1}}

In [135]:
# Read the raw dataset. The path is resolved from the project root (the
# parent of the notebook's working directory, as the model-path cell also
# assumes) instead of a user-specific absolute path.
from pathlib import Path

RAW_DATA_PATH = Path.cwd().parent / "data" / "01_raw" / "BankChurners.csv"

df_data = pd.read_csv(
    RAW_DATA_PATH,
    usecols=['Attrition_Flag', 'Customer_Age', 'Gender',
       'Dependent_count', 'Education_Level', 'Marital_Status',
       'Income_Category', 'Card_Category', 'Months_on_book',
       'Total_Relationship_Count', 'Months_Inactive_12_mon',
       'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
       'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
       'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio']
)


def transform_target(df, target_variable):
    """
    Encodes the target column as 0/1 and renames it to "Attrition".

    "Existing Customer" becomes 0; every other value becomes 1. The frame is
    modified in place and also returned.
    """
    def _to_flag(label):
        if label == "Existing Customer":
            return 0
        return 1

    df[target_variable] = df[target_variable].map(_to_flag)
    df.rename(columns={target_variable: "Attrition"}, inplace=True)
    return df


def handle_outliers(
    df: pd.DataFrame,
    outliers_columns: Dict
) -> pd.DataFrame:
    """
    Removes rows holding outliers in the given columns.

    Columns are processed one after another on the already-reduced frame.
    For each column a normality test picks the rule: mean +/- 3 standard
    deviations when p < 0.05, otherwise the 1.5 * IQR fences. The frame is
    modified in place, its index is reset, and it is returned.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - outliers_columns (Dict): The column names to check. Only iteration
      over it is used, so a list of names works as well as a dictionary
      keyed by column name.

    Returns:
    - pd.DataFrame: The DataFrame with outliers removed.
    """
    print("Shape before removing: ", df.shape)
    removed_per_column = []

    for name in outliers_columns:
        series = df[name]
        p_value = normaltest(series.values)[1]
        # NOTE(review): p < 0.05 rejects normality, yet that is the branch
        # using the mean/std rule usually reserved for normal data - confirm
        # whether the two branches are swapped.
        if p_value < 0.05:
            center = series.mean()
            spread = series.std()
            upper = center + 3 * spread
            lower = center - 3 * spread
        else:
            first_quartile = series.quantile(0.25)
            third_quartile = series.quantile(0.75)
            iqr = third_quartile - first_quartile
            lower = first_quartile - (iqr * 1.5)
            upper = third_quartile + (iqr * 1.5)

        out_of_range = (series < lower) | (series > upper)
        flagged = df.index[out_of_range].tolist()
        if flagged:
            df.drop(flagged, axis=0, inplace=True)
            removed_per_column.append((name, len(flagged)))

    df.reset_index(inplace=True, drop=True)
    if removed_per_column:
        print("Outliers deleted: ", removed_per_column)
        print("Shape after removing: ", df.shape)
    else:
        print("There are not outliers")
    return df


def reduce_categorical_column_options(
        df: pd.DataFrame,
        reduce_columns: Dict
) -> pd.DataFrame:
    """
    Reduces the number of options in categorical columns of a DataFrame based
    on specified criteria.

    The input frame is left untouched; a modified copy is returned.

    Parameters:
    - df (pd.DataFrame): The input DataFrame.
    - reduce_columns (Dict): A dictionary containing information about columns
      to be reduced.
        - 'threshold' (float): Threshold for reducing categories. Categories
          with a relative frequency below this threshold will be replaced
          with 'others'.
        - 'columns' (List): Names of the columns to reduce.

    Returns:
    - pd.DataFrame: The DataFrame with reduced categorical options.
    """
    df_data_red = df.copy()
    combined_col_names = []
    for col in reduce_columns['columns']:
        frequencies = df[col].value_counts(normalize=True)
        replace_cats = list(
            frequencies[frequencies < reduce_columns['threshold']].index
        )
        if replace_cats:
            # Replace within this column only; replacing across the whole
            # frame and then selecting one column does needless work.
            df_data_red[col] = df_data_red[col].replace(
                replace_cats, 'others'
            )
            combined_col_names.append(col)

    if combined_col_names:
        print("Reduced columns: ", combined_col_names)
    else:
        print("Not reduced columns")

    return df_data_red


def get_preprocessor(
    df: pd.DataFrame
) -> "ColumnTransformer":
    """
    Builds and fits the column preprocessor used before modelling.

    The 'Attrition' target column is dropped and the transformer is fitted
    on the remaining columns:
    - 'Education_Level' and 'Income_Category': ordinal encoding with an
      explicit category order.
    - Selected numerical columns: min-max scaling.
    - 'Marital_Status' and 'Gender': ordinal encoding.
    - 'Card_Category': one-hot encoding.
    - All other columns are passed through unchanged.

    Parameters:
    - df (pd.DataFrame): The input DataFrame, including the 'Attrition'
      column.

    Returns:
    - ColumnTransformer: The fitted preprocessor (not a transformed
      DataFrame).
    """
    target_variable = 'Attrition'
    x = df.drop(columns=[target_variable])

    education_order = [
        'Unknown', 'Uneducated', 'High School', 'College', 'Graduate',
        'Post-Graduate', 'Doctorate'
    ]
    income_order = [
        "Unknown", "Less than $40K", "$40K - $60K", "$60K - $80K",
        "$80K - $120K", "$120K +"
    ]
    scaled_columns = [
        'Customer_Age', 'Months_on_book', 'Credit_Limit',
        'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Trans_Amt'
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('Custom_education',
             OrdinalEncoder(categories=[education_order]),
             ['Education_Level']),
            ('Custom_income',
             OrdinalEncoder(categories=[income_order]),
             ['Income_Category']),
            ('MinMax', MinMaxScaler(), scaled_columns),
            ('Ordinal', OrdinalEncoder(), ['Marital_Status', 'Gender']),
            ('onehot', OneHotEncoder(), ['Card_Category'])
        ],
        remainder='passthrough',  # Leave the other columns unchanged
    )

    preprocessor.fit(x)

    return preprocessor



# Encode the target on the complete dataset (before any train/test split).
df_data = transform_target(df_data, 'Attrition_Flag')

# Remove outliers. The columns come from the loaded parameters
# (yaml_data['outliers_columns']) rather than a duplicated hard-coded list.
# NOTE(review): this runs on the complete dataset, although the original
# note says it was meant to be applied after the split, on the train set only.
df_data = handle_outliers(df_data, yaml_data['outliers_columns'])

# Fit the column preprocessor on the cleaned dataset.
preprocessor = get_preprocessor(df_data)
df_data.head()

Shape before removing:  (10127, 20)
Outliers deleted:  [('Customer_Age', 1), ('Months_Inactive_12_mon', 124), ('Contacts_Count_12_mon', 620), ('Total_Amt_Chng_Q4_Q1', 159), ('Total_Trans_Amt', 308), ('Total_Trans_Ct', 7), ('Total_Ct_Chng_Q4_Q1', 92)]
Shape after removing:  (8816, 20)


Unnamed: 0,Attrition,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0,44,M,2,Graduate,Married,$40K - $60K,Blue,36,3,1,2,4010.0,1247,2763.0,1.376,1088,24,0.846,0.311
1,0,42,M,5,Uneducated,Unknown,$120K +,Blue,31,5,3,2,6748.0,1467,5281.0,0.831,1201,42,0.68,0.217
2,0,57,F,2,Graduate,Married,Less than $40K,Blue,48,5,2,2,2436.0,680,1756.0,1.19,1570,29,0.611,0.279
3,0,45,F,2,Graduate,Married,Unknown,Blue,37,6,1,2,14470.0,1157,13313.0,0.966,1207,21,0.909,0.08
4,1,62,F,0,Graduate,Married,Less than $40K,Blue,49,2,3,3,1438.3,0,1438.3,1.047,692,16,0.6,0.0


In [133]:
df_data.head()

Unnamed: 0,Attrition,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,0,44,M,2,Graduate,Married,$40K - $60K,Blue,36,3,1,2,4010.0,1247,2763.0,1.376,1088,24,0.846,0.311
1,0,42,M,5,Uneducated,Unknown,$120K +,Blue,31,5,3,2,6748.0,1467,5281.0,0.831,1201,42,0.68,0.217
2,0,57,F,2,Graduate,Married,Less than $40K,Blue,48,5,2,2,2436.0,680,1756.0,1.19,1570,29,0.611,0.279
3,0,45,F,2,Graduate,Married,Unknown,Blue,37,6,1,2,14470.0,1157,13313.0,0.966,1207,21,0.909,0.08
4,1,62,F,0,Graduate,Married,Less than $40K,Blue,49,2,3,3,1438.3,0,1438.3,1.047,692,16,0.6,0.0


In [18]:
# df_data.Card_Category.unique()

In [19]:
# df_data.info()

In [10]:
df_data.describe()

Unnamed: 0,Attrition,Customer_Age,Dependent_count,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
count,8816.0,8816.0,8816.0,8816.0,8816.0,8816.0,8816.0,8816.0,8816.0,8816.0,8816.0,8816.0,8816.0,8816.0,8816.0
mean,0.167083,46.316357,2.351293,35.898707,3.830649,2.299342,2.499433,8438.061207,1149.609347,7288.45186,0.741933,4126.729923,64.345168,0.694257,0.278638
std,0.373071,7.948977,1.29824,7.914192,1.557597,0.93152,0.929628,9019.391051,815.940024,9025.217553,0.18031,2776.243793,21.796597,0.195916,0.278417
min,0.0,26.0,0.0,13.0,1.0,0.0,1.0,1438.3,0.0,3.0,0.12,510.0,10.0,0.038,0.0
25%,0.0,41.0,1.0,32.0,3.0,2.0,2.0,2494.75,168.0,1251.5,0.625,2199.75,46.0,0.574,0.0105
50%,0.0,46.0,2.0,36.0,4.0,2.0,3.0,4358.5,1259.0,3304.5,0.73,3915.0,67.0,0.694,0.1795
75%,0.0,52.0,3.0,40.0,5.0,3.0,3.0,10678.25,1769.0,9528.5,0.848,4691.25,80.0,0.811,0.513
max,1.0,70.0,5.0,56.0,6.0,5.0,4.0,34516.0,2517.0,34516.0,1.4,14807.0,130.0,1.37,0.999


In [20]:
# for column in df_data.columns:
#     print(column, df_data[column].unique())

In [21]:
# yaml_data

In [22]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder, FunctionTransformer, OrdinalEncoder


# df = df_data.copy()
# target_variable = 'Attrition'
# y = df[target_variable]
# x = df.drop(columns=[target_variable])


# education_order = ['Unknown', 'Uneducated', 'High School', 'College', 'Graduate', 'Post-Graduate', 'Doctorate']
# income_order = ["Unknown", "Less than $40K", "$40K - $60K", "$60K - $80K", "$80K - $120K", "$120K +"]


# # Create a ColumnTransformer
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('Custom_education', OrdinalEncoder(categories=[education_order]), ['Education_Level']),
#         ('Custom_income', OrdinalEncoder(categories=[income_order]), ['Income_Category']),
#         ('MinMax', MinMaxScaler(), ['Customer_Age', 'Months_on_book', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Trans_Amt']),
#         ('Ordinal', OrdinalEncoder(), ['Marital_Status', 'Gender']),
#         ('onehot', OneHotEncoder(), ['Card_Category'])
#     ],
#     remainder='passthrough', # Leave the other columns unchanged

# )

# # Label encoder
# preprocessor.fit(x) 
# preprocessor

### Model Training

In [23]:

# def load_yaml_file(file_path):
#     with open(file_path, 'r') as file:
#         data = yaml.safe_load(file)
#     return data

# # Example usage
# yaml_file_path = 'C:/Users/luisg/Documents/projects/data_science_bank_churn/conf/base/parameters/data_science.yml'
# yaml_training_data = load_yaml_file(yaml_file_path)

In [24]:
# yaml_training_data

In [156]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier


def split_dataset(df, preprocessor):
    """
    Splits df into stratified train/test sets (80/20, seed 42) and applies
    the preprocessor.

    The preprocessor is re-fitted on the training features only and then
    used to transform both sets; the passed object is modified by that fit.

    Parameters:
    - df (pd.DataFrame): The input DataFrame, including the 'Attrition'
      column.
    - preprocessor: A transformer exposing fit_transform and transform.

    Returns:
    - Tuple: (x_train, y_train, x_test, y_test), where the x values are the
      preprocessor's outputs.
    """
    target_variable = 'Attrition'
    y = df[target_variable]
    x = df.drop(columns=[target_variable])
    strat_shuf_split = StratifiedShuffleSplit(
        n_splits=1, test_size=0.2,
        random_state=42
    )

    train_idx, test_idx = next(strat_shuf_split.split(x, y))

    # split() yields positional indices, so use .iloc throughout. Mixing it
    # with label-based y[...] / .loc only works on a default 0..n-1 index.
    x_train = x.iloc[train_idx, :]
    y_train = y.iloc[train_idx]
    x_test = x.iloc[test_idx, :]
    y_test = y.iloc[test_idx]

    x_train = preprocessor.fit_transform(x_train)
    x_test = preprocessor.transform(x_test)

    return x_train, y_train, x_test, y_test


def evaluate_models(X_train, y_train, X_test, y_test, models, param):
    """
    Grid-searches, refits and scores every candidate model.

    For each entry of models, a 3-fold stratified grid search (refit on
    macro F1) selects the best parameters. Those are then set on the
    original estimator object, which is fitted on the full training set, so
    the estimators held in models are fitted when this returns.

    Parameters:
    - X_train, y_train: Training features and target.
    - X_test, y_test: Test features and target.
    - models (dict): Model name -> estimator.
    - param (dict): Model name -> parameter grid.

    Returns:
    - dict: Model name -> {'accuracy', 'precision', 'recall', 'f1'} computed
      on the test set with the metrics' default averaging.
    """
    cv_scoring = {
        'accuracy': make_scorer(accuracy_score),
        'precision': make_scorer(precision_score, average='macro'),
        'recall': make_scorer(recall_score, average='macro'),
        'f1': make_scorer(f1_score, average='macro')
    }
    folds = StratifiedKFold(shuffle=True, random_state=42, n_splits=3)

    report = {}
    for name, model in models.items():
        search = GridSearchCV(
            model, param[name], cv=folds, scoring=cv_scoring, refit='f1'
        )
        search.fit(X_train, y_train)

        # Fit the caller's estimator (not the search's internal clone) so it
        # can be used after this function returns.
        model.set_params(**search.best_params_)
        model.fit(X_train, y_train)

        y_test_pred = model.predict(X_test)

        report[name] = {
            'accuracy': accuracy_score(y_test, y_test_pred),
            'precision': precision_score(y_test, y_test_pred),
            'recall': recall_score(y_test, y_test_pred),
            'f1': f1_score(y_test, y_test_pred)
        }

    return report


def train_model(x_train, y_train, x_test, y_test):
    """
    Trains several classifiers and returns the one with the best test F1.

    NOTE: this definition replaces the earlier `train_model(x_train, y_train,
    model_options_lg)` from a previous cell once this cell is executed.
    NOTE(review): the best model is chosen by its score on the test set, so
    that score is an optimistic estimate of generalization.

    Parameters:
    - x_train, y_train: Training features and target.
    - x_test, y_test: Test features and target.

    Returns:
    - The fitted estimator with the highest test-set F1 score.
    """
    # Seed the stochastic estimators so repeated runs select the same model.
    seed = 42

    models = {
        "Logistic Regression": LogisticRegression(),
        "KNeighbors Classifier": KNeighborsClassifier(),
        "Support Vector Machine": SVC(),
        "Random Forest": RandomForestClassifier(random_state=seed),
        "GradientBoosting Classifier": GradientBoostingClassifier(
            random_state=seed
        ),
        "AdaBoost Classifier": AdaBoostClassifier(random_state=seed),
        "XGB Classifier": XGBClassifier(random_state=seed)
    }
    params = {
        "Logistic Regression": {
            'penalty': ['l2', 'l1'],
            'solver': ['liblinear']
        },
        "KNeighbors Classifier": {
            'n_neighbors': [5, 7],
            'weights': ['uniform', 'distance']
        },
        "Support Vector Machine": {
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']
        },
        "Random Forest": {
            'n_estimators': [100, 200]
        },
        "GradientBoosting Classifier": {
            'n_estimators': [100, 200]
        },
        "AdaBoost Classifier": {
            'n_estimators': [100, 200]
        },
        "XGB Classifier": {
            'n_estimators': [100, 200]
        }
    }

    model_report = evaluate_models(
        X_train=x_train, y_train=y_train, X_test=x_test, y_test=y_test,
        models=models, param=params
    )

    # Highest test F1 wins; on a tie the first model in `models` is kept.
    best_model_name = max(
        model_report, key=lambda name: model_report[name]['f1']
    )

    return models[best_model_name]

# Split the cleaned dataset and apply the preprocessor to both parts.
x_train, y_train, x_test, y_test = split_dataset(df_data, preprocessor)
# NOTE(review): both training calls below are commented out, yet later cells
# reference `report`, `best_model_name` and `best_model`. On a fresh kernel
# those cells fail unless one of these calls is restored. The first variant
# also needs the commented-out three-value return in train_model.
# report, best_model_name, best_model = train_model(x_train, y_train, x_test, y_test)
# best_model = train_model(x_train, y_train, x_test, y_test)

In [142]:
report

{'Logistic Regression': {'accuracy': 0.8990929705215419,
  'precision': 0.7532467532467533,
  'recall': 0.5898305084745763,
  'f1': 0.6615969581749049},
 'KNeighbors Classifier': {'accuracy': 0.8696145124716553,
  'precision': 0.6470588235294118,
  'recall': 0.4847457627118644,
  'f1': 0.5542635658914729},
 'Support Vector Machine': {'accuracy': 0.9013605442176871,
  'precision': 0.784037558685446,
  'recall': 0.5661016949152542,
  'f1': 0.6574803149606299},
 'Random Forest': {'accuracy': 0.9563492063492064,
  'precision': 0.9128787878787878,
  'recall': 0.8169491525423729,
  'f1': 0.8622540250447227},
 'GradientBoosting Classifier': {'accuracy': 0.9671201814058957,
  'precision': 0.9128919860627178,
  'recall': 0.888135593220339,
  'f1': 0.9003436426116839},
 'AdaBoost Classifier': {'accuracy': 0.9586167800453514,
  'precision': 0.8775510204081632,
  'recall': 0.8745762711864407,
  'f1': 0.8760611205432938},
 'XGB Classifier': {'accuracy': 0.969954648526077,
  'precision': 0.908783783

In [138]:
best_model_name

'XGB Classifier'

In [143]:
f1_score(y_test, best_model.predict(x_test))

0.910321489001692

In [149]:
x_train.shape

(7052, 22)

In [131]:
# best_model.get_params

In [35]:
from pathlib import Path

# Artifact locations, resolved from the parent of the current working
# directory: <parent>/data/06_models/.
file_path = Path.cwd()
model_path = file_path.parent / "data" / "06_models" / "model_notebook.pkl"
preprocessor_path = file_path.parent / "data" / "06_models" / "preprocessor.pkl"
# Display the resolved model path.
model_path

WindowsPath('c:/Users/luisg/Documents/projects/data_science_bank_churn/data/06_models/model_notebook.pkl')

In [37]:
import pickle

# Save the model using pickle
# NOTE(review): `model` is not assigned at top level in any cell shown in
# this notebook (the training cells produce `best_model`), so this relies on
# leftover kernel state - confirm which object is meant to be saved.
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

# Save the preprocessor using pickle
with open(preprocessor_path, 'wb') as file:
    pickle.dump(preprocessor, file)

In [38]:
import pickle


# Load the saved model back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(model_path, 'rb') as file:
    loaded_model = pickle.load(file)

# Load the saved preprocessor; this rebinds the `preprocessor` name.
with open(preprocessor_path, 'rb') as file:
    preprocessor = pickle.load(file)

c:\Users\luisg\Documents\projects\data_science_bank_churn\data\06_models\model_notebook.pkl


In [185]:
x_test

Unnamed: 0,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
999,56,F,4,High School,Single,Less than $40K,Blue,36,3,1,3,8881.0,2145,6736.0,0.705,2027,49,0.815,0.242
1697,54,F,0,Graduate,Divorced,Less than $40K,Silver,36,4,3,1,12547.0,1378,11169.0,1.120,3360,56,0.750,0.110
653,26,M,0,Graduate,Single,Unknown,Blue,19,4,1,2,1438.3,0,1438.3,0.472,2005,47,0.469,0.000
3863,51,M,2,High School,Married,$60K - $80K,Blue,46,5,3,2,4123.0,1760,2363.0,0.869,2447,40,0.429,0.427
271,46,F,4,Graduate,Divorced,Less than $40K,Blue,36,5,3,1,9478.0,820,8658.0,0.640,1327,41,0.864,0.087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4616,39,F,1,Uneducated,Single,Less than $40K,Blue,34,3,1,4,6904.0,1148,5756.0,0.649,3485,84,0.615,0.166
2784,46,F,2,High School,Unknown,Unknown,Blue,32,4,1,3,7966.0,1423,6543.0,0.987,3780,75,0.923,0.179
3136,43,M,5,Graduate,Unknown,$40K - $60K,Blue,38,3,3,2,7599.0,1100,6499.0,0.365,4401,71,0.651,0.145
1428,37,F,3,Graduate,Single,Less than $40K,Blue,36,5,2,4,3685.0,1396,2289.0,0.974,3016,85,0.809,0.379


In [186]:
predictions = loaded_model.predict(x_test)

In [187]:
y_test

999     0
1697    0
653     0
3863    1
271     0
       ..
4616    0
2784    0
3136    0
1428    0
5635    0
Name: Attrition, Length: 1764, dtype: int64

In [188]:
accuracy_score(y_test, predictions)

0.8979591836734694

In [189]:
y_train

7322    1
5747    0
6555    0
7892    1
2816    0
       ..
4074    0
7739    0
668     0
1430    0
55      0
Name: Attrition, Length: 7052, dtype: int64

In [190]:
f1_score(y_train, loaded_model.predict(x_train))

0.6544930321960596

In [191]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1469
           1       0.75      0.59      0.66       295

    accuracy                           0.90      1764
   macro avg       0.83      0.77      0.80      1764
weighted avg       0.89      0.90      0.89      1764



In [192]:
confusion_matrix(y_test, predictions)

array([[1411,   58],
       [ 122,  173]], dtype=int64)

In [4]:
from pathlib import Path

import pandas as pd


# Build a one-row sample ("prueba" = test) from the raw dataset for a
# prediction smoke test. The path is resolved from the project root (the
# parent of the notebook's working directory) instead of a user-specific
# absolute path, and only the first row is read.
df_prueba = pd.read_csv(
    Path.cwd().parent / "data" / "01_raw" / "BankChurners.csv",
    nrows=1
)
# The target is what the model predicts, so it is removed from the input.
df_prueba = df_prueba.drop(columns=['Attrition_Flag'])
df_prueba.head()

Unnamed: 0,CLIENTNUM,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061


In [1]:
print(len(df_prueba.columns))

NameError: name 'df_prueba' is not defined

In [6]:
# Drop the client identifier directly; the equivalent call through
# remove_unnecessary_columns is kept below for reference.
# df_prueba = remove_unnecessary_columns(df_prueba, yaml_data['delete_columns'])
df_prueba.drop(columns=['CLIENTNUM'], inplace = True)
df_prueba.reset_index(inplace=True, drop=True)
# Show the remaining columns and their dtypes.
df_prueba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 19 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Customer_Age              1 non-null      int64  
 1   Gender                    1 non-null      object 
 2   Dependent_count           1 non-null      int64  
 3   Education_Level           1 non-null      object 
 4   Marital_Status            1 non-null      object 
 5   Income_Category           1 non-null      object 
 6   Card_Category             1 non-null      object 
 7   Months_on_book            1 non-null      int64  
 8   Total_Relationship_Count  1 non-null      int64  
 9   Months_Inactive_12_mon    1 non-null      int64  
 10  Contacts_Count_12_mon     1 non-null      int64  
 11  Credit_Limit              1 non-null      float64
 12  Total_Revolving_Bal       1 non-null      int64  
 13  Avg_Open_To_Buy           1 non-null      float64
 14  Total_Amt_Chng

In [7]:
len(df_prueba.values.tolist()[0])

19

In [8]:
# df_prueba.drop(columns=['CLIENTNUM'], inplace= True)
# data_scaled=preprocessor.transform(df_prueba)

# Map the predicted class to a readable label.
result_dict = {0: "No Churn", 1: "Churn"}
# NOTE(review): the raw, un-transformed row is passed to the model (the
# preprocessor call above is commented out); presumably the pickled model
# embeds its own preprocessing - verify.
result = loaded_model.predict(df_prueba)
{'result': result_dict[result[0]]}

{'result': 'No Churn'}

In [239]:
preprocessor

ColumnTransformer(remainder='passthrough',
                  transformers=[('Custom_education',
                                 FunctionTransformer(func=<function transform_education_level at 0x0000027BD60BC598>),
                                 ['Education_Level']),
                                ('Custom_income',
                                 FunctionTransformer(func=<function transform_income_category at 0x0000027BD60BC0D0>),
                                 ['Income_Category']),
                                ('MinMax', MinMaxScaler(),
                                 ['Customer_Age', 'Months_on_book',
                                  'Credit_Limit', 'Total_Revolving_Bal',
                                  'Avg_Open_To_Buy', 'Total_Trans_Amt']),
                                ('Ordinal', OrdinalEncoder(),
                                 ['Marital_Status', 'Gender']),
                                ('onehot', OneHotEncoder(), ['Card_Category'])])