# Client attrition - model for classification

Maciej Lorens and Pola Parol

In [1]:
import pandas as pd

# SKLEARN
from sklearn.impute import SimpleImputer

from sklearn.feature_selection import (
    SelectFromModel
)

from sklearn.compose import ColumnTransformer

# GRADIENT BOOSTING
import xgboost as xgb

# CATEGORY ENCODERS
from category_encoders import (
    OneHotEncoder
)

# TREATING IMBALANCE
import imblearn

In [3]:
# Load the training and test splits from the local data/ directory.
credit_train = pd.read_csv("data/client_attrition_train.csv")
credit_test = pd.read_csv("data/client_attrition_test.csv")

# customer_id is removed from both splits so it is never fed to the model
# (presumably it is just a row identifier -- confirm against the data dictionary).
credit_train = credit_train.drop(columns=['customer_id'])
credit_test = credit_test.drop(columns=['customer_id'])

def get_cats(df, columns=None):
    """Cast categorical columns of ``df`` to the pandas ``category`` dtype.

    The frame is modified in place; nothing is returned.

    Args:
        df: DataFrame whose columns are converted.
        columns: Optional column name or iterable of column names to convert.
            When omitted, the five customer/card columns used by this
            notebook are converted, which is the original behaviour.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    if columns is None:
        columns = (
            'customer_sex',
            'customer_education',
            'customer_civil_status',
            'customer_salary_range',
            'credit_card_classification',
        )
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character.
        columns = (columns,)

    for col in columns:
        df[col] = df[col].astype('category')

# Convert the categorical columns of both splits (get_cats mutates in place).
for frame in (credit_train, credit_test):
    get_cats(frame)

# Encode the target as integers: 'closed' -> 1, 'open' -> 0.
# Series.map turns any other value into NaN.
status_to_label = {'closed': 1, 'open': 0}
credit_train['account_status'] = credit_train['account_status'].map(status_to_label)

# Separate the target from the feature matrix.
y_train = credit_train['account_status']
X_train = credit_train.drop(columns=['account_status'])

# Columns that are one-hot encoded; the order is kept as-is because it
# determines the column order of the transformed output.
onehot_columns = [
    'customer_salary_range',
    'customer_education',
    'customer_civil_status',
    'customer_sex',
    'credit_card_classification',
]

# Preprocessing: one-hot encode the categorical columns (with indicator
# handling for unknown and missing values), impute two numeric columns, and
# pass every remaining column through unchanged.
ct = ColumnTransformer(
    transformers=[
        (
            'onehot',
            OneHotEncoder(handle_unknown='indicator', handle_missing='indicator'),
            onehot_columns,
        ),
        # Missing transaction amounts are replaced by the column mean.
        ('num_imputer', SimpleImputer(strategy='mean'), ['total_transaction_amount']),
        # NOTE(review): despite the 'cat_imputer' label, this step fills
        # customer_age with its most frequent value.
        ('cat_imputer', SimpleImputer(strategy='most_frequent'), ['customer_age']),
    ],
    remainder='passthrough',
)

# Gradient-boosted classifier with fixed hyperparameters (the long decimals
# presumably come from an earlier tuning run -- not shown in this file).
# random_state is pinned so repeated runs use the same seed.
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.16474046382972407,
    gamma=0.5124082846922701,
    max_depth=10,
    min_child_weight=2,
    subsample=0.8075185211385638,
    colsample_bytree=0.7511935269307113,
    reg_alpha=0.8418473994077464,
    reg_lambda=0.4909430329081276,
    n_jobs=-1,
    random_state=1,
)

# Class-imbalance handling: SMOTETomek combined resampler, configured with a
# Tomek-links cleaner whose sampling_strategy is 'majority'.
tomek_links = imblearn.under_sampling.TomekLinks(sampling_strategy='majority')
resample = imblearn.combine.SMOTETomek(tomek=tomek_links, random_state=1)

# Feature selection driven by the same XGBoost configuration, using an
# importance threshold of 0.01.
fs = SelectFromModel(estimator=xgb_model, threshold=1e-2)

# Full pipeline: preprocess -> resample -> select features -> classify.
# The imblearn Pipeline is used because the 'tomek' step is a resampler
# (note that this step is the whole SMOTETomek object, not only Tomek links).
opt_xgb = imblearn.pipeline.Pipeline(
    steps=[
        ('preprocessing', ct),
        ('tomek', resample),
        ('feature_select', fs),
        ('model', xgb_model),
    ]
)

# Fit the whole pipeline on the training data.
opt_xgb.fit(X_train, y_train)

# Predict on the test split and translate the numeric classes back into the
# original labels (1 -> 'closed', anything else -> 'open').
raw_preds = opt_xgb.predict(credit_test)
preds = ['closed' if label == 1 else 'open' for label in raw_preds]

# Write the predictions as a single-column CSV without the index.
submission = pd.DataFrame({'prediction': preds})
submission.to_csv('classification_prediction.csv', index=False)