In [None]:
# hide
# default_exp L3A_user_model.py
# from nbdev.showdoc import *

# 03A user churn model

> Packaging the churn model, given records similar to `user_profile` (combined data and features generated from various sources).

## Library

In [None]:
# Library
import pandas as pd
import numpy as np
import os
from zipfile import ZipFile
from scipy import spatial
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import seaborn as sns; sns.set_theme()
from collections import defaultdict

from tsfresh import extract_features
from tsfresh.feature_selection.relevance import calculate_relevance_table
import tsfresh

In [None]:
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib

## Functions

In [None]:
#exports
def load_directory_files_dict(dir_path)->dict:
    """Load every pickle file found directly in `dir_path` into a dict.

    Hidden files (names starting with ".") and files that do not end in
    ".pkl" are skipped.

    The dict key is built from the file name: the first two "_"-separated
    tokens joined by "_", with ".pkl" removed (``user_profile.pkl`` ->
    ``user_profile``, ``web_usage_extra.pkl`` -> ``web_usage``). A file name
    without any "_" uses the name minus the ".pkl" extension. If two files
    map to the same key, the one that sorts last wins.

    Parameters
    ----------
    dir_path : path-like
        Directory to scan (not recursive).

    Returns
    -------
    dict
        Maps each derived key to the object returned by `pd.read_pickle`.
    """
    pkl_ext = ".pkl"
    # Use the `dir_path` argument (not a notebook-level global) so the function
    # works on a fresh kernel and for any directory. Sorting makes the result
    # independent of the arbitrary order returned by os.listdir.
    file_names = sorted(
        f for f in os.listdir(dir_path)
        if not f.startswith(".") and f.endswith(pkl_ext)
    )

    dt = {}
    for file_name in file_names:
        parts = file_name.split("_")
        if len(parts) > 1:
            key = parts[0] + "_" + parts[1].replace(pkl_ext, "")
        else:
            # No underscore in the name: fall back to the bare file stem.
            key = file_name[:-len(pkl_ext)]
        # NOTE: unpickling can execute arbitrary code; only point this at
        # directories whose contents are trusted.
        dt[key] = pd.read_pickle(os.path.join(dir_path, file_name))
    return dt

## data

In [None]:
# Input (L2) and output (L3) data directories, relative to the working directory
path_load, path_save = (os.path.join("Data", level) for level in ("L2", "L3"))

# Every .pkl in the L2 directory, keyed by its name-derived key
dt = load_directory_files_dict(path_load)

In [None]:
# Peek at the first five rows of the combined user profile table
dt['user_profile'].head(n=5)

Unnamed: 0,msisdn,churn,train__age,train__contract,train__internet_service,train__account_start_year,planning_area,train__month_delta,census__med_income,census__avg_income,...,web__starhub__sum_values,web__starhub__mean,web__starhub__absolute_sum_of_changes,web__singtel__absolute_sum_of_changes,"web__starhub__linear_trend__attr_""intercept""","web__singtel__linear_trend__attr_""intercept""",web__singtel__minimum,web__starhub__benford_correlation,web__starhub__minimum,web__singtel__benford_correlation
0,6048764759382,0,44,0,1,2018,TOA PAYOH,25,1500,5089.771035,...,1779.0,296.5,1213.0,2009.0,133.571429,630.857143,184.0,0.473221,79.0,0.057715
1,891319344217,0,56,1,1,2010,TOA PAYOH,117,1500,5089.771035,...,4656.0,776.0,2144.0,2499.0,919.714286,244.857143,140.0,-0.026776,310.0,0.383169
2,99251853671,0,26,0,1,2007,TOA PAYOH,151,1500,5089.771035,...,3502.0,583.666667,2862.0,1306.0,444.52381,293.380952,76.0,0.662261,17.0,-0.282165
3,9795194264183,0,48,0,0,2014,TOA PAYOH,73,1500,5089.771035,...,4214.0,702.333333,1985.0,995.0,947.47619,609.190476,176.0,0.055723,440.0,0.077998
4,5833245602906,0,52,0,0,2018,TOA PAYOH,15,1500,5089.771035,...,3069.0,511.5,1605.0,764.0,237.571429,386.809524,311.0,0.140976,82.0,-0.110172


## target/features separation

In [None]:
# Split target and features without mutating dt['user_profile'], so this cell
# can be re-run safely (an in-place pop() raises KeyError on the second run).
user_profile = dt['user_profile']
y = user_profile['churn']
X = user_profile.drop(columns='churn')
X_columns = X.columns

## sklearn pipeline

In [None]:
# Set up a pipeline with a feature selection preprocessor that
# drops unnecessary features,
# then uses a RandomForestClassifier to train the model.

# 'msisdn' is the identifier column of user_profile; being numeric it would be
# accepted silently as a feature if it were not dropped here.
dropping_columns = ['msisdn','planning_area']

# Fail loudly on a misspelled / missing column instead of quietly keeping it.
missing_columns = [col for col in dropping_columns if col not in X_columns]
assert not missing_columns, f"columns to drop not found in X: {missing_columns}"

# Columns are dropped by position, so inputs at predict time must keep the
# same column order as X.
drop_idx = [idx for idx, i in enumerate(X_columns) if i in dropping_columns]

pipeline = Pipeline([
      ("select", ColumnTransformer([('drop','drop', drop_idx)], remainder='passthrough')),  # filtering for index columns
      ('classification', RandomForestClassifier( 
          class_weight='balanced_subsample', max_depth=4,
                       min_samples_leaf=0.03, n_estimators=500, n_jobs=-1,
                       random_state=0) # model
      )
    ])

pipeline.fit(X, y)

# Export the classifier to a file (create the output directory on first run)
os.makedirs(path_save, exist_ok=True)
joblib.dump(pipeline, os.path.join(path_save, 'RF_churn_model.joblib'))

['Data\\L3\\RF_churn_model.joblib']

### prediction 
with the saved sklearn joblib pipeline

In [None]:
# Reload the persisted pipeline so the prediction exercises the saved file,
# not only the in-memory object fitted above.
saved_pipeline = joblib.load(os.path.join(path_save, 'RF_churn_model.joblib'))
saved_pipeline.predict(X.iloc[:1])

array([0])