In [1]:
#Snowpark lib
from snowflake.snowpark import Session

# Data Science Libs
import numpy as np
import pandas as pd

# create_temp_table warning suppression
import warnings; warnings.simplefilter('ignore')

#ConfigParser to read ini file
import configparser

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Seed NumPy's global RNG so steps that fall back to it are repeatable.
np.random.seed(0)

# Snowflake connection settings live in an ini file outside the notebook.
credentials_path = "/notebooks/notebooks/credentials.ini"
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Check that list so a missing credentials file
# fails with a clear message instead of a KeyError on the "Snowflake" section.
if not config.read(credentials_path):
    raise FileNotFoundError(f"could not read credentials file: {credentials_path}")

# Values read from a ConfigParser section are already strings, so they can be
# passed through as-is.
snowflake_config = config["Snowflake"]
connection_parameters = {
    "user": snowflake_config["user"],
    "password": snowflake_config["password"],
    "account": snowflake_config["account"],
    "WAREHOUSE": snowflake_config["WAREHOUSE"],
    "DATABASE": snowflake_config["DATABASE"],
    "SCHEMA": snowflake_config["SCHEMA"],
}

def snowflake_connector(conn):
    """Open a Snowpark session from a dict of connection parameters.

    Args:
        conn: Connection options handed to ``Session.builder.configs``.

    Returns:
        The object returned by ``Session.builder.configs(conn).create()``.

    Raises:
        ValueError: If the session cannot be created. The underlying error is
            attached as ``__cause__`` so the real failure reason is preserved.
    """
    try:
        session = Session.builder.configs(conn).create()
    except Exception as exc:
        # Catch Exception (not a bare ``except``) so KeyboardInterrupt and
        # SystemExit still propagate, and chain the original error so the
        # traceback shows why the connection failed.
        raise ValueError("error while connecting with db") from exc
    print("connection successful!")
    return session

# Create the Snowpark session from the connection parameters built above.
session = snowflake_connector(connection_parameters)

connection successful!


In [40]:
# Load the funds data from a local CSV file. The commented-out line below reads
# the MEMBER_FUNDS_ENRICHED_DETAILS table through the Snowpark session instead.
# df = session.table("MEMBER_FUNDS_ENRICHED_DETAILS").to_pandas()
df = pd.read_csv("/data/funds.csv")

In [24]:
# df.to_csv("/data/funds.csv", index=False)

In [41]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,MEMBER_ID,MEMBER_NAME,MEMBER_EMPLOYMENT,MEMBER_GENDER,MEMBER_CITY_TOWN,MEMBER_STATE,MEMBER_CONTACT_VERIFIED,FUND_ID,CHURN_REASON,LATITUDE,...,MEMBER_AGE,INVESTMENT_AGE_GROUP,RETIREMENT_AGE,RETIREMENT_AGE_GROUP,TOTAL_FUNDS_INVESTED,NO_OF_CHURN,MEMBER_DOB_DT,ALLOCATION_DT,CHURN_DT,RETIREMENT_DT
0,MID000001,Vallie Bachman,The University of Sydney,Female,Hobart,Tasmania,N,FID000106,NOT APPLICABLE,-42.8806,...,44,Middle-aged,21,MORE_THAN_10_YEAR,1,,1980-01-08,2004-11-18,9999-12-31,2045-01-08
1,MID000028,Jani Herritt,Murdoch University,Female,Sydney,New South Wales,Y,FID000137,NOT APPLICABLE,-33.8678,...,52,Senior,13,MORE_THAN_10_YEAR,1,,1972-02-27,2001-07-08,9999-12-31,2037-02-27
2,MID000044,Jani Rulapaugh,Torrens University Australia,Male,Melbourne,Victoria,Y,FID000050,NOT APPLICABLE,-37.8142,...,43,Middle-aged,22,MORE_THAN_10_YEAR,1,,1981-02-17,2002-12-10,9999-12-31,2046-02-17
3,MID000089,Alease Hollack,University of New England,Male,Darwin,Northern Territory,N,FID000186,NOT APPLICABLE,-12.4381,...,43,Middle-aged,22,MORE_THAN_10_YEAR,1,,1981-12-19,2002-08-13,9999-12-31,2046-12-19
4,MID000098,Devorah Whobrey,The University of Notre Dame Australia,Female,Darwin,Northern Territory,Y,FID000184,NOT APPLICABLE,-12.4381,...,59,Senior,6,IN_NEXT_5_YEAR,1,,1965-12-07,2005-08-05,9999-12-31,2030-12-07


In [42]:
# Fraction of missing values per column (the mean of the boolean null mask).
df.isna().mean()

MEMBER_ID                                           0.000000
MEMBER_NAME                                         0.000000
MEMBER_EMPLOYMENT                                   0.000000
MEMBER_GENDER                                       0.000000
MEMBER_CITY_TOWN                                    0.000000
MEMBER_STATE                                        0.000000
MEMBER_CONTACT_VERIFIED                             0.000000
FUND_ID                                             0.000000
CHURN_REASON                                        0.000000
LATITUDE                                            0.000000
LONGITUDE                                           0.000000
COUNTRY                                             0.000000
COUNTRYCODE                                         0.000000
CAPITAL                                             0.000000
POPULATION                                          0.000000
FUND_TOTAL_ASSETS                                   0.000000
FUND_RETURN_TARGET_PERCE

The NO_OF_CHURN column has almost 50% missing values. Imputing that many values would not be wise, so the column is simply dropped.

In [86]:
# Columns kept for the churn model, with the CHURN_FLAG target listed last.
selected_columns = [
    "MEMBER_GENDER",
    "MEMBER_STATE",
    "MEMBER_CONTACT_VERIFIED",
    "FUND_TOTAL_ASSETS",
    "FUND_RETURN_TARGET_PERCENTAGE",
    "INVESTMENT_RISK_CATEGORY",
    "CASH_BENCHMARK_ALLOCATION",
    "FIXED_INCOME_BENCHMARK_ALLOCATION",
    "DOMESTIC_LISTED_EQUITY_BENCHMARK_ALLOCATION",
    "INTERNATIONAL_LISTED_EQUITY_BENCHMARK_ALLOCATION",
    "UNLISTED_EQUITY_BENCHMARK_ALLOCATION",
    "EQUITY_BENCHMARK_ALLOCATION",
    "PROPERTY_BENCHMARK_ALLOCATION",
    "INFRA_BENCHMARK_ALLOCATION",
    "COMMODITIES_BENCHMARK_ALLOCATION",
    "OTHERS_BENCHMARK_ALLOCATION",
    "FUND_RISK_LEVEL",
    "FUND_RISK_CATEGORY",
    "NEG_NETRETURN_SINCE_INCEPTION",
    "YEAR_1_RETURNS",
    "YEAR_3_RETURNS",
    "YEAR_5_RETURNS",
    "YEAR_7_RETURNS",
    "YEAR_10_RETURNS",
    "SUPER_FEES",
    "PENSION_FEES",
    "INVESTMENT_AGE_GROUP",
    "RETIREMENT_AGE_GROUP",
    "TOTAL_FUNDS_INVESTED",
    "CHURN_FLAG",
]

# Take a copy so later edits to `frame` never write through to `df`.
frame = df.loc[:, selected_columns].copy()

In [87]:
# Encode the target as 1 for "Y" and 0 for anything else. The raw flag is read
# from `df` rather than from `frame` itself so the cell is idempotent:
# re-running it on an already-encoded 0/1 column would turn every label into 0.
# `frame` was copied from `df`, so the two share the same index and align.
frame["CHURN_FLAG"] = df["CHURN_FLAG"].eq("Y").astype("int64")

In [82]:
# frame.groupby(["RETIREMENT_AGE_GROUP","CHURN_FLAG"])[["CHURN_FLAG"]].count()

The distribution of CHURN is much the same across employers, so the employer column won't contribute to the modelling.

In [76]:
# multiple = {i for i, j in dict(df["MEMBER_ID"].value_counts()).items() if j > 1 }
# multiple[multiple["value_counts"]>=2].index.to_list()

# MODEL TO PREDICT CHURN

In [94]:
# Continuous inputs: fund size, target return, benchmark allocations,
# multi-year returns and fees.
numeric_features = [
    "FUND_TOTAL_ASSETS",
    "FUND_RETURN_TARGET_PERCENTAGE",
    "CASH_BENCHMARK_ALLOCATION",
    "FIXED_INCOME_BENCHMARK_ALLOCATION",
    "DOMESTIC_LISTED_EQUITY_BENCHMARK_ALLOCATION",
    "INTERNATIONAL_LISTED_EQUITY_BENCHMARK_ALLOCATION",
    "UNLISTED_EQUITY_BENCHMARK_ALLOCATION",
    "EQUITY_BENCHMARK_ALLOCATION",
    "PROPERTY_BENCHMARK_ALLOCATION",
    "INFRA_BENCHMARK_ALLOCATION",
    "COMMODITIES_BENCHMARK_ALLOCATION",
    "OTHERS_BENCHMARK_ALLOCATION",
    "YEAR_1_RETURNS",
    "YEAR_3_RETURNS",
    "YEAR_5_RETURNS",
    "YEAR_7_RETURNS",
    "YEAR_10_RETURNS",
    "SUPER_FEES",
    "PENSION_FEES",
]
# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Inputs that are one-hot encoded (TOTAL_FUNDS_INVESTED is handled as a category here).
categorical_features = [
    "MEMBER_GENDER",
    "MEMBER_STATE",
    "MEMBER_CONTACT_VERIFIED",
    "INVESTMENT_RISK_CATEGORY",
    "FUND_RISK_LEVEL",
    "FUND_RISK_CATEGORY",
    "NEG_NETRETURN_SINCE_INCEPTION",
    "INVESTMENT_AGE_GROUP",
    "RETIREMENT_AGE_GROUP",
    "TOTAL_FUNDS_INVESTED",
]
# Categorical branch: one-hot encode (categories unseen during fit are ignored
# at transform time), then keep the top 50% of encoded columns ranked by chi2.
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [96]:
# Full modelling pipeline: the shared preprocessing followed by a random forest.
# random_state is pinned so the fitted forest does not depend on how much of
# the global NumPy RNG stream earlier cells happened to consume.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        (
            "classifier",
            RandomForestClassifier(
                n_estimators=500,
                max_depth=8,
                max_features=0.6,
                bootstrap=True,
                max_samples=0.8,
                random_state=0,
            ),
        ),
    ]
)



In [98]:
# Separate the CHURN_FLAG target from the predictor columns.
y = frame["CHURN_FLAG"]
X = frame.drop(columns=["CHURN_FLAG"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)


In [99]:
# Fit on the training split, then report the pipeline's score on the held-out split.
clf.fit(X_train, y_train)
test_score = clf.score(X_test, y_test)
print(f"model score: {test_score:.3f}")

model score: 0.986


In [101]:
import pickle

# Persist the fitted pipeline to disk; open the file with "rb" when reading it back.
with open("churn_model.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [102]:
# Display the pipeline object.
clf

In [103]:
# Predict churn for every row. Scoring `X` (the feature frame derived from
# `frame` before training) keeps this cell re-runnable: `frame` is rebound to
# the prediction array here, so predicting from `frame` a second time would
# fail. The preprocessor selects columns by name, so the extra CHURN_FLAG
# column in the original `frame` never influenced the predictions.
frame = clf.predict(X)

In [105]:
# `frame` holds the model's predictions at this point, not the feature table.
df["CHURN_PREDICTED"] = frame

In [111]:
# Keep only the members whose recorded CHURN_FLAG is "Y".
is_churned = df["CHURN_FLAG"].eq("Y")
churned = df.loc[is_churned]

In [112]:
# Show the churned members alongside the CHURN_PREDICTED column.
churned

Unnamed: 0,MEMBER_ID,MEMBER_NAME,MEMBER_EMPLOYMENT,MEMBER_GENDER,MEMBER_CITY_TOWN,MEMBER_STATE,MEMBER_CONTACT_VERIFIED,FUND_ID,CHURN_REASON,LATITUDE,...,INVESTMENT_AGE_GROUP,RETIREMENT_AGE,RETIREMENT_AGE_GROUP,TOTAL_FUNDS_INVESTED,NO_OF_CHURN,MEMBER_DOB_DT,ALLOCATION_DT,CHURN_DT,RETIREMENT_DT,CHURN_PREDICTED
10,MID000117,Vi Auber,Australian Catholic University,Female,Adelaide,South Australia,N,FID000075,NO REASON IDENTIFIED,-34.9275,...,Middle-aged,28,MORE_THAN_10_YEAR,2,1.0,1987-12-30,2002-08-07,2009-12-04,2052-12-30,1
11,MID000119,Stephane Julia,Southern Cross University,Male,Adelaide,South Australia,N,FID000162,HIGH ACCOUNT FEE,-34.9275,...,Middle-aged,30,MORE_THAN_10_YEAR,2,1.0,1989-03-27,2004-03-13,2007-07-07,2054-03-27,1
12,MID000157,Salena Baltimore,Macquarie University,Male,Melbourne,Victoria,N,FID000006,HIGH TRANSACTION FEE,-37.8142,...,Middle-aged,19,MORE_THAN_10_YEAR,2,1.0,1978-06-26,2005-04-23,2008-06-08,2043-06-26,1
176,MID000002,Jade Strassner,Curtin University,Female,Gladstone,Queensland,N,FID000038,LIFE EVENT,-23.8427,...,Middle-aged,27,MORE_THAN_10_YEAR,2,1.0,1986-11-19,2000-09-09,2010-02-07,2051-11-19,1
177,MID000004,Serina Leto,Griffith University,Female,Paynesville,Victoria,Y,FID000145,HIGH ACCOUNT FEE,-37.9167,...,Middle-aged,25,MORE_THAN_10_YEAR,2,1.0,1984-03-08,2006-02-17,2007-10-10,2049-03-08,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335994,MID204905,Solange Ennaco,La Trobe University,Female,Goonellabah,New South Wales,Y,FID000075,NO REASON IDENTIFIED,-28.8167,...,Senior,8,IN_NEXT_10_YEAR,2,1.0,1967-12-25,2002-08-03,2006-11-19,2032-12-25,1
335995,MID204908,Rosio Degonia,Central Queensland University,Female,North Mackay,Queensland,Y,FID000013,LIFE EVENT,-21.1216,...,Young,37,MORE_THAN_10_YEAR,2,1.0,1996-12-13,2001-07-13,2007-04-22,2061-12-13,1
335996,MID204912,Eladia Maynerich,Bond University,Male,Whyalla,South Australia,N,FID000106,HIGH TRANSACTION FEE,-33.0333,...,Middle-aged,29,MORE_THAN_10_YEAR,2,1.0,1988-08-16,2004-12-01,2007-01-25,2053-08-16,1
335997,MID204913,Leslie Pontoriero,University of Canberra,Female,Walgett,New South Wales,Y,FID000013,LIFE EVENT,-30.0167,...,Middle-aged,21,MORE_THAN_10_YEAR,2,1.0,1980-03-27,2001-07-29,2010-01-06,2045-03-27,1
