In [1]:
#Snowpark lib
from snowflake.snowpark import Session

# Data Science Libs
import numpy as np
import pandas as pd

# create_temp_table warning suppresion
import warnings; warnings.simplefilter('ignore')

#ConfigParser to read ini file
import configparser

import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

np.random.seed(0)

config = configparser.ConfigParser()
config.read("/notebooks/notebooks/credentials.ini")

connection_parameters = {
    "user": f'{config["Snowflake"]["user"]}',
    "password": f'{config["Snowflake"]["password"]}',
    "account": f'{config["Snowflake"]["account"]}',
    "WAREHOUSE": f'{config["Snowflake"]["WAREHOUSE"]}',
    "DATABASE": f'{config["Snowflake"]["DATABASE"]}',
    "SCHEMA": f'{config["Snowflake"]["SCHEMA"]}'
}

def snowflake_connector(conn):
    try:
        session = Session.builder.configs(conn).create()
        print("connection successful!")
    except:
        raise ValueError("error while connecting with db")
    return session

session = snowflake_connector(connection_parameters)

connection successful!


In [2]:
# df = session.table("MEMBER_FUNDS_ENRICHED_DETAILS").to_pandas()
df = pd.read_csv("/data/funds.csv")

In [3]:
# df.to_csv("/data/funds.csv", index=False)

In [4]:
df.head()

Unnamed: 0,MEMBER_ID,MEMBER_NAME,MEMBER_EMPLOYMENT,MEMBER_GENDER,MEMBER_CITY_TOWN,MEMBER_STATE,MEMBER_CONTACT_VERIFIED,FUND_ID,CHURN_REASON,LATITUDE,...,MEMBER_AGE,INVESTMENT_AGE_GROUP,RETIREMENT_AGE,RETIREMENT_AGE_GROUP,TOTAL_FUNDS_INVESTED,NO_OF_CHURN,MEMBER_DOB_DT,ALLOCATION_DT,CHURN_DT,RETIREMENT_DT
0,MID108406,Maile Treston,Charles Darwin University,Male,Lakes Entrance,Victoria,Y,FID000044,NOT APPLICABLE,-37.8667,...,52,Senior,13,MORE_THAN_10_YEAR,3,2.0,1972-04-29,2011-02-25,9999-12-31,2037-04-29
1,MID108415,Annabelle Cryer,RMIT University,Female,Mooroopna,Victoria,N,FID000016,NOT APPLICABLE,-36.3833,...,26,Young,39,MORE_THAN_10_YEAR,3,2.0,1998-10-05,2010-01-04,9999-12-31,2063-10-05
2,MID108429,Leonora Acuff,University of South Australia,Male,Jindabyne,New South Wales,N,FID000100,NOT APPLICABLE,-36.4,...,51,Senior,14,MORE_THAN_10_YEAR,3,2.0,1973-12-20,2010-09-04,9999-12-31,2038-12-20
3,MID108436,Franklyn Gesick,The University of Newcastle,Male,Wingham,New South Wales,N,FID000012,NOT APPLICABLE,-31.8667,...,41,Middle-aged,24,MORE_THAN_10_YEAR,3,2.0,1983-03-14,2013-04-27,9999-12-31,2048-03-14
4,MID108438,Amber Gillaspie,University of South Australia,Female,Nathalia,Victoria,Y,FID000035,NOT APPLICABLE,-36.0583,...,40,Middle-aged,25,MORE_THAN_10_YEAR,3,2.0,1984-01-06,2012-05-17,9999-12-31,2049-01-06


In [5]:
df.isna().sum() / df.shape[0]

MEMBER_ID                                           0.000000
MEMBER_NAME                                         0.000000
MEMBER_EMPLOYMENT                                   0.000000
MEMBER_GENDER                                       0.000000
MEMBER_CITY_TOWN                                    0.000000
MEMBER_STATE                                        0.000000
MEMBER_CONTACT_VERIFIED                             0.000000
FUND_ID                                             0.000000
CHURN_REASON                                        0.000000
LATITUDE                                            0.000000
LONGITUDE                                           0.000000
COUNTRY                                             0.000000
COUNTRYCODE                                         0.000000
CAPITAL                                             0.000000
POPULATION                                          0.000000
FUND_TOTAL_ASSETS                                   0.000000
FUND_RETURN_TARGET_PERCE

NO_OF_CHURN column has almost 50% missing value, it wont be wise to impute these many values so will just drop it.                                     

In [6]:
frame = df[['MEMBER_GENDER', 'MEMBER_STATE', 'MEMBER_CONTACT_VERIFIED','FUND_TOTAL_ASSETS','FUND_RETURN_TARGET_PERCENTAGE',
            'INVESTMENT_RISK_CATEGORY', 'CASH_BENCHMARK_ALLOCATION','FIXED_INCOME_BENCHMARK_ALLOCATION',
            'DOMESTIC_LISTED_EQUITY_BENCHMARK_ALLOCATION','INTERNATIONAL_LISTED_EQUITY_BENCHMARK_ALLOCATION',
            'UNLISTED_EQUITY_BENCHMARK_ALLOCATION', 'EQUITY_BENCHMARK_ALLOCATION','PROPERTY_BENCHMARK_ALLOCATION',
            'INFRA_BENCHMARK_ALLOCATION','COMMODITIES_BENCHMARK_ALLOCATION', 'OTHERS_BENCHMARK_ALLOCATION','FUND_RISK_LEVEL', 
            'FUND_RISK_CATEGORY','NEG_NETRETURN_SINCE_INCEPTION', 'YEAR_1_RETURNS', 'YEAR_3_RETURNS','YEAR_5_RETURNS',
            'YEAR_7_RETURNS','YEAR_10_RETURNS', 'SUPER_FEES','PENSION_FEES',
            'INVESTMENT_AGE_GROUP', 'RETIREMENT_AGE_GROUP', 'TOTAL_FUNDS_INVESTED','CHURN_FLAG']].copy()

In [7]:
frame["CHURN_FLAG"] = frame["CHURN_FLAG"].apply(lambda x: 1 if x =="Y" else 0)

In [82]:
# frame.groupby(["RETIREMENT_AGE_GROUP","CHURN_FLAG"])[["CHURN_FLAG"]].count()

The distribution of CHURN is pretty common bettween the employers and hence wont contribute inn modelling

In [76]:
# multiple = {i for i, j in dict(df["MEMBER_ID"].value_counts()).items() if j > 1 }
# multiple[multiple["value_counts"]>=2].index.to_list()

# MODEL TO PREDICT CHURN

In [8]:
numeric_features = ["FUND_TOTAL_ASSETS", 'FUND_RETURN_TARGET_PERCENTAGE','CASH_BENCHMARK_ALLOCATION',
       'FIXED_INCOME_BENCHMARK_ALLOCATION','DOMESTIC_LISTED_EQUITY_BENCHMARK_ALLOCATION','INTERNATIONAL_LISTED_EQUITY_BENCHMARK_ALLOCATION',
       'UNLISTED_EQUITY_BENCHMARK_ALLOCATION', 'EQUITY_BENCHMARK_ALLOCATION','PROPERTY_BENCHMARK_ALLOCATION', 'INFRA_BENCHMARK_ALLOCATION',
       'COMMODITIES_BENCHMARK_ALLOCATION', 'OTHERS_BENCHMARK_ALLOCATION', 'YEAR_1_RETURNS', 'YEAR_3_RETURNS',
       'YEAR_5_RETURNS', 'YEAR_7_RETURNS', 'YEAR_10_RETURNS', 'SUPER_FEES','PENSION_FEES',]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["MEMBER_GENDER", "MEMBER_STATE", "MEMBER_CONTACT_VERIFIED","INVESTMENT_RISK_CATEGORY",'FUND_RISK_LEVEL',
                        'FUND_RISK_CATEGORY',"NEG_NETRETURN_SINCE_INCEPTION", 'INVESTMENT_AGE_GROUP', 'RETIREMENT_AGE_GROUP', "TOTAL_FUNDS_INVESTED"]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [9]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", RandomForestClassifier(n_estimators=500, max_depth=8, 
                                                                                 max_features=0.6,
                                                                                 bootstrap=True, max_samples=0.8))]
)



In [10]:
X = frame.drop("CHURN_FLAG", axis=1)
y = frame["CHURN_FLAG"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [11]:
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

model score: 0.984


In [12]:
import pickle

# save
with open('churn_model.pkl','wb') as f:  ## use rb while reading the fie
    pickle.dump(clf,f)

In [13]:
clf

In [14]:
frame = clf.predict(frame)

In [15]:
df["CHURN_PREDICTED"] = frame

In [16]:
churned = df[df["CHURN_FLAG"] == "Y"]

In [17]:
churned.groupby(["CHURN_REASON"])[["CHURN_REASON"]].count() / churned.shape[0]*100

Unnamed: 0_level_0,CHURN_REASON
CHURN_REASON,Unnamed: 1_level_1
FUND REPUTATION DECLINING,29.070673
HIGH ACCOUNT FEE,16.143163
HIGH TRANSACTION FEE,21.015857
LIFE EVENT,5.547624
NO REASON IDENTIFIED,25.771735
POOR FUND PERFORMANCE,2.450949


In [18]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

In [19]:
frame = churned[['MEMBER_GENDER', 'MEMBER_STATE', 'MEMBER_CONTACT_VERIFIED','FUND_TOTAL_ASSETS','FUND_RETURN_TARGET_PERCENTAGE',
            'INVESTMENT_RISK_CATEGORY', 'CASH_BENCHMARK_ALLOCATION','FIXED_INCOME_BENCHMARK_ALLOCATION',
            'DOMESTIC_LISTED_EQUITY_BENCHMARK_ALLOCATION','INTERNATIONAL_LISTED_EQUITY_BENCHMARK_ALLOCATION',
            'UNLISTED_EQUITY_BENCHMARK_ALLOCATION', 'EQUITY_BENCHMARK_ALLOCATION','PROPERTY_BENCHMARK_ALLOCATION',
            'INFRA_BENCHMARK_ALLOCATION','COMMODITIES_BENCHMARK_ALLOCATION', 'OTHERS_BENCHMARK_ALLOCATION','FUND_RISK_LEVEL', 
            'FUND_RISK_CATEGORY','NEG_NETRETURN_SINCE_INCEPTION', 'YEAR_1_RETURNS', 'YEAR_3_RETURNS','YEAR_5_RETURNS',
            'YEAR_7_RETURNS','YEAR_10_RETURNS', 'SUPER_FEES','PENSION_FEES',
            'INVESTMENT_AGE_GROUP', 'RETIREMENT_AGE_GROUP', 'TOTAL_FUNDS_INVESTED',"CHURN_REASON"]].copy()

In [20]:
churn_reason_dic = {reason:i for i, reason in enumerate(churned["CHURN_REASON"].unique())}
frame["CHURN_REASON"] = frame["CHURN_REASON"].apply(lambda x: churn_reason_dic[x])

In [21]:
frame

Unnamed: 0,MEMBER_GENDER,MEMBER_STATE,MEMBER_CONTACT_VERIFIED,FUND_TOTAL_ASSETS,FUND_RETURN_TARGET_PERCENTAGE,INVESTMENT_RISK_CATEGORY,CASH_BENCHMARK_ALLOCATION,FIXED_INCOME_BENCHMARK_ALLOCATION,DOMESTIC_LISTED_EQUITY_BENCHMARK_ALLOCATION,INTERNATIONAL_LISTED_EQUITY_BENCHMARK_ALLOCATION,...,YEAR_3_RETURNS,YEAR_5_RETURNS,YEAR_7_RETURNS,YEAR_10_RETURNS,SUPER_FEES,PENSION_FEES,INVESTMENT_AGE_GROUP,RETIREMENT_AGE_GROUP,TOTAL_FUNDS_INVESTED,CHURN_REASON
2168,Male,Tasmania,Y,130360.8,2.700000,High,2.000000,35.064286,16.535714,27.935714,...,4.55,5.78,6.50,7.46,0.56,0.60,Young,MORE_THAN_10_YEAR,3,0
2169,Female,Tasmania,N,80861.9,4.000000,Very Low,2.000000,14.000000,17.000000,25.000000,...,2.14,1.59,1.77,1.70,0.07,0.06,Middle-aged,MORE_THAN_10_YEAR,3,1
2170,Female,Queensland,Y,30225.2,3.804286,Medium-Low,10.000000,10.000000,10.000000,10.000000,...,4.74,4.57,5.06,5.49,0.51,0.49,Middle-aged,MORE_THAN_10_YEAR,3,2
2171,Female,Tasmania,Y,5866.0,3.247619,Medium-Low,4.047619,9.761905,24.523810,25.190476,...,4.16,4.46,5.05,5.34,0.50,0.46,Middle-aged,MORE_THAN_10_YEAR,3,0
2172,Female,Tasmania,N,10335.9,3.430000,Medium-High,3.720000,11.280000,28.836000,26.280000,...,4.31,4.07,5.21,5.65,0.53,0.44,Senior,IN_NEXT_10_YEAR,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
335912,Male,New South Wales,Y,1807.0,2.730000,Medium-High,7.000000,13.000000,28.000000,27.000000,...,4.26,4.54,5.09,5.58,0.51,0.44,Middle-aged,MORE_THAN_10_YEAR,2,3
335995,Male,Queensland,N,37070.7,4.453636,Medium-High,5.000000,13.000000,27.000000,27.000000,...,4.57,4.66,5.18,5.38,0.50,0.44,Senior,MORE_THAN_10_YEAR,2,3
335996,Male,Queensland,N,2665.3,3.500000,Medium-Low,13.000000,13.000000,17.000000,23.000000,...,4.13,4.80,5.24,5.38,0.50,0.50,Middle-aged,MORE_THAN_10_YEAR,2,0
335997,Male,New South Wales,N,60008.7,3.928571,Medium-High,10.000000,10.000000,10.000000,10.000000,...,4.78,4.03,5.21,5.52,0.52,0.40,Middle-aged,MORE_THAN_10_YEAR,2,3


In [22]:
numeric_features = ["FUND_TOTAL_ASSETS", 'FUND_RETURN_TARGET_PERCENTAGE','CASH_BENCHMARK_ALLOCATION',
       'FIXED_INCOME_BENCHMARK_ALLOCATION','DOMESTIC_LISTED_EQUITY_BENCHMARK_ALLOCATION','INTERNATIONAL_LISTED_EQUITY_BENCHMARK_ALLOCATION',
       'UNLISTED_EQUITY_BENCHMARK_ALLOCATION', 'EQUITY_BENCHMARK_ALLOCATION','PROPERTY_BENCHMARK_ALLOCATION', 'INFRA_BENCHMARK_ALLOCATION',
       'COMMODITIES_BENCHMARK_ALLOCATION', 'OTHERS_BENCHMARK_ALLOCATION', 'YEAR_1_RETURNS', 'YEAR_3_RETURNS',
       'YEAR_5_RETURNS', 'YEAR_7_RETURNS', 'YEAR_10_RETURNS', 'SUPER_FEES','PENSION_FEES',]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["MEMBER_GENDER", "MEMBER_STATE", "MEMBER_CONTACT_VERIFIED","INVESTMENT_RISK_CATEGORY",'FUND_RISK_LEVEL',
                        'FUND_RISK_CATEGORY',"NEG_NETRETURN_SINCE_INCEPTION", 'INVESTMENT_AGE_GROUP', 'RETIREMENT_AGE_GROUP', "TOTAL_FUNDS_INVESTED"]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("selector", SelectPercentile(chi2, percentile=50)),
    ]
)
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [23]:
multi_lable_clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=500, max_depth=8, max_features=0.6, bootstrap=True,
                                                             max_samples=0.8))
clf2 = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", multi_lable_clf)]
)




In [24]:
X = frame.drop("CHURN_REASON", axis=1)
y = frame["CHURN_REASON"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [25]:
clf2.fit(X_train, y_train)
print("model score: %.3f" % clf2.score(X_test, y_test))

model score: 1.000


In [26]:
import pickle

# save
with open('reason_model.pkl','wb') as f:  ## use rb while reading the fie
    pickle.dump(clf2,f)

# FUND RECOMMENDATION

In [59]:
fund_allocation = session.table("FUND_ALLOCATION_MASTER").to_pandas()
fund_master = session.table("FUND_MASTER").to_pandas()
fund_performance = session.table("FUND_PERFORMANCE_MASTER").to_pandas()

In [60]:
fund = pd.merge(fund_allocation, fund_master, on="FUND_ID")
fund_total = pd.merge(fund, fund_performance, on="FUND_ID")
fund_charge = [for i in fund_total["FUND_FEES_CHARGES"] if i != 'Not Available']
avg_fund_charge = sum(fund_charge) / len(fund_charge)

fund_total["FUND_FEES_CHARGES"] = fund_total["FUND_FEES_CHARGES"].apply(lambda x: avg_fund_charge if x )
fund_total["FUND_FEES_CHARGES"] = fund_total["FUND_FEES_CHARGES"].astype(float) 

ValueError: could not convert string to float: 'Not Available'

In [61]:
fund_total["FUND_FEES_CHARGES"].unique()

array(['545.8148148', '435.6281481', '370.7115385', '309.0740741',
       'Not Available', '626.2857143', '521.4210526', '523.5882353',
       '822.7142857', '408', '415.1166667', '488.8', '559.9428571',
       '543.2142857', '531.9166667', '614.1785714', '614.8571429',
       '614.0714286', '377.7777778', '641.3571429', '472.6785714',
       '687.8', '562.3809524', '400.9132143', '589.7352941',
       '522.5357143', '572.15', '725.9444444', '525.1555556',
       '364.4642857', '498.4285714', '521.0151515', '672.736', '541.36',
       '792.7777778', '555', '514.2857143', '620.8571429', '617.2857143',
       '596.4074074', '572.4642857', '570.7857143', '434.0315789',
       '471.2757143', '603.1928571', '657.9384615', '522.0740741',
       '472.5555556', '661.8', '548.7884615', '348.1481481',
       '491.3214286', '472.4385185', '617.262963', '592.6153846',
       '544.8470588', '571.4166667', '681.8571429', '530', '529.8214286',
       '585.0582143', '670.787037', '1007.64', '618.58333

In [29]:
fund_meta = fund_total[["FUND_ID","FUNDNAME"]]
fund_cluster = fund_total.drop(["FUND_ID","FUNDNAME","FUND_NAME_y","FUND_TRUSTEE","FUND_INCEPTION_DATE"], axis=1)

In [30]:
from sklearn.cluster import DBSCAN

# X = np.array([[1, 2], [2, 2], [2, 3],
#               [8, 7], [8, 8], [25, 80]])
# clustering = DBSCAN(eps=3, min_samples=2).fit(X)
# clustering.labels_

In [31]:
fund_data = pd.get_dummies(fund_cluster)

In [32]:
fund_data = fund_data.replace(False, 0)
fund_data = fund_data.replace(True, 1)

In [33]:
clustering = DBSCAN(eps=2, min_samples=4).fit(fund_data)
# clustering.labels_

In [35]:
fund_total["cluster"] = clustering.labels_

In [54]:
def recommendation(fund_id, reason):
    if reason == 'NO REASON IDENTIFIED' or reason == "NOT APPLICABLE":
        return None
    elif reason == 'HIGH ACCOUNT FEE' or reason == 'HIGH TRANSACTION FEE':
            clus_frame = fund_total[fund_total["FUND_ID"] == fund_id]
            clus = list(clus_frame["cluster"])[0]
            print(clus)
            avg_charge["charge"] = clus_frame[["SUPER_FEES","PENSION_FEES","FUND_FEES_CHARGES"]].apply(lambda x: sum(x[0]+x[1]+x[2])/3, axis=1)
            total_charge = list(avg_charge["charge"])[0]
            group = fund_total[fund_total["cluster"] == clus][["FUND_ID","SUPER_FEES","PENSION_FEES","FUND_FEES_CHARGES"]]
            group["avg_fee"] = group[["SUPER_FEES","PENSION_FEES","FUND_FEES_CHARGES"]].apply(lambda x: (x[0]+x[1]+x[2])/3, axis=1)
            group["rec"] = group["avg_fee"].apply(lambda x: 1 if x <= total_charge else 0)
            recommendations = list(group[group["rec"] == 1]["FUND_ID"])
            recommendations.remove(fund_id)
            
            return recommendations
            
            
            
recommendation("FID000034","HIGH ACCOUNT FEE") 
    

-1


TypeError: unsupported operand type(s) for +: 'float' and 'str'

In [57]:
fund_total[fund_total["cluster"] == 0]["FUND_FEES_CHARGES"]

4    370.7115385
5    370.7115385
6    370.7115385
7    370.7115385
8    370.7115385
Name: FUND_FEES_CHARGES, dtype: object

In [39]:
df["CHURN_REASON"].unique()

array(['NOT APPLICABLE', 'NO REASON IDENTIFIED', 'HIGH ACCOUNT FEE',
       'HIGH TRANSACTION FEE', 'FUND REPUTATION DECLINING', 'LIFE EVENT',
       'POOR FUND PERFORMANCE'], dtype=object)

In [56]:
churned[["FUND_ID","CHURN_REASON",""]]

KeyError: "['FUND_FEES_CHARGES'] not in index"