## Siamese Neural Networks 
### for Supervised Clustering of High Dimensional Spaces

In [1]:
!python -V
!which python

Python 3.8.5
/Users/seanmacrae/mobius/.venv/bin/python


In [2]:
"""Main train job for inspirato"""
# Public PyPi
from datetime import date, datetime
import argparse
import joblib
import logging
import numpy as np
import os
import pandas as pd
import random
import shutil
import timeit
import yaml
import warnings
from jinjasql import JinjaSql
warnings.filterwarnings('ignore')
pd.options.display.max_rows = 50
pd.options.display.max_columns = 999

# Windfall PyPi
import loaderbot.big_query as bq
import loaderbot.cloud_storage as cs
from google.cloud import bigquery, storage
from loaderbot.utils import make_and_clean_up_directory
from roc_it.ml.binary_classification import BinaryClassification
from zoolander.ptg.utils import normalize_to_ptg_range
from zoolander.feature_evaluation import ModelEvaluation
from zoolander.data_processing import windfall_train_test_split
from zoolander.shap import TreeShap

# GCP clients, both bound to the same project id.
# `bigquery_client` is the client passed to `bq.query_table` in `data_prep`.
bigquery_client = bigquery.Client(project="tranquil-garage-139216")
storage_client = storage.Client(project="tranquil-garage-139216")

In [3]:
# Name of the label to model; selects which config module is bound to `config`.
LABEL_NAME = "label_lux_goods"
import config.label_lux_goods as config_label_lux_goods
import config.label_education_donor as config_label_education_donor

# Supported labels and their config modules.
_LABEL_CONFIGS = {
    "label_lux_goods": config_label_lux_goods,
    "label_education_donor": config_label_education_donor,
}
# Fail here with a clear message instead of leaving `config` undefined and
# hitting a NameError in a later cell.
if LABEL_NAME not in _LABEL_CONFIGS:
    raise ValueError(
        f"Unknown LABEL_NAME {LABEL_NAME!r}; expected one of {sorted(_LABEL_CONFIGS)}"
    )
config = _LABEL_CONFIGS[LABEL_NAME]
config.params

{'target': 'label_lux_goods',
 'nw_filter': 5.5,
 'ideal_size': 279943,
 'non_ideal_size': 88191706,
 'class_balance': 2}

In [4]:
# !pip install jinjasql

In [5]:
# !pwd

In [6]:
import pandas as pd
from jinjasql import JinjaSql

def data_prep(config, load_from_bq=False, print_sql=False):
    """Render the training query and load the training data.

    The SQL template in ``./train.sql`` is rendered with ``config.params``.
    When ``load_from_bq`` is true the query is run through ``bq.query_table``
    and the result is cached to ``./data/df_<target>.csv``; otherwise the
    cached CSV is read back.

    Args:
        config: object exposing ``params`` (a dict containing ``"target"``)
            and ``include_vars`` (the column names to keep as features).
        load_from_bq: run the query instead of reading the cached CSV.
        print_sql: print the rendered SQL.

    Returns:
        ``(raw_data, exclude_vars)`` where ``exclude_vars`` lists every column
        of ``raw_data`` that is not in ``config.include_vars``, in column order.
    """
    # load sql template; the context manager closes the file even if read() raises
    with open('./train.sql', 'r') as fd:
        sql_template = fd.read()
    # populate sql template with params
    j = JinjaSql(param_style='pyformat')
    query, query_params = j.prepare_query(sql_template, config.params)
    # NOTE(review): the bind values are %-interpolated into the SQL text rather
    # than passed as query parameters; keep `config.params` to trusted values.
    sql = query % query_params
    if print_sql:
        print(f"sql query:\n{sql}")
    label_name = config.params["target"]
    cache_path = f"./data/df_{label_name}.csv"
    if load_from_bq:
        # query training data
        raw_data = bq.query_table(
            sql=sql,
            client=bigquery_client
        )
        # index=False: writing the row index adds a column that comes back as
        # "Unnamed: 0" every time the cache is reloaded
        raw_data.to_csv(cache_path, index=False)
    else:
        raw_data = pd.read_csv(cache_path)
    # exclude variables: everything not explicitly included, in a stable order
    include_vars = set(config.include_vars)
    exclude_vars = [col for col in raw_data.columns if col not in include_vars]
    return raw_data, exclude_vars

# Default arguments: read the cached CSV rather than querying BigQuery.
raw_data, exclude_vars = data_prep(config)
raw_data.head()

Unnamed: 0.1,Unnamed: 0,windfall_id,logNetWorth,logAccreditedNetWorth,recentPurchaseDate,recentDivorceDate,recentDeathDate,isBoatOwner,isPlaneOwner,recentPhilanthropicGiftDate,recentFoundationAssociationDate,isFoundationOfficer,recentFoundationTrusteeDate,hasFoundationAssociation,hasCharityBoardMember,hasCharityOfficer,isMultiPropertyOwner,isRentalPropertyOwner,isHouseholdDebt,logTotalHouseholdDebt,recentSecTransactionAcquired,recentSecTransactionDisposed,recentPoliticalContributionDate,isSmallBusinessOwner,isCharityBoardMember,isCharityOfficer,primaryPropertyLoanToValue,logPrimaryPropertyAvm,hasTrust,politicalDemocrat,politicalRepublican,politicalIndependent,femaleCount,maleCount,minHouseholdAge,avgHouseholdAge,maxHouseholdAge,metroNameRank,propertyCount,isSFROwner,isCondoOwner,isRuralResidenceOwner,logMaxPropertyAvm,logSumPropertyAvm,maxPropertySqft,maxBedrooms,primaryPropertySqft,primaryPropertyYearsOwned,minPropertyYearsOwned,maxPropertyYearsOwned,logMaxDonationAmount,logSumDonationAmount,isArtsCause,isEducationCause,isEnvironmentalCause,isAnimalCause,isHumanServicesCause,isInternationalCause,isSocialBenefitCause,isReligiousCause,logsumArtsCause,logsumEducationCause,logsumEnvironmentalCause,logsumAnimalCause,logsumHealthCause,logsumHumanServicesCause,logsumInternationalCause,logsumSocialBenefitCause,logsumReligiousCause,isCoopDonation,logsum990Donation,logsumCOOPDonation,logsumFECDonation,logsumStateContribution,logmax990Donation,logmaxCOOPDonation,logmaxFECDonation,logmaxStateContribution,countNumCharities,logMaxDonationAmount_1year,logSumDonationAmount_1year,logsumCOOPDonation_1year,logsumFECDonation_1year,logsumStateContribution_1year,countNumCharities_1year,logMaxDonationAmount_3year,logSumDonationAmount_3year,logsum990Donation_3year,logsumCOOPDonation_3year,logsumFECDonation_3year,logsumStateContribution_3year,countNumCharities_3year,logMaxDonationAmount_5year,logSumDonationAmount_5year,logsum990Donation_5year,logsumCOOPDonation_5year,logsumFECDonation_5yea
r,logsumStateContribution_5year,countNumCharities_5year,num_vehicles,num_luxury_vehicles,num_ultra_luxury_vehicles,num_cars,num_trucks,num_suvs,num_vans,metroRank,stateRank,label_lux_athletic,label_lux_flight,label_lux_travel,label_is990Donation,label_isFECContribution,label_isStateContribution,label_isHealthCause,label_wheelsup,account_id,label_tamara_mellon,label_inspirato,label_healthcare_donor,label_education_donor,label_environment_donor,label_lux,label_private_jet,label_alternative_investment,label_insurance,label_small_business,label
0,0,378b6be3b78619298603b48270d76d1d,5.869838,5.736706,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,5.4843,0,0,0,0,0,0,0.609214,5.699531,0,0,0,0,1,1,38.0,38.0,38.0,24,1,1,0,0,5.69953,5.69953,3398.0,4.0,3398.0,16.796715,16.796715,16.796715,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.8748,0.877215,0,0,0,0,0,0,0,,,,,,,,,,,,,0
1,1,edb605945f2e3a9158c755ddc94666f9,5.452112,4.974106,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0.0,5.276464,0,0,0,0,1,1,31.0,49.333333,63.0,100,1,1,0,0,5.276462,5.276462,2312.0,0.0,2312.0,17.519507,17.519507,17.519507,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.790129,0.63108,0,0,0,0,0,0,0,,,,,,,,,,,,,0
2,2,fade072aca072ca2cce01c4649332432,5.654328,5.344198,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,5.176091,0,0,0,0,0,0,0.394471,5.580077,0,0,0,0,0,1,65.0,66.0,67.0,4,1,1,0,0,5.580076,5.580076,3054.0,4.0,3054.0,1.462012,1.462012,1.462012,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,5.0,1.0,0.0,1.0,0.0,4.0,0.0,0.711759,0.771516,0,0,0,0,0,0,0,,,,,,,,,,,,,0
3,3,1dfbcc7cf7f73b4384333ef5e95b9ff6,5.457615,5.054238,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,4.700262,0,0,0,0,0,0,0.224209,5.349612,0,0,0,0,1,1,68.0,73.5,79.0,4,1,1,0,0,5.34961,5.34961,1605.0,3.0,1605.0,15.216975,15.216975,15.216975,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,3.0,0.0,0.0,0.0,0.0,3.0,0.0,0.446712,0.567142,0,0,0,0,0,0,0,,,,,,,,,,,,,0
4,4,a14a2e7f836cd4bc6ebb58fcaf58079b,5.99557,5.775634,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,5.448873,0,0,0,0,0,0,0.41681,5.828936,0,0,0,0,1,1,45.0,45.5,46.0,1,1,1,0,0,5.828935,5.828935,2836.0,4.0,2836.0,11.805613,11.805613,11.805613,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.537568,0.648467,0,0,0,0,0,0,0,,,,,,,,,,,,,0


In [7]:
raw_data.shape

(840098, 128)

In [8]:
from sklearn.model_selection import train_test_split

# Work on a 20k-row random sample (no seed is passed, so it differs per run).
df = raw_data.sample(20_000)
# 80/20 train/validation split, stratified on the label column.
df_train_raw, df_val_raw = train_test_split(
    df,
    test_size=0.20,
    stratify=df["label"])
df_train_raw.shape, df_val_raw.shape

((16000, 128), (4000, 128))

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler


def preprocess_data(data_train, data_val):
    """Impute and min-max scale the train and validation features.

    Drops ``label`` and the notebook-level ``exclude_vars`` columns, fits a
    median imputer and a (0, 1) min-max scaler on the training rows only,
    applies both to each split, and returns two DataFrames with the ``label``
    column re-attached. Both result shapes are printed.
    """
    dropped = ['label'] + exclude_vars
    train_features = data_train.drop(columns=dropped)
    val_features = data_val.drop(columns=dropped)
    feature_names = list(train_features.columns)

    # median imputation; statistics come from the training split
    median_imputer = SimpleImputer(strategy='median')
    train_filled = median_imputer.fit_transform(train_features)
    val_filled = median_imputer.transform(val_features)

    # min-max scaling; ranges come from the (imputed) training split
    minmax = MinMaxScaler(feature_range=(0, 1))
    train_scaled = minmax.fit_transform(train_filled)
    val_scaled = minmax.transform(val_filled)

    def _as_frame(matrix, source):
        # rebuild a DataFrame from the scaled array and re-attach the labels
        frame = pd.DataFrame(matrix, columns=feature_names)
        frame.loc[:, "label"] = source["label"].values
        return frame

    df_train = _as_frame(train_scaled, data_train)
    df_val = _as_frame(val_scaled, data_val)
    print('Training data shape: ', df_train.shape)
    print('Validation data shape: ', df_val.shape)
    return df_train, df_val
# Impute and scale the sampled train/validation splits.
df_train, df_val = preprocess_data(df_train_raw, df_val_raw)
df_train.head()

Training data shape:  (16000, 107)
Validation data shape:  (4000, 107)


Unnamed: 0,logNetWorth,logAccreditedNetWorth,recentPurchaseDate,recentDivorceDate,recentDeathDate,isBoatOwner,isPlaneOwner,recentPhilanthropicGiftDate,recentFoundationAssociationDate,isFoundationOfficer,recentFoundationTrusteeDate,hasFoundationAssociation,hasCharityBoardMember,hasCharityOfficer,isMultiPropertyOwner,isRentalPropertyOwner,isHouseholdDebt,logTotalHouseholdDebt,recentSecTransactionAcquired,recentSecTransactionDisposed,recentPoliticalContributionDate,isSmallBusinessOwner,isCharityBoardMember,isCharityOfficer,primaryPropertyLoanToValue,logPrimaryPropertyAvm,hasTrust,politicalDemocrat,politicalRepublican,politicalIndependent,femaleCount,maleCount,minHouseholdAge,avgHouseholdAge,maxHouseholdAge,metroNameRank,propertyCount,isSFROwner,isCondoOwner,isRuralResidenceOwner,logMaxPropertyAvm,logSumPropertyAvm,maxPropertySqft,maxBedrooms,primaryPropertySqft,primaryPropertyYearsOwned,minPropertyYearsOwned,maxPropertyYearsOwned,logMaxDonationAmount,logSumDonationAmount,isArtsCause,isEducationCause,isEnvironmentalCause,isAnimalCause,isHumanServicesCause,isInternationalCause,isSocialBenefitCause,isReligiousCause,logsumArtsCause,logsumEducationCause,logsumEnvironmentalCause,logsumAnimalCause,logsumHealthCause,logsumHumanServicesCause,logsumInternationalCause,logsumSocialBenefitCause,logsumReligiousCause,isCoopDonation,logsum990Donation,logsumCOOPDonation,logsumFECDonation,logsumStateContribution,logmax990Donation,logmaxCOOPDonation,logmaxFECDonation,logmaxStateContribution,countNumCharities,logMaxDonationAmount_1year,logSumDonationAmount_1year,logsumCOOPDonation_1year,logsumFECDonation_1year,logsumStateContribution_1year,countNumCharities_1year,logMaxDonationAmount_3year,logSumDonationAmount_3year,logsum990Donation_3year,logsumCOOPDonation_3year,logsumFECDonation_3year,logsumStateContribution_3year,countNumCharities_3year,logMaxDonationAmount_5year,logSumDonationAmount_5year,logsum990Donation_5year,logsumCOOPDonation_5year,logsumFECDonation_5year,logsumStateContribution
_5year,countNumCharities_5year,num_vehicles,num_luxury_vehicles,num_ultra_luxury_vehicles,num_cars,num_trucks,num_suvs,num_vans,metroRank,stateRank,label
0,0.502937,0.484518,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.686557,0.0,0.0,0.0,0.0,0.0,0.0,0.3631,0.664966,0.0,1.0,0.0,0.0,0.666667,0.333333,0.154762,0.410714,0.651163,1.0,0.016949,1.0,0.0,0.0,0.470467,0.63321,0.003415,0.000601,0.018297,0.167839,0.138131,0.175194,0.224619,0.350161,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.037495,0.382492,0.0,0.0,0.0,0.0,0.205067,0.0,1.0,0.0,0.373882,0.1873,0.000623,0.0,0.254737,0.218498,0.016243,0.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146591,0.0,0.0,0.0,0.248646,0.270708,0.0,0.350521,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.284905,0.17725,0
1,0.552694,0.528353,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.718174,0.0,0.0,0.0,0.0,0.0,0.0,0.263553,0.715127,0.0,0.0,0.0,0.0,0.666667,1.0,0.095238,0.359524,0.55814,1.0,0.016949,1.0,0.0,0.0,0.536409,0.678254,0.007171,0.000601,0.038421,0.538548,0.538587,0.520847,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037495,0.0,0.000623,0.0,0.040243,0.0,0.016243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146591,0.0,0.0,0.0,0.041263,0.039878,0.0,0.144952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.581489,0.576266,0
2,0.543743,0.513809,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.688887,0.0,0.0,0.0,0.0,0.0,0.0,0.208424,0.699467,0.0,0.0,0.0,0.0,1.0,0.666667,0.035714,0.229167,0.406977,0.030303,0.016949,1.0,0.0,0.0,0.515823,0.664191,0.009076,0.000601,0.048629,0.316675,0.316732,0.306266,0.263655,0.244112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.037495,0.267965,0.000623,0.0,0.040243,0.312601,0.016243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146591,0.0,0.0,0.0,0.041263,0.039878,0.0,0.144952,0.0,0.0,0.0,0.428571,0.2,0.0,0.25,0.142857,0.4,0.0,0.653453,0.727846,0
3,0.522701,0.487626,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.651663,0.0,0.0,0.0,0.0,0.0,0.0,0.178906,0.671128,0.0,0.0,0.0,0.0,0.333333,0.333333,0.047619,0.297619,0.418605,1.0,0.016949,1.0,0.0,0.0,0.478568,0.638743,0.004054,0.0,0.021722,0.294952,0.295011,0.285257,0.224619,0.20797,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.037495,0.0,0.0,0.0,0.0,0.0,0.235866,0.0,1.0,0.0,0.237342,0.0,0.000623,0.0,0.254737,0.0,0.016243,0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146591,0.0,0.0,0.0,0.041263,0.039878,0.0,0.144952,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.2,0.333333,0.261436,0.502362,1
4,0.621942,0.598526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.735429,0.0,0.0,0.0,0.0,0.0,0.0,0.114151,0.780863,0.0,0.0,0.0,0.0,0.333333,0.333333,0.595238,0.60119,0.593023,0.141414,0.016949,1.0,0.0,0.0,0.622826,0.737284,0.007332,0.000802,0.039285,0.474887,0.474931,0.459278,0.260754,0.270393,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.037495,0.0,0.358641,0.0,0.290848,0.0,0.0,0.0,1.0,0.0,0.298334,0.0,0.000623,0.0,0.28993,0.0,0.016243,0.1,0.0,0.0,0.0,0.0,0.0,0.0,0.215169,0.203858,0.0,0.320565,0.0,0.0,0.111111,0.241139,0.233048,0.0,0.316982,0.0,0.0,0.083333,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.563129,0.781813,0


In [10]:
from zoolander.data_processing import run_tree_based_feature_selection
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

# Feature selection with an extra-trees model; the result is used below as the
# training feature frame (presumably restricted to the selected columns).
X_train = run_tree_based_feature_selection(
    X_train = df_train.drop(columns=["label"]),
    y_train = df_train.label,
    model = ExtraTreesClassifier(n_estimators=100),
    max_features= None,
    threshold = None,
)

df_train_reduced = X_train.copy()
df_train_reduced.loc[:, "label"] = df_train["label"].values
# Select the kept columns first and copy only those: this avoids duplicating
# every column of df_val and makes the frame assigned to below an explicit copy.
df_val_reduced = df_val[X_train.columns].copy()
df_val_reduced.loc[:, "label"] = df_val["label"].values
df_train_reduced.head()

Unnamed: 0,logNetWorth,logAccreditedNetWorth,logTotalHouseholdDebt,primaryPropertyLoanToValue,logPrimaryPropertyAvm,femaleCount,maleCount,minHouseholdAge,avgHouseholdAge,maxHouseholdAge,metroNameRank,isCondoOwner,logMaxPropertyAvm,logSumPropertyAvm,maxPropertySqft,maxBedrooms,primaryPropertySqft,primaryPropertyYearsOwned,minPropertyYearsOwned,maxPropertyYearsOwned,logMaxDonationAmount,logSumDonationAmount,isCoopDonation,logsumFECDonation,logmaxCOOPDonation,logmaxFECDonation,logmaxStateContribution,logMaxDonationAmount_1year,logMaxDonationAmount_3year,logSumDonationAmount_3year,logMaxDonationAmount_5year,logSumDonationAmount_5year,logsumFECDonation_5year,metroRank,stateRank,label
0,0.502937,0.484518,0.686557,0.3631,0.664966,0.666667,0.333333,0.154762,0.410714,0.651163,1.0,0.0,0.470467,0.63321,0.003415,0.000601,0.018297,0.167839,0.138131,0.175194,0.224619,0.350161,1.0,0.1873,0.254737,0.218498,0.016243,0.0,0.0,0.0,0.248646,0.270708,0.0,0.284905,0.17725,0
1,0.552694,0.528353,0.718174,0.263553,0.715127,0.666667,1.0,0.095238,0.359524,0.55814,1.0,0.0,0.536409,0.678254,0.007171,0.000601,0.038421,0.538548,0.538587,0.520847,0.0,0.0,0.0,0.0,0.040243,0.0,0.016243,0.0,0.0,0.0,0.041263,0.039878,0.0,0.581489,0.576266,0
2,0.543743,0.513809,0.688887,0.208424,0.699467,1.0,0.666667,0.035714,0.229167,0.406977,0.030303,0.0,0.515823,0.664191,0.009076,0.000601,0.048629,0.316675,0.316732,0.306266,0.263655,0.244112,0.0,0.267965,0.040243,0.312601,0.016243,0.0,0.0,0.0,0.041263,0.039878,0.0,0.653453,0.727846,0
3,0.522701,0.487626,0.651663,0.178906,0.671128,0.333333,0.333333,0.047619,0.297619,0.418605,1.0,0.0,0.478568,0.638743,0.004054,0.0,0.021722,0.294952,0.295011,0.285257,0.224619,0.20797,1.0,0.0,0.254737,0.0,0.016243,0.0,0.0,0.0,0.041263,0.039878,0.0,0.261436,0.502362,1
4,0.621942,0.598526,0.735429,0.114151,0.780863,0.333333,0.333333,0.595238,0.60119,0.593023,0.141414,0.0,0.622826,0.737284,0.007332,0.000802,0.039285,0.474887,0.474931,0.459278,0.260754,0.270393,1.0,0.0,0.28993,0.0,0.016243,0.0,0.215169,0.203858,0.241139,0.233048,0.0,0.563129,0.781813,0


In [11]:
# fit a baseline random forest on the reduced feature set
# (n_jobs=-2 and verbose=1 produce the joblib progress lines in the output)
clf =  RandomForestClassifier(n_estimators = 100, random_state = 50, verbose = 1, n_jobs = -2)
clf.fit(df_train_reduced.drop(columns=["label"]), df_train_reduced["label"])
# Extract feature importances, paired with the feature names in column order
feature_importance_values = clf.feature_importances_
feature_importances = pd.DataFrame({
    'feature': df_train_reduced.drop(columns=["label"]).columns, 
    'importance': feature_importance_values
})
feature_importances

[Parallel(n_jobs=-2)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-2)]: Done 100 out of 100 | elapsed:    0.5s finished


Unnamed: 0,feature,importance
0,logNetWorth,0.088831
1,logAccreditedNetWorth,0.07275
2,logTotalHouseholdDebt,0.024195
3,primaryPropertyLoanToValue,0.018246
4,logPrimaryPropertyAvm,0.091171
5,femaleCount,0.008157
6,maleCount,0.007229
7,minHouseholdAge,0.019145
8,avgHouseholdAge,0.02044
9,maxHouseholdAge,0.019921


In [12]:
from roc_it.ml.binary_classification import BinaryClassification

# Score both splits with the random forest, keeping the second predict_proba column.
y_train_scores = pd.Series(clf.predict_proba(df_train_reduced.drop(columns=["label"]))[:, 1])
y_val_scores = pd.Series(clf.predict_proba(df_val_reduced.drop(columns=["label"]))[:, 1])
# Save evaluation artifacts under ./artifacts, tagged "train" and "test".
BinaryClassification(df_train_reduced.label, y_train_scores).save_artifacts(f"./artifacts", "train")
BinaryClassification(df_val_reduced.label, y_val_scores).save_artifacts(f"./artifacts", "test")

[Parallel(n_jobs=11)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  28 tasks      | elapsed:    0.0s
[Parallel(n_jobs=11)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=11)]: Using backend ThreadingBackend with 11 concurrent workers.
[Parallel(n_jobs=11)]: Done  28 tasks      | elapsed:    0.0s
[Parallel(n_jobs=11)]: Done 100 out of 100 | elapsed:    0.0s finished


In [13]:
# put the data back together: stack the reduced train and validation frames
df = pd.concat([df_train_reduced, df_val_reduced])
# shuffle all rows (frac=1 samples every row, in random order)
df = df.sample(frac=1)
print('Data shape: ', df.shape)
df.head()

Data shape:  (20000, 36)


Unnamed: 0,logNetWorth,logAccreditedNetWorth,logTotalHouseholdDebt,primaryPropertyLoanToValue,logPrimaryPropertyAvm,femaleCount,maleCount,minHouseholdAge,avgHouseholdAge,maxHouseholdAge,metroNameRank,isCondoOwner,logMaxPropertyAvm,logSumPropertyAvm,maxPropertySqft,maxBedrooms,primaryPropertySqft,primaryPropertyYearsOwned,minPropertyYearsOwned,maxPropertyYearsOwned,logMaxDonationAmount,logSumDonationAmount,isCoopDonation,logsumFECDonation,logmaxCOOPDonation,logmaxFECDonation,logmaxStateContribution,logMaxDonationAmount_1year,logMaxDonationAmount_3year,logSumDonationAmount_3year,logMaxDonationAmount_5year,logSumDonationAmount_5year,logsumFECDonation_5year,metroRank,stateRank,label
5533,0.626109,0.597694,0.0,0.0,0.779883,0.333333,0.666667,0.238095,0.4375,0.604651,0.0,0.0,0.621537,0.736404,0.003068,0.0,0.016438,0.167839,0.138131,0.175194,0.062762,0.05811,1.0,0.0,0.078248,0.0,0.016243,0.0,0.0,0.0,0.041263,0.039878,0.0,0.863502,0.898903,0
9284,0.640302,0.629743,0.0,0.0,0.753421,0.333333,0.0,0.678571,0.678571,0.662791,0.141414,0.0,0.531902,0.678345,0.005777,0.000601,0.030528,0.167839,0.138131,0.175194,0.0,0.0,0.0,0.0,0.040243,0.0,0.016243,0.0,0.0,0.0,0.041263,0.039878,0.0,0.727306,0.871881,1
9520,0.467629,0.415358,0.0,0.0,0.58335,0.0,0.666667,0.928571,0.928571,0.906977,0.737374,0.0,0.363173,0.559918,0.005777,0.012831,0.030528,0.167839,0.138131,0.175194,0.0,0.0,0.0,0.0,0.040243,0.0,0.016243,0.0,0.0,0.0,0.041263,0.039878,0.0,0.054016,0.044645,0
10441,0.554413,0.538773,0.747121,0.358441,0.726225,0.333333,0.333333,0.297619,0.39881,0.488372,0.151515,0.0,0.550998,0.688219,0.007344,0.000802,0.03935,0.012849,0.012932,0.012427,0.0,0.0,0.0,0.0,0.040243,0.0,0.016243,0.0,0.0,0.0,0.041263,0.039878,0.0,0.439246,0.245378,0
3558,0.591558,0.58161,0.776935,0.360522,0.755677,1.0,0.333333,0.488095,0.494048,0.488372,0.565657,0.0,0.589717,0.714667,0.012664,0.001403,0.067856,0.414365,0.414414,0.400745,0.0,0.0,0.0,0.0,0.040243,0.0,0.016243,0.0,0.0,0.0,0.041263,0.039878,0.0,0.662558,0.830729,0


In [14]:
# Replace the 20k-row processed sample with every row of raw_data, keeping only
# the selected columns. These values come straight from raw_data, i.e. they are
# not the imputed/scaled ones produced by preprocess_data.
df = raw_data[df.columns].copy()
print('Data shape: ', df.shape)

Data shape:  (840098, 36)


## Tabular Learner

Before we train the Tabular Siamese Learner, we will train a baseline Tabular Learner to classify `label`. (Why do we do this, exactly? Can we just instantiate a Tabular Siamese Learner without a baseline Tabular Learner?)

Ah yes: to init a new `TabularSiameseModel` we need to provide an `encoder` and a `head`, and the Tabular Learner acts as the `encoder` that the `TabularSiameseModel` is initialized with.

In [15]:
# from sklearn.model_selection import train_test_split

# df_train, df_val = train_test_split(
#     df,
#     test_size=0.20,
#     stratify=df["label"])

# df_train.shape, df_val.shape

In [16]:
from fastai.tabular.all import CategoryBlock
                                
# Target column(s), as a list, and the fastai block type used for the target.
y_names = ["label"]
y_block = CategoryBlock()

In [17]:
exclude_vars

['label_tamara_mellon',
 'label_wheelsup',
 'windfall_id',
 'label_insurance',
 'label_lux_flight',
 'account_id',
 'label_education_donor',
 'label_lux_travel',
 'label_lux_athletic',
 'label',
 'label_isFECContribution',
 'label_alternative_investment',
 'label_private_jet',
 'label_lux',
 'label_environment_donor',
 'label_isStateContribution',
 'Unnamed: 0',
 'label_small_business',
 'label_healthcare_donor',
 'label_inspirato',
 'label_isHealthCause',
 'label_is990Donation']

In [18]:
from mobius.utils import emb_sz_rule

# Categorical features: every non-numeric column that is neither a target nor
# excluded. `y_names` is a list, so the test must be membership; `x != y_names`
# compares a string with a list and is always True. Excluding "number" makes
# this the exact complement of the np.number filter used for `cont_names`.
cat_names = [
    x for x in df.select_dtypes(exclude="number").columns
    if x not in y_names and x not in exclude_vars
]

# calc embedding sizes for each categorical feature
emb_szs = {k: emb_sz_rule(len(df[k].unique())) for k in cat_names}
emb_szs

{}

In [19]:
import numpy as np

# Continuous features: every numeric column that is neither a target nor
# excluded. `y_names` is a list, so the test must be membership; `x != y_names`
# compares a string with a list and is always True.
cont_names = [
    x for x in df.select_dtypes([np.number]).columns
    if x not in y_names and x not in exclude_vars
]
cont_names

['logNetWorth',
 'logAccreditedNetWorth',
 'logTotalHouseholdDebt',
 'primaryPropertyLoanToValue',
 'logPrimaryPropertyAvm',
 'femaleCount',
 'maleCount',
 'minHouseholdAge',
 'avgHouseholdAge',
 'maxHouseholdAge',
 'metroNameRank',
 'isCondoOwner',
 'logMaxPropertyAvm',
 'logSumPropertyAvm',
 'maxPropertySqft',
 'maxBedrooms',
 'primaryPropertySqft',
 'primaryPropertyYearsOwned',
 'minPropertyYearsOwned',
 'maxPropertyYearsOwned',
 'logMaxDonationAmount',
 'logSumDonationAmount',
 'isCoopDonation',
 'logsumFECDonation',
 'logmaxCOOPDonation',
 'logmaxFECDonation',
 'logmaxStateContribution',
 'logMaxDonationAmount_1year',
 'logMaxDonationAmount_3year',
 'logSumDonationAmount_3year',
 'logMaxDonationAmount_5year',
 'logSumDonationAmount_5year',
 'logsumFECDonation_5year',
 'metroRank',
 'stateRank']

In [20]:
from fastai.tabular.all import (Categorify, CategoryBlock, FillMissing, FillStrategy,
                                Normalize, TabDataLoader, TabularPandas,
                                tabular_config, tabular_learner)
# from collections import defaultdict
# from dataclasses import dataclass, field

# @dataclass
# class MyFillMissing(FillMissing):
#     fill_strategy:FillStrategy=FillStrategy.constant
#     add_col:bool=False
#     fill_vals:float=field(default_factory=dict)

# procs = [MyFillMissing, Categorify, Normalize]
# fastai preprocessing transforms handed to TabularPandas via `procs=`
procs = [FillMissing, Categorify, Normalize]

In [21]:
from fastai.data.core import range_of
from fastai.tabular.all import RandomSplitter

# train/test split: random split over the row positions of df, 20% validation
splits = RandomSplitter(valid_pct=0.20)(range_of(df))

In [22]:
# Wrap df for fastai: applies `procs`, uses `label` as the categorical target
# and the random train/validation `splits` defined above.
tabular_pandas = TabularPandas(
        df,
        procs=procs,
        cat_names=cat_names,
        cont_names=cont_names,
        y_names=y_names,
        y_block=y_block,
        splits=splits,
        device="cpu")

In [23]:
# Training loader: shuffled batches of 256, last incomplete batch dropped.
trn_dl = TabDataLoader(
    tabular_pandas.train,
    bs=256,
    shuffle=True,
    drop_last=True,
    num_workers=4)

# Validation loader: batches of 256; shuffle/drop_last are left at their defaults.
val_dl = TabDataLoader(
    tabular_pandas.valid,
    bs=256,
    num_workers=4)

In [24]:
from fastai.data.core import DataLoaders

# Bundle the train and validation loaders for the tabular learner.
dls = DataLoaders(trn_dl, val_dl)

# Only the header is printed; the batch display itself is commented out.
print("Sample batch:")
# dls.one_batch()

Sample batch:


In [68]:
# trn_dl.one_batch()[0][0], trn_dl.one_batch()[1][0], trn_dl.one_batch()[2][0]

# First six values of the first row of batch element 1
# (presumably the continuous features; confirm against the loader's batch layout).
trn_dl.one_batch()[1][0][:6]

tensor([0.2911, 0.4619, 0.9267, 1.7603, 0.5700, 1.1015])

In [25]:
from fastai.metrics import F1Score, Precision, Recall, accuracy

# load the tabular_pandas data through the tabular_learner
layers = [2048, 1024, 512]

# tabular learner configuration: zero dropout on every layer and on embeddings.
# Kept under its own name so it does not overwrite `config`, the label config
# module that `data_prep(config)` needs if that cell is re-run.
tab_config = tabular_config(ps=[0.0] * len(layers), embed_p=0.0)

learn = tabular_learner(
    dls,
    layers=layers,
    emb_szs=emb_szs,
    config=tab_config,
    metrics=[accuracy,
             Precision(average='macro'),
             Recall(average='macro'),
             F1Score(average='macro')])

In [50]:
# learn.dls.

In [26]:
# learn.fit_one_cycle(n_epoch=10)

In [27]:
# learn.export("tabular_learn.pkl")

In [28]:
# from mobius.calibration import ModelWithTemperature

# scaled_model = ModelWithTemperature(learn.model)
# scaled_model.set_temperature(val_dl)
# learn.model = scaled_model.model

In [29]:
# # true species labels
# y_true=learn.dls.valid.items["label"]

# # model scores and species predictions
# y_scores, *_ = learn.get_preds(dl=val_dl)
# preds = np.argmax(y_scores, 1).numpy()

In [30]:
# print("First 20 investor labels and predictions")
# list(zip(y_true, preds))[:10]

In [31]:
# (y_true == preds).sum() / len(y_true)

## Siamese Net

To init a new `TabularSiameseDataset` object, we first write the features and labels held by the fast.ai `tabular_pandas` object to disk, then pass those files to the dataset together with the tabular learner.

In [32]:
from mobius.datasets import write_jsonl

# write SNN training data to `data/` as JSON lines, one file per split
write_jsonl(tabular_pandas.train.to.items[0].items, "data/train_data.jsonl")
write_jsonl(tabular_pandas.valid.to.items[0].items, "data/valid_data.jsonl")

# write SNN training labels to `data/`; index=True keeps the row index in the CSV
tabular_pandas.train.y.to_csv("data/train_labels.csv", index=True)
tabular_pandas.valid.y.to_csv("data/valid_labels.csv", index=True)

In [33]:
from mobius.datasets import TabularSiameseDataset

# Siamese datasets built from the label CSV and feature JSONL files written
# above, plus the tabular learner; one dataset per split.
train_ds = TabularSiameseDataset(
    csv_file="data/train_labels.csv", 
    jsonl_file="data/train_data.jsonl",
    tabular_learner=learn)
    
valid_ds = TabularSiameseDataset(
    csv_file="data/valid_labels.csv", 
    jsonl_file="data/valid_data.jsonl",
    tabular_learner=learn)

In [34]:
train_ds.__len__(), train_ds.__getitem__(1)

(672079,
 (((tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
    tensor([-0.2272, -0.3782, -1.4536, -0.9967, -0.0328, -0.1084,  1.1006,  2.1340,
             2.2776,  1.7770,  0.6922, -0.3868, -0.3194, -0.3586,  0.0261,  0.0123,
             0.0439, -0.1913, -0.2523, -0.1688,  1.4330,  1.1696, -0.5767,  1.6803,
            -0.5100,  2.0296, -0.4272, -0.4613,  1.1274,  1.0570,  1.7484,  1.4300,
             1.8703,  1.0825,  0.6956])),
   (tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
    tensor([ 0.9119,  0.8415,  0.8185, -0.1930,  0.8636,  2.3114, -0.1239, -1.2571,
            -0.1451,  0.2515, -1.0321, -0.3868,  1.3261,  1.1204, -0.0265,  0.0123,
            -0.0257,  1.1150,  1.2811,  0.9996,  0.4057,  0.7402,  1.7339, -0.5290,
             0.8118, -0.5259, -0.4272, -0.4613, -0.5597, -0.5659, -0.6011, -0.6092,
            -0.4896,  0.8056,  0.8151]))),
  tensor(1.)))

In [35]:
valid_ds.__len__(), valid_ds.__getitem__(0)

(168019,
 (((tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
    tensor([-0.3104, -0.2243,  0.7599,  1.2076,  0.2063, -1.3183, -0.1239, -0.1473,
            -0.1078, -0.0138, -0.9089, -0.3868,  0.1195,  0.0359, -0.0200,  0.0123,
            -0.0171, -1.1439, -1.0102, -1.1767,  0.3052,  0.1579, -0.5767,  0.4793,
            -0.5100,  0.6649, -0.4272,  0.9534,  0.6196,  0.4592,  0.4937,  0.3451,
             0.6148, -0.3383, -0.2270])),
   (tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]),
    tensor([-0.5458, -0.4610,  0.5449,  0.4923, -0.2113, -0.1084,  1.1006,  0.5926,
             0.2276, -0.2127, -0.5148, -0.3868, -0.6471, -0.6531, -0.0387,  0.0123,
            -0.0419, -1.0708, -0.9361, -1.1063,  0.3281,  0.5047, -0.5767,  0.8954,
            -0.5100,  0.6934, -0.4272, -0.4613,  0.6478,  0.8823,  0.5199,  0.7391,
             1.0707,  0.2182,  0.1276]))),
  tensor(0.)))

In [36]:
# Rebinds `dls`: from here on it holds the Siamese pair loaders (batch size 32),
# no longer the tabular loaders the tabular learner was built with.
dls = DataLoaders.from_dsets(train_ds, valid_ds, bs=32, device='cpu', num_workers=4)

The Siamese net encoder is the body of the Tabular net from the previous section, i.e. its model with the final layer removed.

In [37]:
dls.dataset.c

2

In [38]:
# dls.

In [None]:
import copy

# copy.copy() is shallow, so the copied learner shares `learn.model`; truncating
# `layers` through it would also strip the final layer from `learn` itself.
# Give the encoder its own deep copy of the model before dropping the last layer.
encoder = copy.copy(learn)
encoder.model = copy.deepcopy(learn.model)
encoder.model.layers = encoder.model.layers[:-1]
encoder_model = encoder.model

In [None]:
# from fastai.layers import LinBnDrop

# head = LinBnDrop(n_in=layers[-1]*2,
#     n_out=32,  # size of output space
#     bn=False,
#     act=None)

In [None]:
from mobius.models import TabularSiameseModel

# Build the Siamese model from the truncated tabular model; no head is passed.
model = TabularSiameseModel(encoder_model)

In [None]:
# model

In [None]:
# batch = dls.one_batch()

In [None]:
from fastai.torch_basics import params
from mobius.losses import ContrastiveLoss

# def siamese_splitter(model):
#     return [params(model.encoder), params(model.head)]

def contrastive_loss_func(out, targ):
    """Contrastive loss (margin 0.5) of `out` against `targ` cast to long."""
    criterion = ContrastiveLoss(margin=0.5)
    integer_targets = targ.long()
    return criterion(out, integer_targets)

In [None]:
# %load_ext autoreload
# %autoreload 2

In [None]:
# !pip install umap-learn

In [None]:
from fastai.learner import Learner
from mobius.callbacks import TSNECallback
from fastai.callback.tracker import SaveModelCallback
from fastai.callback.training import ShortEpochCallback

# Learner for the Siamese model: contrastive loss, models saved to the current
# directory, and only TSNECallback active (the other callbacks are commented out).
# TODO: add callback for best validation
siamese_learner = Learner(dls,
    model,
    model_dir=".",
    loss_func=contrastive_loss_func,
#     splitter=siamese_splitter,
    cbs=[TSNECallback])
#          SaveModelCallback])
#          ShortEpochCallback])

In [None]:
# model(batch[0])

In [None]:
# contrastive_loss_func(model(batch[0]), batch[1])

In [None]:
# !pip install seaborn==0.11.1

In [None]:
# siamese_learner.summary()

In [None]:
# siamese_learner.lr_find()

In [None]:
# !pip list

In [None]:
# First training run: 10 epochs of one-cycle training; no learning rate is passed.
siamese_learner.fit_one_cycle(n_epoch=10)

In [None]:
# siamese_learner.unfreeze()
# siamese_learner.fit(n_epoch=10, lr=10e-2)

In [None]:
# NOTE: 10e-3 == 0.01 (not 1e-3)
siamese_learner.fit(n_epoch=10, lr=10e-3)

In [None]:
# NOTE: 10e-4 == 0.001 (not 1e-4)
siamese_learner.fit(n_epoch=10, lr=10e-4)

In [None]:
# NOTE: 10e-7 == 1e-6 (not 1e-7)
siamese_learner.fit(n_epoch=10, lr=10e-7)

In [None]:
# siamese_learner.unfreeze()
# siamese_learner.fit(n_epoch=3, lr=10e-4)

In [None]:
# tsne = np.load("tsne_1625777058_0.npy")