In [23]:
from global_variables import *
from similarity import *

# Load and Clean the Data

In [1]:
# Execute the preprocessing notebook inside this kernel so that the names it
# creates become available here. The cells below use df_male and df_female,
# which are presumably produced by this step — TODO confirm.
%run ./preprocessing.ipynb

E    52268
K    13372
Name: customer_gender, dtype: int64


# Split into train and test sets

In [2]:
# Partition the male customers into training, validation and evaluation
# subsets, then display the shape of each subset in that order.
trainset_male, valset_male, evalset_male = dataset_splits(df_male)
tuple(subset.shape for subset in (trainset_male, valset_male, evalset_male))

((36587, 42), (5227, 42), (10454, 42))

In [3]:
# Same three-way partition for the female customers, followed by the shape
# of the training, validation and evaluation subsets.
trainset_female, valset_female, evalset_female = dataset_splits(df_female)
tuple(subset.shape for subset in (trainset_female, valset_female, evalset_female))

((9360, 42), (1337, 42), (2675, 42))

# Self-Supervised Similarity Measure

In [5]:
# Display the column labels of df_male.
df_male.columns

Index(['customer_main_branch_x_coord', 'customer_main_branch_y_coord',
       'customer_home_x_coord', 'customer_home_y_coord',
       'customer_income_level', 'customer_age', 'akbank_banking_age',
       '1)RISKSIZ', '2)GECIKME 1-15 GUN', '3)GECIKME 16-29 GUN',
       '4)GECIKME 30-59 GUN', '5)GECIKME 60+ GUN', '6)TAKIP', 'BV_DoW_0',
       'BV_DoW_1', 'BV_DoW_2', 'BV_DoW_3', 'BV_DoW_4', 'BV_DoW_5', 'BV_DoW_6',
       'BV_very_early', 'BV_early', 'BV_later', 'BV_late',
       'branch_mean_distance', 'payment_mean', 'payment_std',
       'payment_monthly_freq', 'trans_average_amount_AKARYAKIT',
       'trans_average_amount_GIDA', 'trans_average_amount_OTHER',
       'trans_average_amount_RESTORAN', 'trans_average_amount_TEKSTÝL',
       'trans_average_monthly_freq_AKARYAKIT',
       'trans_average_monthly_freq_GIDA', 'trans_average_monthly_freq_OTHER',
       'trans_average_monthly_freq_RESTORAN',
       'trans_average_monthly_freq_TEKSTÝL', 'statement_amount_TL_mean',
       'statemen

**Domain knowledge leads us to believe certain features will be better predictors of partnership than others.** 
- branch xy
- home xy
- income
- age
- <font color='red'>bank age</font>
- risk levels
- <font color='red'>bank visit date/time</font>
- bank span
- <font color='red'>avg. spending in category</font>
- <font color='red'>avg. frequency in category</font>
- statement mean/std
- customer education
- <font color='red'>job status</font>

**banking age:** Except for the rare cases when both partners signed up together, it is not indicative of similarity (i.e. we assume that in the majority of partnerships one member is longer-established at a bank and convinces the other to join it).

**bank visit date/time:**


**Avg. spending/frequency:**


**Job status:** In this day and age there are so many combinations of working dynamics among married couples that this column would be convoluted and would not be a generalizable metric, especially considering the categories provided - _Paid (Special), Self Employment, Paid (Public), Retired, Retired (Paid), Housewife, Unemployed, Retired (Self-Employed), Student, Working Abroad, Children, Undefined, Other_.

In [6]:
# Columns left out of the reconstruction targets: banking age, job status and
# every column whose label matches the pattern 'trans|BV'.
pattern_matched_columns = df_male.filter(regex = 'trans|BV').columns
removal_list = np.concatenate([['akbank_banking_age','customer_job_status'], pattern_matched_columns], axis = 0)

# Target frames (train, validation, evaluation order) with those columns dropped.
male_targets = [split.drop(columns = removal_list) for split in (trainset_male, valset_male, evalset_male)]
female_targets = [split.drop(columns = removal_list) for split in (trainset_female, valset_female, evalset_female)]

# Shapes of the target frames: males first, then females.
[target.shape for target in male_targets], [target.shape for target in female_targets]

([(36587, 19), (5227, 19), (10454, 19)], [(9360, 19), (1337, 19), (2675, 19)])

In [9]:
# Train one autoencoder per gender when this notebook is the entry point,
# otherwise reload the previously saved models from model_path.
#
# Each model receives the full feature frame as input and is fitted against the
# reduced target frame (the same rows with the columns in removal_list dropped),
# so the output width is taken from the target frames rather than hard-coded.
if __name__ == "__main__":
    # First positional argument is presumably the size of the encoded
    # representation — TODO confirm against the Autoencoder definition.
    autoencoder_male = Autoencoder(10, output_dimension = male_targets[0].shape[1], hidden_layers = 2, activation = tf.nn.relu)
    autoencoder_male.compile(optimizer='adam', loss=tf.losses.MeanSquaredError())

    # NOTE(review): only the female model passes log_progression = False;
    # confirm whether the asymmetry with the male model is intentional.
    autoencoder_female = Autoencoder(10, output_dimension = female_targets[0].shape[1], hidden_layers = 2, log_progression = False, activation = tf.nn.relu)
    autoencoder_female.compile(optimizer='adam', loss=tf.losses.MeanSquaredError())

    # Bind the value returned by fit(): it is what save_history() receives
    # below. Previously the return value was discarded, so history_male was
    # undefined on a fresh kernel and this cell raised NameError.
    history_male = autoencoder_male.fit(trainset_male, male_targets[0],
                                        validation_data=(valset_male, male_targets[1]),
                                        epochs=50,
                                        shuffle=True,
                                        callbacks = [early_stopping],
                                        verbose = 0)

    save_model(autoencoder_male, "encoding_male")
    save_history(history_male, "history_male")

    # Same fix as above for the female model's training history.
    history_female = autoencoder_female.fit(trainset_female, female_targets[0],
                                            validation_data=(valset_female, female_targets[1]),
                                            epochs=50,
                                            shuffle=True,
                                            callbacks = [early_stopping],
                                            verbose = 0)

    save_model(autoencoder_female, "encoding_female")
    save_history(history_female, "history_female")

else:
    # Reload the saved models instead of retraining.
    autoencoder_male = tf.keras.models.load_model(os.path.join(model_path, "encoding_male"), compile=True)
    autoencoder_female = tf.keras.models.load_model(os.path.join(model_path, "encoding_female"), compile=True)

# Print the layer / parameter summary of both models.
autoencoder_male.summary()
autoencoder_female.summary()



INFO:tensorflow:Assets written to: ../models/encoding_male/assets


INFO:tensorflow:Assets written to: ../models/encoding_male/assets


INFO:tensorflow:Assets written to: ../models/encoding_female/assets


INFO:tensorflow:Assets written to: ../models/encoding_female/assets


Model: "autoencoder_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_2 (Encoder)         multiple                  1388      
                                                                 
 decoder (Decoder)           multiple                  439       
                                                                 
Total params: 1,827
Trainable params: 1,827
Non-trainable params: 0
_________________________________________________________________
Model: "autoencoder_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_3 (Encoder)         multiple                  1388      
                                                                 
 decoder (Decoder)           multiple                  439       
                                                                 
Total params: 1,827
Trainabl

In [18]:
# Report each model's loss on its held-out evaluation split, scored against
# the reduced target frame (index 2 of the *_targets lists).
eval_loss_male = autoencoder_male.evaluate(evalset_male, male_targets[2], verbose = 0)
print( "Evaluation Loss (Male):\n\t", eval_loss_male)

eval_loss_female = autoencoder_female.evaluate(evalset_female, female_targets[2], verbose = 0)
print( "Evaluation Loss (Female):\n\t", eval_loss_female)

Evaluation Loss (Male):
	 1.449664831161499
Evaluation Loss (Female):
	 1.7745821475982666


In [24]:
# Run every customer through the encoder half of the matching autoencoder and
# convert the result to a NumPy array.
encoded_males = autoencoder_male.encoder(df_male).numpy()
encoded_females = autoencoder_female.encoder(df_female).numpy()

# Write the encodings to data_path. The arrays are bound as encoded_males /
# encoded_females above; the previous code wrote male_encoded / female_encoded,
# which are never defined and raised NameError on a fresh kernel.
# NOTE(review): ndarray.tofile with a separator writes all values as one flat
# comma-separated sequence with no row breaks, so the array shape must be
# restored by whoever reads these files — confirm this is the intended format.
encoded_males.tofile(os.path.join(data_path,'encoded_males.csv'), sep = ',')
encoded_females.tofile(os.path.join(data_path,'encoded_females.csv'), sep = ',')