In [1]:
# Use local repository code (no pip install)


In [2]:

import warnings
warnings.filterwarnings('ignore')
from pysmatch.Matcher import Matcher
import pandas as pd
import numpy as np

In [3]:
# Read the loan dataset from its repository-relative CSV location.
path = "misc/loan.csv"
data = pd.read_csv(filepath_or_buffer=path)
# Bare trailing expression so the notebook renders the frame as a table.
data

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,loan_status
0,16000,16000,16000.0,36 months,18.85,585.29,D,D3,Fully Paid
1,14000,14000,14000.0,36 months,12.42,467.82,B,B4,Fully Paid
2,7200,7200,7200.0,36 months,8.99,228.93,B,B1,Fully Paid
3,19200,19200,19200.0,36 months,16.99,684.44,D,D1,Fully Paid
4,6000,6000,6000.0,36 months,12.12,199.63,B,B3,Fully Paid
...,...,...,...,...,...,...,...,...,...
21995,20000,20000,20000.0,36 months,13.58,679.48,C,C2,Default
21996,9600,9600,9600.0,36 months,25.69,385.21,F,F1,Default
21997,10000,10000,10000.0,36 months,13.49,339.31,C,C2,Default
21998,4000,4000,4000.0,36 months,18.06,144.74,D,D2,Default


In [4]:
# Split the loans by outcome: "Default" rows form the test group and
# "Fully Paid" rows form the control group. Each slice is copied so the two
# frames are independent of `data` (avoids SettingWithCopyWarning later on).
test_data_full = data.loc[data["loan_status"] == "Default"].copy()
control_data_full = data.loc[data["loan_status"] == "Fully Paid"].copy()

# Name of the treatment indicator column (handed to Matcher as `yvar`).
treatment_var = "is_default"
# Columns handed to Matcher as `exclude`; loan_status defines the groups.
exclude_cols = ["loan_status"]

In [5]:
# Seed NumPy's global random state before building the matcher.
np.random.seed(20250604)

# Matcher creates the 'is_default' treatment column internally from `yvar`.
matcher_instance = Matcher(
    test=test_data_full,
    control=control_data_full,
    yvar=treatment_var,
    exclude=exclude_cols,
)

2026-02-25 15:22:14 - INFO - Treatment column: is_default


2026-02-25 15:22:14 - INFO - Covariates (xvars): ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate', 'installment', 'grade', 'sub_grade']


2026-02-25 15:22:14 - INFO - N majority group (treatment=0): 20000


2026-02-25 15:22:14 - INFO - N minority group (treatment=1): 2000


In [6]:
# ============ (1) Normal training (without Optuna) =============
# Alternative configurations, kept for reference:
# matcher_instance.fit_scores(balance=True, nmodels=10, n_jobs=3, model_type='knn')
# matcher_instance.fit_scores(balance=True, nmodels=10, n_jobs=3, model_type='tree', max_iter=100)
matcher_instance.fit_scores(
    balance=True,
    nmodels=10,
    model_type='tree',
    n_jobs=2,
)
# ============ (2) Optuna tuning (trains only one best model) =============
# matcher_instance.fit_scores(
#     balance=True,
#     model_type='tree',
#     max_iter=200,
#     use_optuna=True,
#     n_trials=15
# )

# Summarise the fit: how many models were trained and their mean accuracy.
print(f"Number of models fitted: {len(matcher_instance.models)}")
print(f"Average model accuracy: {np.mean(matcher_instance.model_accuracy):.4f}")

2026-02-25 15:22:14 - INFO - This computer has: 10 cores, using 2 workers for fitting scores.


2026-02-25 15:22:14 - INFO - Fitting 10 model(s) with balance=True for model_type='tree'.


2026-02-25 15:22:16 - INFO - Model 3 (tree) trained. Validation accuracy: 62.91%


2026-02-25 15:22:16 - INFO - Model 1 (tree) trained. Validation accuracy: 65.97%


2026-02-25 15:22:17 - INFO - Model 2 (tree) trained. Validation accuracy: 67.10%


2026-02-25 15:22:17 - INFO - Model 4 (tree) trained. Validation accuracy: 67.02%


2026-02-25 15:22:18 - INFO - Model 5 (tree) trained. Validation accuracy: 61.51%


2026-02-25 15:22:18 - INFO - Model 7 (tree) trained. Validation accuracy: 63.44%


2026-02-25 15:22:19 - INFO - Model 8 (tree) trained. Validation accuracy: 63.57%


2026-02-25 15:22:19 - INFO - Model 6 (tree) trained. Validation accuracy: 64.61%


2026-02-25 15:22:19 - INFO - Model 9 (tree) trained. Validation accuracy: 66.58%


2026-02-25 15:22:20 - INFO - Model 10 (tree) trained. Validation accuracy: 66.76%


2026-02-25 15:22:20 - INFO - Average Accuracy over 10 models: 64.95%


Number of models fitted: 10
Average model accuracy: 0.6495


In [7]:
# Predict propensity scores; they are stored in the 'scores' column of
# matcher_instance.data.
matcher_instance.predict_scores()
# Preview the record id, treatment flag and score for the first rows.
print(
    matcher_instance.data.loc[
        :, ['record_id', matcher_instance.treatment_col, 'scores']
    ].head()
)








2026-02-25 15:22:20 - INFO - Propensity scores predicted and added to 'scores' column in self.data, self.test_df, self.control_df.


   record_id  is_default    scores
0          0           1  0.562931
1          1           1  0.409340
2          2           1  0.394910
3          3           1  0.573037
4          4           1  0.585303


In [8]:
# Plot the predicted propensity scores (presumably the score distributions of
# the test and control groups — confirm against the pysmatch documentation).
matcher_instance.plot_scores()

In [9]:
# Explore candidate matching thresholds for 1-to-1 'min' matching. The grid
# starts at 0.0001 and advances in steps of 0.0005; np.arange stops before
# 0.0051 (subject to floating-point rounding at the end point).
matcher_instance.tune_threshold(
    method='min',
    nmatches=1,
    rng=np.arange(start=0.0001, stop=0.0051, step=0.0005),
)

In [10]:
# Standard (non-exhaustive) 1-to-1 matching with replacement at a score
# threshold of 0.001, using the 'min' method.
matcher_instance.match(
    threshold=0.001,
    nmatches=1,
    method='min',
    replacement=True,
    exhaustive_matching=False,
)

# Preview the id, treatment flag and score of the first matched rows.
display_cols_standard = ['record_id', matcher_instance.treatment_col, 'scores']
print(matcher_instance.matched_data.loc[:, display_cols_standard].head())
matcher_instance.plot_matched_scores()

# Frequency table of matched records; kept in the namespace for inspection
# (it is not used further in this cell).
freq_df = matcher_instance.record_frequency()
# Adds a 'weight' column (inverse frequency weights) to matched_data.
matcher_instance.assign_weight_vector()

print("top 6 matched data")
print(matcher_instance.matched_data.sort_values(by="match_id").head(n=6))

2026-02-25 15:22:20 - INFO - Performing matching using pysmatch.matching.perform_match: method='min', replacement=True, threshold=0.001, nmatches=1


2026-02-25 15:22:20 - INFO - Matching with pysmatch.matching.perform_match complete. Matched data has 3992 rows.




2026-02-25 15:22:21 - INFO - Inverse frequency weights assigned to 'weight' column in matched_data.


   record_id  is_default    scores
0       1945           1  0.235761
1        277           1  0.237670
2       1480           1  0.239759
3       1152           1  0.239759
4       1302           1  0.239778
top 6 matched data
      loan_amnt  funded_amnt  funded_amnt_inv        term  int_rate  \
0          5000         5000           5000.0   36 months      5.32   
1996       5000         5000           5000.0   36 months      5.42   
1          6000         6000           6000.0   36 months      5.32   
1997       6000         6000           6000.0   36 months      6.03   
2         15000        15000          15000.0   36 months      5.32   
1998      15000        15000          15000.0   36 months      5.32   

      installment grade sub_grade loan_status  is_default  record_id  \
0          150.58     A        A1     Default           1       1945   
1996       150.80     A        A1  Fully Paid           0       2760   
1          180.69     A        A1     Default           1

In [11]:
# Re-run the matching in exhaustive mode with the same threshold and a
# single match per record.
matcher_instance.match(
    threshold=0.001,
    nmatches=1,
    exhaustive_matching=True,
)

# Columns worth previewing; only those actually present in matched_data are
# kept, so a missing column does not raise a KeyError.
display_cols_exhaustive = [
    'record_id',
    matcher_instance.treatment_col,
    'scores',
    'match_id',
    'matched_as',
    'pair_score_diff',
]
actual_display_cols_exhaustive = [
    name
    for name in display_cols_exhaustive
    if name in matcher_instance.matched_data.columns
]
print(matcher_instance.matched_data.loc[:, actual_display_cols_exhaustive].head())
matcher_instance.plot_matched_scores()

2026-02-25 15:22:21 - INFO - Performing exhaustive matching: nmatches=1, threshold=0.001


Exhaustive Matching:   0%|          | 0/2000 [00:00<?, ?it/s]

Exhaustive Matching:   2%|▏         | 37/2000 [00:00<00:05, 365.42it/s]

Exhaustive Matching:   4%|▍         | 86/2000 [00:00<00:04, 435.87it/s]

Exhaustive Matching:   7%|▋         | 140/2000 [00:00<00:03, 480.90it/s]

Exhaustive Matching:  10%|█         | 202/2000 [00:00<00:03, 534.20it/s]

Exhaustive Matching:  13%|█▎        | 265/2000 [00:00<00:03, 568.37it/s]

Exhaustive Matching:  16%|█▋        | 329/2000 [00:00<00:02, 592.57it/s]

Exhaustive Matching:  20%|█▉        | 391/2000 [00:00<00:02, 600.46it/s]

Exhaustive Matching:  23%|██▎       | 457/2000 [00:00<00:02, 618.42it/s]

Exhaustive Matching:  26%|██▌       | 521/2000 [00:00<00:02, 622.04it/s]

Exhaustive Matching:  29%|██▉       | 587/2000 [00:01<00:02, 630.98it/s]

Exhaustive Matching:  33%|███▎      | 653/2000 [00:01<00:02, 636.01it/s]

Exhaustive Matching:  36%|███▌      | 717/2000 [00:01<00:02, 577.85it/s]

Exhaustive Matching:  39%|███▉      | 776/2000 [00:01<00:02, 521.97it/s]

Exhaustive Matching:  42%|████▏     | 830/2000 [00:01<00:02, 512.64it/s]

Exhaustive Matching:  44%|████▍     | 884/2000 [00:01<00:02, 518.63it/s]

Exhaustive Matching:  47%|████▋     | 937/2000 [00:01<00:02, 502.01it/s]

Exhaustive Matching:  50%|████▉     | 994/2000 [00:01<00:01, 519.10it/s]

Exhaustive Matching:  53%|█████▎    | 1052/2000 [00:01<00:01, 533.87it/s]

Exhaustive Matching:  55%|█████▌    | 1108/2000 [00:02<00:01, 527.23it/s]

Exhaustive Matching:  58%|█████▊    | 1162/2000 [00:02<00:01, 480.80it/s]

Exhaustive Matching:  61%|██████    | 1216/2000 [00:02<00:01, 495.98it/s]

Exhaustive Matching:  64%|██████▎   | 1271/2000 [00:02<00:01, 509.73it/s]

Exhaustive Matching:  66%|██████▌   | 1323/2000 [00:02<00:01, 457.21it/s]

Exhaustive Matching:  69%|██████▉   | 1380/2000 [00:02<00:01, 486.33it/s]

Exhaustive Matching:  72%|███████▏  | 1430/2000 [00:02<00:01, 372.60it/s]

Exhaustive Matching:  74%|███████▍  | 1475/2000 [00:02<00:01, 390.28it/s]

Exhaustive Matching:  76%|███████▌  | 1518/2000 [00:03<00:01, 354.64it/s]

Exhaustive Matching:  78%|███████▊  | 1566/2000 [00:03<00:01, 384.06it/s]

Exhaustive Matching:  80%|████████  | 1608/2000 [00:03<00:01, 373.55it/s]

Exhaustive Matching:  83%|████████▎ | 1663/2000 [00:03<00:00, 417.73it/s]

Exhaustive Matching:  86%|████████▌ | 1720/2000 [00:03<00:00, 458.00it/s]

Exhaustive Matching:  89%|████████▉ | 1783/2000 [00:03<00:00, 503.90it/s]

Exhaustive Matching:  92%|█████████▏| 1843/2000 [00:03<00:00, 529.93it/s]

Exhaustive Matching:  95%|█████████▌| 1905/2000 [00:03<00:00, 555.02it/s]

Exhaustive Matching:  98%|█████████▊| 1963/2000 [00:03<00:00, 562.17it/s]

Exhaustive Matching: 100%|██████████| 2000/2000 [00:03<00:00, 502.85it/s]




2026-02-25 15:22:28 - INFO - Exhaustive matching complete. 1996 pairs formed.


   record_id  is_default    scores  match_id matched_as  pair_score_diff
0          0           1  0.562931         0       case         0.000014
1      13058           0  0.562917         0    control         0.000014
2          1           1  0.409340         1       case         0.000000
3       2288           0  0.409340         1    control         0.000000
4          2           1  0.394910         2       case         0.000000


In [12]:
# How often each record was used as a control in the current matching.
control_usage_freq = matcher_instance.record_frequency()
print(control_usage_freq.head())

# Adds a 'weight' column (inverse frequency weights) to matched_data.
matcher_instance.assign_weight_vector()

# Base preview columns, plus 'matched_as' when the matching produced it.
weight_display_cols = ['record_id', 'match_id', 'weight'] + (
    ['matched_as'] if 'matched_as' in matcher_instance.matched_data.columns else []
)
# Keep only the columns that exist so the selection cannot raise a KeyError.
actual_weight_display_cols = [
    name
    for name in weight_display_cols
    if name in matcher_instance.matched_data.columns
]
print(matcher_instance.matched_data.loc[:, actual_weight_display_cols].head())

2026-02-25 15:22:28 - INFO - Inverse frequency weights assigned to 'weight' column in matched_data.


   record_id  n_occurrences_as_control
0      20729                         3
1      12174                         2
2       6295                         2
3      10513                         2
4      18177                         2
   record_id  match_id  weight matched_as
0          0         0     1.0       case
1      13058         0     1.0    control
2          1         1     1.0       case
3       2288         1     1.0    control
4          2         2     1.0       case


In [13]:
# Compare continuous covariates before and after matching. The returned table
# has one row per variable with before/after statistics; plot_result=True
# presumably also renders the comparison plots (confirm in the pysmatch docs).
continuous_comparison = matcher_instance.compare_continuous(
    return_table=True,
    plot_result=True,
)
print(continuous_comparison)

               var  ks_before  ks_after  grouped_chisqr_before  \
0        loan_amnt        0.0     0.040                    0.0   
1      funded_amnt        0.0     0.042                    0.0   
2  funded_amnt_inv        0.0     0.029                    0.0   
3         int_rate        0.0     0.065                    0.0   
4      installment        0.0     0.130                    0.0   

   grouped_chisqr_after  std_median_diff_before  std_median_diff_after  \
0                 0.066                0.342269              -0.103294   
1                 0.070                0.342440              -0.103373   
2                 0.052                0.342219              -0.103403   
3                 0.040                0.563298              -0.011282   
4                 0.770                0.262456              -0.099793   

   std_mean_diff_before  std_mean_diff_after  
0              0.322839            -0.075302  
1              0.323851            -0.074063  
2              0.

In [14]:
# Run the proportion test for the categorical covariate 'grade'. The result is
# a dict holding the variable name plus 'before' and 'after' values
# (presumably p-values before/after matching — confirm in the pysmatch docs).
grade_prop_test = matcher_instance.prop_test('grade')
print(grade_prop_test)

{'var': 'grade', 'before': 0.0, 'after': 0.000766}


In [15]:
# Compare categorical covariates before and after matching. The returned table
# has one row per variable with 'before' and 'after' columns; plot_result=True
# presumably also renders the comparison plots (confirm in the pysmatch docs).
categorical_comparison = matcher_instance.compare_categorical(
    return_table=True,
    plot_result=True,
)
print(categorical_comparison)

         var  before     after
0       term     0.0  0.002695
1      grade     0.0  0.000766
2  sub_grade     0.0  0.022726
