# Data Cleaning and Exploration

In [1]:
import xpress as xp
import pandas as pd
import numpy as np

In [2]:
customer = pd.read_excel('231013_Customer_Base.xlsx')
fraud = pd.read_excel('231013_Fraud_Cases.xlsx')
transact = pd.read_excel('231013_Transactions_Input.xlsx')

In [3]:
# Merging Transaction dataset with Customer dataset by customer_id
full = transact.merge(customer, how = 'left', on='customer_id')

In [4]:
#### some code used for understanding the data
# transact.groupby('category')["category"].count().sort_values(ascending = False)
# print(full.iloc[:2])

# full[full.transac_prob.isna()].groupby('category')["category"].count().sort_values(ascending = False)
#### From this, we know that some of the transfers have type 'income'. These do not have transaction probabilities.

# full[full.transac_prob.isna()].groupby('In_or_Out')["category"].count().sort_values(ascending = False)
#### More generally, all Transfers, Income and Interest (and only these categories) are classified as paid_in. 
#### So an easy way to subset is to get rid of all paid_in.
#### Also, get rid of cash withdrawals.

In [5]:
# Remove all cash withdrawals, and all "paid_in" data from the dataset
cleaned = full[(full.category != "Cash Withdrawal") & (full.In_or_Out != "paid_in")]

# Modelling 

## Model 1: Using only day one, minimising the expected loss, decision variable is whether to investigate or not (binary), simpified constraints (only bank_from can investigate, no external investigators), international tasks completed within one day (by having multiple people work on it)

In [6]:
# Subsetting to only the first day
dayone = cleaned[cleaned.date == "2023-10-01"]
dayone = dayone.reset_index(drop = True) # reset the index, so the indexing is 0, 1, 2, ... 
# drop = False would have made the old index into a new column in the dataframe. We don't need that, so drop = True.
# Resetting the indexing is needed so that the index of the decision variable corresponds to the index of the dataframe; this is useful for subsequent analysis
dayone

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,bank_from,transac_prob,description_prob,priority,home_bank,customer_prob
0,2,Online Retailer - Electronics Purchase,199.99,Online Shopping,2023-10-01,October,10298,spending,paid_out,bank_A,bank_E,0.78,0.25,3,bank_E,0.49
1,3,Utilities Payment - Internet Service,60.00,Utilities,2023-10-01,October,10298,spending,paid_out,bank_C,bank_E,0.30,0.22,1,bank_E,0.49
2,4,Rent Payment,1500.00,Housing,2023-10-01,October,14507,spending,paid_out,bank_D,bank_E,0.57,0.53,3,bank_E,0.35
3,6,Online Retailer - Electronics Purchase,299.99,Online Shopping,2023-10-01,October,14507,spending,paid_out,bank_E,bank_E,0.75,0.65,3,bank_E,0.35
4,7,Asos - Online Fashion Shopping,55.00,Shopping,2023-10-01,October,14507,spending,paid_out,bank_B,bank_E,0.37,0.31,1,bank_E,0.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1699,2410,Credit Card Payment - American Express **** 5678,175.00,Credit Card Payment,2023-10-01,October,17647,spending,paid_out,Intrnl,bank_D,0.66,0.36,4,bank_D,0.70
1700,2412,Home Improvement - Paint and Supplies,110.75,Home Improvement,2023-10-01,October,17031,spending,paid_out,bank_E,bank_A,0.43,0.46,2,bank_A,0.58
1701,2413,Rent Payment,1200.00,Housing,2023-10-01,October,17699,spending,paid_out,bank_D,bank_A,0.64,0.38,3,bank_A,0.66
1702,2414,Credit Card Payment - Mastercard **** 6789,150.50,Credit Card Payment,2023-10-01,October,17390,spending,paid_out,Intrnl,bank_D,0.66,0.68,4,bank_D,0.45


In [7]:
# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

# Index Sets
n_banks = 5
n_daily_cases = np.shape(dayone)[0]
Cases = range(n_daily_cases)
Banks = range(n_banks)

# Problem set-up
prob = xp.problem('fraud')

# Decision Variable
invstg = np.array([xp.var(vartype=xp.binary, name='invstg_{0}_{1}'.format(i+1, j+1))
                    for i in Cases for j in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)

print(invstg)

prob.addVariable(invstg)

Using the license file found in your Xpress installation. If you want to use this license and no longer want to see this message, use the following code before using the xpress module:
  xpress.init('C:/xpressmp/bin/xpauth.xpr')
[[invstg_1_1 invstg_1_2 invstg_1_3 invstg_1_4 invstg_1_5]
 [invstg_2_1 invstg_2_2 invstg_2_3 invstg_2_4 invstg_2_5]
 [invstg_3_1 invstg_3_2 invstg_3_3 invstg_3_4 invstg_3_5]
 ...
 [invstg_1702_1 invstg_1702_2 invstg_1702_3 invstg_1702_4 invstg_1702_5]
 [invstg_1703_1 invstg_1703_2 invstg_1703_3 invstg_1703_4 invstg_1703_5]
 [invstg_1704_1 invstg_1704_2 invstg_1704_3 invstg_1704_4 invstg_1704_5]]


### Simplified constraints: no external investigators, only bank_from can investigate

In [8]:
# Constraints

for b in Banks:
    prob.addConstraint(
        sum(invstg[i,b] * time[dayone["priority"][i]-1] for i in Cases) <= teamsize[b])

for i in Cases:
    for b in Banks:
        if dayone["bank_from"][i] != bank_names[b]:
            invstg[i,b] = 0



In [9]:
# Objective function: minimising loss of non-investigated cases

prob.setObjective(xp.Sum(dayone["transac_prob"][i] * dayone["customer_prob"][i] * dayone["Amount"][i] * (xp.Sum(invstg[i,b] for b in Banks)) for i in Cases), 
                  sense = xp.maximize)

In [10]:
prob.solve()

FICO Xpress v9.2.2, Hyper, solve started 17:34:02, Nov 18, 2023
Heap usage: 2745KB (peak 2825KB, 232KB system)
Maximizing MILP noname using up to 8 threads and up to 15GB memory, with these control settings:
OUTPUTLOG = 1
Original problem has:
         5 rows         8520 cols         8520 elements      8520 entities
Presolved problem has:
         5 rows          358 cols          358 elements       358 entities
LP relaxation tightened
Presolve finished in 0 seconds
Heap usage: 5905KB (peak 8906KB, 232KB system)

Coefficient range                    original                 solved        
  Coefficients   [min,max] : [ 2.50e-01,  2.00e+00] / [ 1.25e-01,  1.00e+00]
  RHS and bounds [min,max] : [ 1.00e+00,  1.20e+01] / [ 1.00e+00,  6.00e+00]
  Objective      [min,max] : [ 2.02e-01,  9.05e+02] / [ 1.30e+01,  9.05e+02]
Autoscaling applied standard scaling

Will try to keep branch and bound tree memory usage below 8.6GB
Fixed 273 (of 273) columns in 4 (of 4) subproblems

 *** Solution foun

(<SolveStatus.COMPLETED: 3>, <SolStatus.OPTIMAL: 1>)

In [11]:
print(f'The objective function value is {prob.getObjVal()}') 

The objective function value is 36104.670000000006


In [12]:
invstg_df = pd.DataFrame(data = prob.getSolution(invstg), index = Cases, columns = bank_names)

# This dataframe shows which bank investigated which case.

# NOTE: The indexing on this dataframe is not the transaction_id! It corresponds to the index of the original dayone df.
# Hence for extra clarity, I append a column of the actual transaction ID to the investigated cases df.
invstg_df = invstg_df.join(dayone["transaction_id"])

# Extra code for aesthetics: moving transaction_id to the left
cols = ['transaction_id'] + bank_names
invstg_df = invstg_df[cols]

# All cases which were actually investigated
invstg_df_yes = invstg_df[invstg_df[bank_names].sum(axis=1) == 1]

# # Uncomment the following code to display ALL invstg_df decision variables:
# invstg_df_disp = invstg_df.style.set_caption('Investigation Decisions').format(precision=0)
# display(invstg_df_disp)


# Uncomment the following code to display all the cases INVESTIGATED in a nicely-formatted style:
# invstg_df_yes_disp = invstg_df_yes.style.set_caption('Cases Investigated').format(precision=0)
# display(invstg_df_yes_disp)

### Naive model success rate, and comparison with Perfect Information case 

In [13]:
# Actual money lost from fraud: 
d1frauds = dayone.loc[dayone["transaction_id"].isin(fraud["transaction_id"])] # find all the rows in 'dayone' which are frauds by matching transaction_id

d1frauds = d1frauds.loc[~d1frauds["transaction_id"].isin(invstg_df_yes["transaction_id"])] # filter out all the rows which were investigated

# print out all the frauds that were caught:
d1frauds.loc[d1frauds["transaction_id"].isin(invstg_df_yes["transaction_id"])]
# An empty data frame is printed. I.e. NONE of the actual fraud cases were caught.

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,bank_from,transac_prob,description_prob,priority,home_bank,customer_prob


In [14]:
money_lost = d1frauds["Amount"].sum(axis=0)
money_lost

# Under perfect information, you would have investigated and found £1418.75 of scams

1418.7500000000002

## Model 2: GOALS:
### - Day one: minimising the expected loss
### - Subsequent days: minimising the actual loss of all previous days
### - Constraints: 50/50 split for all investigations involving two banks, external investigators included, international tasks completed within one day (by having multiple people work on it)
### - Decision variable: Weights on probabilities

## Currently, everything below has the correct constraints, but only minimises expected loss on day one, and has decision variable as binary. Hence, what needs to be done is (1) do the model for subsequent days, and (2) change the decision variable to be based on some updated weights on the probabilities

In [15]:
# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

# Index Sets
n_banks = 5
n_daily_cases = np.shape(dayone)[0]
Cases = range(n_daily_cases)
Banks = range(n_banks)

# Problem set-up
prob2 = xp.problem('fraud2')

# Decision Variable


# z forces the 50:50 split
z = np.array([xp.var(vartype=xp.binary, name='z_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)

# no. of external investigators hired by bank b
ext = np.array([xp.var(vartype=xp.binary, name='ext_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)



# print(invstg)

# indicator variable for when a case is shared
shared = np.array([xp.var(vartype=xp.binary, name='shared_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)
solo = np.array([xp.var(vartype=xp.binary, name='solo_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)

prob2.addVariable(z, ext, shared, solo)


In [16]:
# Constraints
for i in Cases:
    for b in Banks:
        if (dayone["bank_from"][i] != bank_names[b]) & (dayone["bank_to"][i] != bank_names[b]):
            prob2.addConstraint(shared[i,b] == 0)
            prob2.addConstraint(solo[i,b] == 0)
            prob2.addConstraint(ext[i,b] == 0)
        elif (dayone["bank_to"][i] == "Intrnl") & (dayone["bank_from"][i] == bank_names[b]):
            prob2.addConstraint(shared[i,b] == 0)
            prob2.addConstraint(solo[i,b] == z[i])
            prob2.addConstraint(ext[i,b] <= solo[i,b])
        elif (dayone["bank_to"][i] == bank_names[b]) & (dayone["bank_from"][i] == bank_names[b]):
            prob2.addConstraint(shared[i,b] == 0)
            prob2.addConstraint(solo[i,b] == z[i])
            prob2.addConstraint(ext[i,b] <= solo[i,b])
        else:
            prob2.addConstraint(shared[i,b] == z[i]) # this forces the 50:50 split
            prob2.addConstraint(solo[i,b] == 0)
            prob2.addConstraint(ext[i,b] <= shared[i,b])

for b in Banks:
    prob2.addConstraint(
        sum(shared[i,b] * time[dayone["priority"][i]-1]/2 for i in Cases) +
         sum(solo[i,b] * time[dayone["priority"][i]-1] for i in Cases) <= 
         teamsize[b] + sum(ext[i,b] * time[dayone["priority"][i]-1] for i in Cases))


In [17]:
# Objective function: maximising gain of amount investigated

prob2.setObjective(xp.Sum(dayone["transac_prob"][i] * dayone["customer_prob"][i] * dayone["Amount"][i] * z[i] for i in Cases) - 
                   xp.Sum(ext[i,b] * ext_cost[dayone["priority"][i] - 1] for i in Cases for b in Banks), 
                   sense = xp.maximize)

prob2.write("problem2 dayone","lp")

prob2.solve()

FICO Xpress v9.2.2, Hyper, solve started 17:34:05, Nov 18, 2023
Heap usage: 14MB (peak 14MB, 5577KB system)
Maximizing MILP noname using up to 8 threads and up to 15GB memory, with these control settings:
OUTPUTLOG = 1
Original problem has:
     25565 rows        27264 cols        57166 elements     27264 entities
Presolved problem has:
      2643 rows         4246 cols        10841 elements      4246 entities
LP relaxation tightened
Presolve finished in 0 seconds
Heap usage: 18MB (peak 36MB, 5577KB system)

Coefficient range                    original                 solved        
  Coefficients   [min,max] : [ 1.25e-01,  2.00e+00] / [ 6.25e-02,  1.00e+00]
  RHS and bounds [min,max] : [ 1.00e+00,  1.20e+01] / [ 1.00e+00,  2.70e+01]
  Objective      [min,max] : [ 2.02e-01,  9.05e+02] / [ 2.02e-01,  9.05e+02]
Autoscaling applied standard scaling

Symmetric problem: generators: 12, support set: 72
 Number of orbits: 36, largest orbit: 2
 Row orbits: 24, row support: 48
Will try to keep

(<SolveStatus.COMPLETED: 3>, <SolStatus.OPTIMAL: 1>)

In [18]:
print(f'The objective function value is {prob2.getObjVal()}') 

The objective function value is 171803.1344750016


In [19]:
ext_df = pd.DataFrame(data = prob2.getSolution(ext), index = Cases, columns = bank_names)
shared_df = pd.DataFrame(data = prob2.getSolution(shared), index = Cases, columns = bank_names)
solo_df = pd.DataFrame(data = prob2.getSolution(solo), index = Cases, columns = bank_names)
z_df = pd.DataFrame(data = prob2.getSolution(z), index = Cases)

# This dataframe shows which bank SHARED investigations.

# NOTE: The indexing on this dataframe is not the transaction_id! It corresponds to the index of the original dayone df.
# Hence for extra clarity, I append a column of the actual transaction ID to the investigated cases df.
shared_df = shared_df.join(dayone["transaction_id"])

# Extra code for aesthetics: moving transaction_id to the left
cols = ['transaction_id'] + bank_names
shared_df = shared_df[cols]

# All cases which were actually investigated
shared_df_yes = shared_df[shared_df[bank_names].sum(axis=1) >= 1]

# # Uncomment the following code to display ALL decision variables:
#shared_df_disp = shared_df.style.set_caption('Investigation Decisions').format(precision=0)
#display(shared_df_disp)

# Uncomment the following code to display all the cases INVESTIGATED in a nicely-formatted style:
# shared_df_yes_disp = shared_df_yes.style.set_caption('Cases Investigated').format(precision=0)
# display(shared_df_yes_disp)

# This dataframe shows which bank did a SOLO investigation
solo_df = solo_df.join(dayone["transaction_id"])

# Extra code for aesthetics: moving transaction_id to the left
cols = ['transaction_id'] + bank_names
solo_df = solo_df[cols]

# All cases which were actually investigated
# solo_df_yes = solo_df[solo_df[bank_names].sum(axis=1) >= 1]

# # Uncomment the following code to display ALL solo decision variables:
#solo_df_disp = solo_df.style.set_caption('Investigation Decisions').format(precision=0)
#display(solo_df_disp)


# Uncomment the following code to display all the cases INVESTIGATED in a nicely-formatted style:
# solo_df_yes_disp = solo_df_yes.style.set_caption('Cases Investigated').format(precision=0)
# display(solo_df_yes_disp)

In [20]:
sum(prob2.getSolution(ext))

array([114., 113.,  99.,  96.,  89.])

## Day Two

In [21]:
# Perfect info on day one
dayone["is_scam"] = dayone["transaction_id"].isin(fraud["transaction_id"])
dayone["is_scam"] = dayone["is_scam"].astype(int)

In [22]:
# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

# Index Sets
n_banks = 5
n_daily_cases = np.shape(dayone)[0]
Cases = range(n_daily_cases)
Banks = range(n_banks)

# Problem set-up
prob_p_1 = xp.problem('prob_p_1')

# Decision Variable


# z forces the 50:50 split
z = np.array([xp.var(vartype=xp.binary, name='z_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)

# no. of external investigators hired by bank b
ext = np.array([xp.var(vartype=xp.binary, name='ext_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)

# indicator variable for when a case is shared
shared = np.array([xp.var(vartype=xp.binary, name='shared_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)
solo = np.array([xp.var(vartype=xp.binary, name='solo_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)
m_1 = 14000
transact_w = xp.var(vartype = xp.continuous, name='transact_w')
cust_w = xp.var(vartype = xp.continuous, name='cust_w')
amount_w = xp.var(vartype = xp.continuous, name='amount_w')
thres = xp.var(vartype = xp.continuous, name='thres')

aux = np.array([xp.var(vartype = xp.integer, name='aux_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)
aux2 = np.array([xp.var(vartype = xp.integer, name='aux2_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)


prob_p_1.addVariable(z, ext, shared, solo, transact_w, cust_w, amount_w, thres, aux, aux2)


In [23]:
# Constraints

prob_sum = transact_w + cust_w + amount_w == 1
invest1 = [thres - dayone["transac_prob"][i] * transact_w - dayone["customer_prob"][i] * cust_w - dayone["Amount"][i] * amount_w <= m_1*(1-aux[i]) for i in Cases]
invest2 = [z[i] <= aux[i] for i in Cases]
invest3 = [dayone["transac_prob"][i] * transact_w + dayone["customer_prob"][i] * cust_w + dayone["Amount"][i] * amount_w - thres <= m_1*(1-aux2[i]) for i in Cases]
invest4= [1 - z[i] <= aux2[i] for i in Cases]

prob_p_1.addConstraint(prob_sum, invest1, invest2, invest3, invest4)

    
for i in Cases:
    if (dayone["transac_prob"][i] <= 0.4) | (dayone["customer_prob"][i] <= 0.4):
        for b in Banks:
            prob_p_1.addConstraint(shared[i,b] == 0)
            prob_p_1.addConstraint(solo[i,b] == 0)
            prob_p_1.addConstraint(ext[i,b] == 0)

for i in Cases:
    for b in Banks:
        if (dayone["bank_from"][i] != bank_names[b]) & (dayone["bank_to"][i] != bank_names[b]):
            prob_p_1.addConstraint(shared[i,b] == 0)
            prob_p_1.addConstraint(solo[i,b] == 0)
            prob_p_1.addConstraint(ext[i,b] == 0)
        elif (dayone["bank_to"][i] == "Intrnl") & (dayone["bank_from"][i] == bank_names[b]):
            prob_p_1.addConstraint(shared[i,b] == 0)
            prob_p_1.addConstraint(solo[i,b] == z[i])
            prob_p_1.addConstraint(ext[i,b] <= solo[i,b])
        elif (dayone["bank_to"][i] == bank_names[b]) & (dayone["bank_from"][i] == bank_names[b]):
            prob_p_1.addConstraint(shared[i,b] == 0)
            prob_p_1.addConstraint(solo[i,b] == z[i])
            prob_p_1.addConstraint(ext[i,b] <= solo[i,b])
        else:
            prob_p_1.addConstraint(shared[i,b] == z[i]) # this forces the 50:50 split
            prob_p_1.addConstraint(solo[i,b] == 0)
            prob_p_1.addConstraint(ext[i,b] <= shared[i,b])

for b in Banks:
    prob_p_1.addConstraint(
        sum(shared[i,b] * time[dayone["priority"][i]-1]/2 for i in Cases) +
         sum(solo[i,b] * time[dayone["priority"][i]-1] for i in Cases) <= 
         teamsize[b] + sum(ext[i,b] * time[dayone["priority"][i]-1] for i in Cases))


In [24]:
prob_p_1.setObjective(xp.Sum(z[i] * dayone["is_scam"][i] * dayone["Amount"][i] for i in Cases) - xp.Sum(ext[i,b] * ext_cost[dayone["priority"][i] - 1] for i in Cases for b in Banks), 
                   sense = xp.maximize)


In [25]:
prob_p_1.solve()

FICO Xpress v9.2.2, Hyper, solve started 17:34:08, Nov 18, 2023
Heap usage: 22MB (peak 22MB, 14MB system)
Maximizing MILP noname using up to 8 threads and up to 15GB memory, with these control settings:
OUTPUTLOG = 1
Original problem has:
     45597 rows        30676 cols        94240 elements     30672 entities
Presolved problem has:
      3946 rows         2291 cols        15744 elements      2287 entities
LP relaxation tightened
Presolve finished in 0 seconds
Heap usage: 26MB (peak 48MB, 14MB system)

Coefficient range                    original                 solved        
  Coefficients   [min,max] : [ 1.00e-01,  1.40e+04] / [ 2.50e-03,  4.69e+01]
  RHS and bounds [min,max] : [ 1.00e+00,  1.40e+04] / [ 9.53e-03,  9.38e+01]
  Objective      [min,max] : [ 2.21e+01,  2.45e+02] / [ 4.00e+01,  2.45e+02]
Autoscaling applied Curtis-Reid scaling

Symmetric problem: generators: 2, support set: 12
 Number of orbits: 6, largest orbit: 2
 Row orbits: 8, row support: 16
Will try to keep bra

(<SolveStatus.COMPLETED: 3>, <SolStatus.OPTIMAL: 1>)

In [26]:
thres = prob_p_1.getSolution(thres)
transact_w = prob_p_1.getSolution(transact_w)
cust_w = prob_p_1.getSolution(cust_w)
amount_w = prob_p_1.getSolution(amount_w)
print(transact_w, ",", cust_w, ",", amount_w, ",", thres)

print(f'The objective function value is {prob_p_1.getObjVal()}')

ext_df = pd.DataFrame(data = prob_p_1.getSolution(ext), index = Cases, columns = bank_names)
shared_df = pd.DataFrame(data = prob_p_1.getSolution(shared), index = Cases, columns = bank_names)
solo_df = pd.DataFrame(data = prob_p_1.getSolution(solo), index = Cases, columns = bank_names)
z_df = pd.DataFrame(data = prob_p_1.getSolution(z), index = Cases, columns = ["investigated"])

0.21222147622462378 , 0.7877785237753763 , 0.0 , 0.7860114766499344
The objective function value is 369.65


In [27]:
# Investigated cases in day one based on perfect information and calculated probabilities
# test = pd.concat([dayone,z_df], axis = 1)
# test[(test["investigated"] == 1) & (test["is_scam"] == 1)]

In [28]:
# Using probabilities for day 2:
daytwo = cleaned[cleaned.date == "2023-10-02"]
daytwo = daytwo.reset_index(drop = True) 

In [29]:
def calc(row):
    return int(row["transac_prob"] * transact_w + row["customer_prob"] * cust_w + row["Amount"] * amount_w >= thres)

In [30]:
daytwo["investigated"] = daytwo.apply(calc, axis = 1)
daytwo[(daytwo["investigated"] == 1)]

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,bank_from,transac_prob,description_prob,priority,home_bank,customer_prob,investigated
165,2604,Utilities Payment - Gas and Electric,95.0,Utilities,2023-10-02,October,13834,spending,paid_out,bank_B,bank_B,0.65,0.22,2,bank_B,0.86,1
210,2654,Restaurant Dinner - Date Night,90.0,Dining Out,2023-10-02,October,14951,spending,paid_out,bank_B,bank_A,0.46,0.2,2,bank_A,0.89,1
213,2657,Utilities Payment - Electricity,80.0,Utilities,2023-10-02,October,14951,spending,paid_out,bank_D,bank_A,0.52,0.63,1,bank_A,0.89,1
327,2798,Internet Service Payment - Provider Name,70.0,Utilities,2023-10-02,October,16194,spending,paid_out,bank_C,bank_B,0.53,0.64,1,bank_B,0.88,1
1022,3653,Transfer to Vacation Fund,350.0,Transfers,2023-10-02,October,13834,spending,paid_out,bank_D,bank_B,0.56,0.15,3,bank_B,0.86,1
1034,3667,Utilities Payment - Water and Sewer,60.75,Utilities,2023-10-02,October,11970,spending,paid_out,bank_D,bank_E,0.56,0.3,1,bank_E,0.86,1


# Full model

In [31]:
cleaned["is_scam"] = cleaned["transaction_id"].isin(fraud["transaction_id"])
cleaned["is_scam"] = cleaned["is_scam"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned["is_scam"] = cleaned["transaction_id"].isin(fraud["transaction_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned["is_scam"] = cleaned["is_scam"].astype(int)


In [39]:
n_days = len(pd.unique(cleaned["date"]))

loop_len = 50
transact_w_mem = np.zeros(shape = loop_len)
cust_w_mem = np.zeros(shape = loop_len)
amount_w_mem = np.zeros(shape = loop_len)
thres_mem = np.zeros(shape = loop_len)
tot_invst = []

# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

# Index Sets
n_banks = 5
Banks = range(n_banks)


for i in range(1, loop_len): # skip day 1
        subset = cleaned[cleaned["date"] < pd.unique(cleaned["date"])[i]].reset_index(drop = True)
        n_daily_cases = np.shape(subset)[0]
        Cases = range(n_daily_cases)

        # Problem set-up
        prob_full = xp.problem('prob_full')
        prob_full.setControl('outputlog', 0) # suppress output

        #### Decision Variables forces the 50:50 split
        z = np.array([xp.var(vartype=xp.binary, name='z_{0}'.format(i+1))
                        for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)

        # no. of external investigators hired by bank b
        ext = np.array([xp.var(vartype=xp.binary, name='ext_{0}_{1}'.format(i+1, b+1))
                        for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)

        # indicator variable for when a case is shared
        shared = np.array([xp.var(vartype=xp.binary, name='shared_{0}_{1}'.format(i+1, b+1))
                        for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)
        solo = np.array([xp.var(vartype=xp.binary, name='solo_{0}_{1}'.format(i+1, b+1))
                        for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)
        m_1 = 14000
        transact_w = xp.var(vartype = xp.continuous, name='transact_w')
        cust_w = xp.var(vartype = xp.continuous, name='cust_w')
        amount_w = xp.var(vartype = xp.continuous, name='amount_w')
        thres = xp.var(vartype = xp.continuous, name='thres')

        aux = np.array([xp.var(vartype = xp.integer, name='aux_{0}'.format(i+1))
                        for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)
        aux2 = np.array([xp.var(vartype = xp.integer, name='aux2_{0}'.format(i+1))
                        for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)


        prob_full.addVariable(z, ext, shared, solo, transact_w, cust_w, amount_w, thres, aux, aux2)

        #### Constraints

        sum_probs = transact_w + cust_w + amount_w == 1

        invst_M = [thres - subset["transac_prob"][i] * transact_w - subset["customer_prob"][i] * cust_w - subset["Amount"][i] * amount_w <= m_1*(1-aux[i]) for i in Cases]
        invst_M2 = [z[i] <= aux[i] for i in Cases]
        invst_M3 = [subset["transac_prob"][i] * transact_w + subset["customer_prob"][i] * cust_w + subset["Amount"][i] * amount_w - thres <= m_1*(1-aux2[i]) for i in Cases]
        invst_M4 = [1 - z[i] <= aux2[i] for i in Cases]

        # Constraints

        prep_shared = [shared[i,b] == 0 for i in Cases for b in Banks if ((subset["transac_prob"][i] <= 0.4) | (subset["customer_prob"][i] <= 0.4))]
        prep_solo = [solo[i,b] == 0 for i in Cases for b in Banks if ((subset["transac_prob"][i] <= 0.4) | (subset["customer_prob"][i] <= 0.4))]
        prep_ext = [ext[i,b] == 0 for i in Cases for b in Banks if ((subset["transac_prob"][i] <= 0.4) | (subset["customer_prob"][i] <= 0.4))]

        not_bank1 = [shared[i,b] == 0 for i in Cases for b in Banks if ((subset["bank_from"][i] != bank_names[b]) & (subset["bank_to"][i] != bank_names[b]))]
        not_bank2 = [solo[i,b] == 0 for i in Cases for b in Banks if ((subset["bank_from"][i] != bank_names[b]) & (subset["bank_to"][i] != bank_names[b]))]
        not_bank3 = [ext[i,b] == 0 for i in Cases for b in Banks if ((subset["bank_from"][i] != bank_names[b]) & (subset["bank_to"][i] != bank_names[b]))]

        bank_intl1 = [shared[i,b] == 0 for i in Cases for b in Banks if ((subset["bank_to"][i] == "Intrnl") & (subset["bank_from"][i] == bank_names[b]))]
        bank_intl2 = [solo[i,b] == z[i] for i in Cases for b in Banks if ((subset["bank_to"][i] == "Intrnl") & (subset["bank_from"][i] == bank_names[b]))]
        bank_intl3 = [ext[i,b] <= solo[i,b] for i in Cases for b in Banks if ((subset["bank_to"][i] == "Intrnl") & (subset["bank_from"][i] == bank_names[b]))]

        bank_solo1 = [shared[i,b] == 0 for i in Cases for b in Banks if ((subset["bank_to"][i] == bank_names[b]) & (subset["bank_from"][i] == bank_names[b]))]
        bank_solo2 = [solo[i,b] == z[i] for i in Cases for b in Banks if ((subset["bank_to"][i] == bank_names[b]) & (subset["bank_from"][i] == bank_names[b]))]
        bank_solo3 = [ext[i,b] <= solo[i,b] for i in Cases for b in Banks if ((subset["bank_to"][i] == bank_names[b]) & (subset["bank_from"][i] == bank_names[b]))]

        bank_share1 = [shared[i,b] == z[i] for i in Cases for b in Banks if ((subset["bank_to"][i] == bank_names[b]) != (subset["bank_from"][i] == bank_names[b]))]
        bank_share2 = [solo[i,b] == 0 for i in Cases for b in Banks if ((subset["bank_to"][i] == bank_names[b]) != (subset["bank_from"][i] == bank_names[b]))]
        bank_share3 = [ext[i,b] <= shared[i,b] for i in Cases for b in Banks if ((subset["bank_to"][i] == bank_names[b]) != (subset["bank_from"][i] == bank_names[b]))]

        size = [xp.Sum(shared[i,b] * time[subset["priority"][i]-1]/2 for i in Cases) +
                xp.Sum(solo[i,b] * time[subset["priority"][i]-1] for i in Cases) <= 
                teamsize[b] + xp.Sum(ext[i,b] * time[subset["priority"][i]-1] for i in Cases) for b in Banks]

        prob_full.addConstraint(sum_probs, invst_M, invst_M2, invst_M3, invst_M4, prep_shared, prep_solo, prep_ext,
                                not_bank1, not_bank2, not_bank3, bank_intl1, bank_intl2, bank_intl3,
                                bank_solo1, bank_solo2, bank_solo3, bank_share1, bank_share2, bank_share3, size)

        prob_full.setObjective(xp.Sum(z[i] * subset["is_scam"][i] * subset["Amount"][i] for i in Cases) - xp.Sum(ext[i,b] * ext_cost[subset["priority"][i] - 1] for i in Cases for b in Banks), 
                        sense = xp.maximize)

        prob_full.solve()
        thres = prob_full.getSolution(thres)
        thres_mem[i] = thres
        transact_w = prob_full.getSolution(transact_w)
        transact_w_mem[i] = transact_w
        cust_w = prob_full.getSolution(cust_w)
        cust_w_mem[i] = cust_w
        amount_w = prob_full.getSolution(amount_w)
        amount_w_mem[i] = amount_w

        # Using probabilities for day 2:
        subsettwo = cleaned[cleaned["date"] == pd.unique(cleaned["date"])[i]].reset_index(drop = True)
        subsettwo = subsettwo.reset_index(drop = True) 
        subsettwo["investigated"] = subsettwo.apply(calc, axis = 1)
        tot_invst.append(subsettwo[(subsettwo["investigated"] == 1)]["transaction_id"].tolist())

        # finalinv = {i: subsettwo[(subsettwo["investigated"] == 1)]["transaction_id"].tolist() for i in range(1, loop_len)}

In [40]:
tot_invst

[[2604, 2635, 2654, 2657, 2798, 3653, 3667],
 [3844, 4028, 4066, 4067, 4081, 4094, 4095, 4119, 4128, 4607, 4788, 4978],
 [5467, 5697, 5737, 5739, 5805, 5806, 5876, 6101, 6103],
 [6448, 6449, 6450, 6755, 6783, 6800, 6801, 6840, 6850, 6858, 6859, 7063],
 [7641, 7643, 7955, 8184],
 [8434, 8524, 8552, 8597, 8901, 9073],
 [9345, 9752, 9865],
 [10067, 10068, 10612],
 [11333],
 [12281, 12282, 12739],
 [13437],
 [],
 [14741],
 [15770, 16168],
 [16569, 16571, 17061],
 [17373, 18087],
 [],
 [],
 [],
 [],
 [23137],
 [],
 [],
 [25606, 25608, 25781, 25782],
 [],
 [],
 [],
 [],
 [29967, 29973, 30576],
 [30938, 30941, 30986],
 [33208, 33551, 33564, 33712, 33919],
 [],
 [35436, 35628],
 [37175, 37201],
 [38327],
 [],
 [40378],
 [41121],
 [42934],
 [43644, 43646],
 [44147],
 [45057, 45172],
 [],
 [],
 [],
 [],
 [50228],
 [],
 [51924]]

In [41]:
print(transact_w_mem)
print(amount_w_mem)
print(cust_w_mem)
print(thres_mem)


[0.         0.19148936 0.19230769 0.2        0.2        0.13513514
 0.13513514 0.05555556 0.05555556 0.79932565 0.26666667 0.77846611
 0.80764623 0.77846611 0.75602339 0.11111111 0.44444444 0.6363458
 0.76650795 0.         0.79932565 0.         0.         0.09744516
 0.         0.6666359  0.         0.06048037 0.02360229 0.00651057
 0.10062161 0.02795292 0.02795292 0.0260961  0.32818254 0.32818254
 0.32818254 0.32818254 0.31986119 0.32818254 0.32818254 0.32173536
 0.44941194 0.39678635 0.4280101  0.36695093 0.40684218 0.28126441
 0.38706754 0.37608356]
[0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00
 0.00000000e+00 6.02098313e-05 0.00000000e+00 6.38658262e-05
 5.70475520e-05 6.38658262e-05 6.77993008e-05 0.00000000e+00
 0.00000000e+00 2.80292838e-05 6.59617010e-05 3.06269333e-05
 6.02098313e-05 3.06269333e-05 3.06269333e-05 1.17968144e-05
 3.06269333e-05 4.61517161e-05 3.06269333e-05 3.23242497e-05
 2.95869129e-0