# Data Cleaning and Exploration

In [40]:
import xpress as xp
import pandas as pd
import numpy as np
import random
import math

In [3]:
customer = pd.read_excel('231013_Customer_Base.xlsx')
fraud = pd.read_excel('231013_Fraud_Cases.xlsx')
transact = pd.read_excel('231013_Transactions_Input.xlsx')

In [4]:
# Merging Transaction dataset with Customer dataset by customer_id
full = transact.merge(customer, how = 'left', on='customer_id')

In [23]:
#### some code used for understanding the data
# transact.groupby('category')["category"].count().sort_values(ascending = False)
# print(full.iloc[:2])

# full[full.transac_prob.isna()].groupby('category')["category"].count().sort_values(ascending = False)
#### From this, we know that some of the transfers have type 'income'. These do not have transaction probabilities.

# full[full.transac_prob.isna()].groupby('In_or_Out')["category"].count().sort_values(ascending = False)
#### More generally, all Transfers, Income and Interest (and only these categories) are classified as paid_in. 
#### So an easy way to subset is to get rid of all paid_in.
#### Also, get rid of cash withdrawals.

In [5]:
# Remove all cash withdrawals, and all "paid_in" data from the dataset
cleaned = full[(full.category != "Cash Withdrawal") & (full.In_or_Out != "paid_in")]

# Modelling 

## Model 1: Using only day one, minimising the expected loss, decision variable is whether to investigate or not (binary), simpified constraints (only bank_from can investigate, no external investigators), international tasks completed within one day (by having multiple people work on it)

In [8]:
# Subsetting to only the first day
dayone = cleaned[cleaned.date == "2023-10-01"]
dayone = dayone.reset_index(drop = True) # reset the index, so the indexing is 0, 1, 2, ... 
# drop = False would have made the old index into a new column in the dataframe. We don't need that, so drop = True.
# Resetting the indexing is needed so that the index of the decision variable corresponds to the index of the dataframe; this is useful for subsequent analysis
dayone

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,bank_from,transac_prob,description_prob,priority,home_bank,customer_prob
0,2,Online Retailer - Electronics Purchase,199.99,Online Shopping,2023-10-01,October,10298,spending,paid_out,bank_A,bank_E,0.78,0.25,3,bank_E,0.49
1,3,Utilities Payment - Internet Service,60.00,Utilities,2023-10-01,October,10298,spending,paid_out,bank_C,bank_E,0.30,0.22,1,bank_E,0.49
2,4,Rent Payment,1500.00,Housing,2023-10-01,October,14507,spending,paid_out,bank_D,bank_E,0.57,0.53,3,bank_E,0.35
3,6,Online Retailer - Electronics Purchase,299.99,Online Shopping,2023-10-01,October,14507,spending,paid_out,bank_E,bank_E,0.75,0.65,3,bank_E,0.35
4,7,Asos - Online Fashion Shopping,55.00,Shopping,2023-10-01,October,14507,spending,paid_out,bank_B,bank_E,0.37,0.31,1,bank_E,0.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1699,2410,Credit Card Payment - American Express **** 5678,175.00,Credit Card Payment,2023-10-01,October,17647,spending,paid_out,Intrnl,bank_D,0.66,0.36,4,bank_D,0.70
1700,2412,Home Improvement - Paint and Supplies,110.75,Home Improvement,2023-10-01,October,17031,spending,paid_out,bank_E,bank_A,0.43,0.46,2,bank_A,0.58
1701,2413,Rent Payment,1200.00,Housing,2023-10-01,October,17699,spending,paid_out,bank_D,bank_A,0.64,0.38,3,bank_A,0.66
1702,2414,Credit Card Payment - Mastercard **** 6789,150.50,Credit Card Payment,2023-10-01,October,17390,spending,paid_out,Intrnl,bank_D,0.66,0.68,4,bank_D,0.45


In [9]:
# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

# Index Sets
n_banks = 5
n_daily_cases = np.shape(dayone)[0]
Cases = range(n_daily_cases)
Banks = range(n_banks)

# Problem set-up
prob = xp.problem('fraud')

# Decision Variable
invstg = np.array([xp.var(vartype=xp.binary, name='invstg_{0}_{1}'.format(i+1, j+1))
                    for i in Cases for j in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)

print(invstg)

prob.addVariable(invstg)

Using the license file found in your Xpress installation. If you want to use this license and no longer want to see this message, use the following code before using the xpress module:
  xpress.init('C:/xpressmp/bin/xpauth.xpr')
[[invstg_1_1 invstg_1_2 invstg_1_3 invstg_1_4 invstg_1_5]
 [invstg_2_1 invstg_2_2 invstg_2_3 invstg_2_4 invstg_2_5]
 [invstg_3_1 invstg_3_2 invstg_3_3 invstg_3_4 invstg_3_5]
 ...
 [invstg_1702_1 invstg_1702_2 invstg_1702_3 invstg_1702_4 invstg_1702_5]
 [invstg_1703_1 invstg_1703_2 invstg_1703_3 invstg_1703_4 invstg_1703_5]
 [invstg_1704_1 invstg_1704_2 invstg_1704_3 invstg_1704_4 invstg_1704_5]]


### Simplified constraints: no external investigators, only bank_from can investigate

In [10]:
# Constraints

for b in Banks:
    prob.addConstraint(
        sum(invstg[i,b] * time[dayone["priority"][i]-1] for i in Cases) <= teamsize[b])

for i in Cases:
    for b in Banks:
        if dayone["bank_from"][i] != bank_names[b]:
            invstg[i,b] = 0



In [11]:
# Objective function: minimising loss of non-investigated cases

prob.setObjective(xp.Sum(dayone["transac_prob"][i] * dayone["customer_prob"][i] * dayone["Amount"][i] * (xp.Sum(invstg[i,b] for b in Banks)) for i in Cases), 
                  sense = xp.maximize)

In [12]:
prob.solve()

FICO Xpress v9.2.2, Hyper, solve started 12:33:35, Nov 20, 2023
Heap usage: 2745KB (peak 2825KB, 232KB system)
Maximizing MILP noname using up to 8 threads and up to 15GB memory, with these control settings:
OUTPUTLOG = 1
Original problem has:
         5 rows         8520 cols         8520 elements      8520 entities
Presolved problem has:
         5 rows          358 cols          358 elements       358 entities
LP relaxation tightened
Presolve finished in 0 seconds
Heap usage: 5905KB (peak 8906KB, 232KB system)

Coefficient range                    original                 solved        
  Coefficients   [min,max] : [ 2.50e-01,  2.00e+00] / [ 1.25e-01,  1.00e+00]
  RHS and bounds [min,max] : [ 1.00e+00,  1.20e+01] / [ 1.00e+00,  6.00e+00]
  Objective      [min,max] : [ 2.02e-01,  9.05e+02] / [ 1.30e+01,  9.05e+02]
Autoscaling applied standard scaling

Will try to keep branch and bound tree memory usage below 8.6GB
Fixed 273 (of 273) columns in 4 (of 4) subproblems

 *** Solution foun

(<SolveStatus.COMPLETED: 3>, <SolStatus.OPTIMAL: 1>)

In [13]:
print(f'The objective function value is {prob.getObjVal()}') 

The objective function value is 36104.670000000006


In [14]:
invstg_df = pd.DataFrame(data = prob.getSolution(invstg), index = Cases, columns = bank_names)

# This dataframe shows which bank investigated which case.

# NOTE: The indexing on this dataframe is not the transaction_id! It corresponds to the index of the original dayone df.
# Hence for extra clarity, I append a column of the actual transaction ID to the investigated cases df.
invstg_df = invstg_df.join(dayone["transaction_id"])

# Extra code for aesthetics: moving transaction_id to the left
cols = ['transaction_id'] + bank_names
invstg_df = invstg_df[cols]

# All cases which were actually investigated
invstg_df_yes = invstg_df[invstg_df[bank_names].sum(axis=1) == 1]

# # Uncomment the following code to display ALL invstg_df decision variables:
# invstg_df_disp = invstg_df.style.set_caption('Investigation Decisions').format(precision=0)
# display(invstg_df_disp)


# Uncomment the following code to display all the cases INVESTIGATED in a nicely-formatted style:
# invstg_df_yes_disp = invstg_df_yes.style.set_caption('Cases Investigated').format(precision=0)
# display(invstg_df_yes_disp)

### Naive model success rate, and comparison with Perfect Information case 

In [15]:
# Actual money lost from fraud: 
d1frauds = dayone.loc[dayone["transaction_id"].isin(fraud["transaction_id"])] # find all the rows in 'dayone' which are frauds by matching transaction_id

d1frauds = d1frauds.loc[~d1frauds["transaction_id"].isin(invstg_df_yes["transaction_id"])] # filter out all the rows which were investigated

# print out all the frauds that were caught:
d1frauds.loc[d1frauds["transaction_id"].isin(invstg_df_yes["transaction_id"])]
# An empty data frame is printed. I.e. NONE of the actual fraud cases were caught.

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,bank_from,transac_prob,description_prob,priority,home_bank,customer_prob


In [16]:
money_lost = d1frauds["Amount"].sum(axis=0)
money_lost

# Under perfect information, you would have investigated and found £1418.75 of scams

1418.7500000000002

## Model 2: GOALS:
### - Day one: minimising the expected loss
### - Subsequent days: minimising the actual loss of all previous days
### - Constraints: 50/50 split for all investigations involving two banks, external investigators included, international tasks completed within one day (by having multiple people work on it)
### - Decision variable: Weights on probabilities

## Currently, everything below has the correct constraints, but only minimises expected loss on day one, and has decision variable as binary. Hence, what needs to be done is (1) do the model for subsequent days, and (2) change the decision variable to be based on some updated weights on the probabilities

In [17]:
# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

# Index Sets
n_banks = 5
n_daily_cases = np.shape(dayone)[0]
Cases = range(n_daily_cases)
Banks = range(n_banks)

# Problem set-up
prob2 = xp.problem('fraud2')

# Decision Variable


# z forces the 50:50 split
z = np.array([xp.var(vartype=xp.binary, name='z_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)

# no. of external investigators hired by bank b
ext = np.array([xp.var(vartype=xp.binary, name='ext_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)



# print(invstg)

# indicator variable for when a case is shared
shared = np.array([xp.var(vartype=xp.binary, name='shared_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)
solo = np.array([xp.var(vartype=xp.binary, name='solo_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)

prob2.addVariable(z, ext, shared, solo)


In [18]:
# Constraints
for i in Cases:
    for b in Banks:
        if (dayone["bank_from"][i] != bank_names[b]) & (dayone["bank_to"][i] != bank_names[b]):
            prob2.addConstraint(shared[i,b] == 0)
            prob2.addConstraint(solo[i,b] == 0)
            prob2.addConstraint(ext[i,b] == 0)
        elif (dayone["bank_to"][i] == "Intrnl") & (dayone["bank_from"][i] == bank_names[b]):
            prob2.addConstraint(shared[i,b] == 0)
            prob2.addConstraint(solo[i,b] == z[i])
            prob2.addConstraint(ext[i,b] <= solo[i,b])
        elif (dayone["bank_to"][i] == bank_names[b]) & (dayone["bank_from"][i] == bank_names[b]):
            prob2.addConstraint(shared[i,b] == 0)
            prob2.addConstraint(solo[i,b] == z[i])
            prob2.addConstraint(ext[i,b] <= solo[i,b])
        else:
            prob2.addConstraint(shared[i,b] == z[i]) # this forces the 50:50 split
            prob2.addConstraint(solo[i,b] == 0)
            prob2.addConstraint(ext[i,b] <= shared[i,b])

for b in Banks:
    prob2.addConstraint(
        sum(shared[i,b] * time[dayone["priority"][i]-1]/2 for i in Cases) +
         sum(solo[i,b] * time[dayone["priority"][i]-1] for i in Cases) <= 
         teamsize[b] + sum(ext[i,b] * time[dayone["priority"][i]-1] for i in Cases))


In [19]:
# Objective function: maximising gain of amount investigated

prob2.setObjective(xp.Sum(dayone["transac_prob"][i] * dayone["customer_prob"][i] * dayone["Amount"][i] * z[i] for i in Cases) - 
                   xp.Sum(ext[i,b] * ext_cost[dayone["priority"][i] - 1] for i in Cases for b in Banks), 
                   sense = xp.maximize)

prob2.write("problem2 dayone","lp")

prob2.solve()

FICO Xpress v9.2.2, Hyper, solve started 12:33:41, Nov 20, 2023
Heap usage: 14MB (peak 14MB, 5577KB system)
Maximizing MILP noname using up to 8 threads and up to 15GB memory, with these control settings:
OUTPUTLOG = 1
Original problem has:
     25565 rows        27264 cols        57166 elements     27264 entities
Presolved problem has:
      2643 rows         4246 cols        10841 elements      4246 entities
LP relaxation tightened
Presolve finished in 0 seconds
Heap usage: 18MB (peak 36MB, 5577KB system)

Coefficient range                    original                 solved        
  Coefficients   [min,max] : [ 1.25e-01,  2.00e+00] / [ 6.25e-02,  1.00e+00]
  RHS and bounds [min,max] : [ 1.00e+00,  1.20e+01] / [ 1.00e+00,  2.70e+01]
  Objective      [min,max] : [ 2.02e-01,  9.05e+02] / [ 2.02e-01,  9.05e+02]
Autoscaling applied standard scaling

Symmetric problem: generators: 12, support set: 72
 Number of orbits: 36, largest orbit: 2
 Row orbits: 24, row support: 48
Will try to keep

(<SolveStatus.COMPLETED: 3>, <SolStatus.OPTIMAL: 1>)

In [20]:
print(f'The objective function value is {prob2.getObjVal()}') 

The objective function value is 171803.1344750016


In [21]:
ext_df = pd.DataFrame(data = prob2.getSolution(ext), index = Cases, columns = bank_names)
shared_df = pd.DataFrame(data = prob2.getSolution(shared), index = Cases, columns = bank_names)
solo_df = pd.DataFrame(data = prob2.getSolution(solo), index = Cases, columns = bank_names)
z_df = pd.DataFrame(data = prob2.getSolution(z), index = Cases)

# This dataframe shows which bank SHARED investigations.

# NOTE: The indexing on this dataframe is not the transaction_id! It corresponds to the index of the original dayone df.
# Hence for extra clarity, I append a column of the actual transaction ID to the investigated cases df.
shared_df = shared_df.join(dayone["transaction_id"])

# Extra code for aesthetics: moving transaction_id to the left
cols = ['transaction_id'] + bank_names
shared_df = shared_df[cols]

# All cases which were actually investigated
shared_df_yes = shared_df[shared_df[bank_names].sum(axis=1) >= 1]

# # Uncomment the following code to display ALL decision variables:
#shared_df_disp = shared_df.style.set_caption('Investigation Decisions').format(precision=0)
#display(shared_df_disp)

# Uncomment the following code to display all the cases INVESTIGATED in a nicely-formatted style:
# shared_df_yes_disp = shared_df_yes.style.set_caption('Cases Investigated').format(precision=0)
# display(shared_df_yes_disp)

# This dataframe shows which bank did a SOLO investigation
solo_df = solo_df.join(dayone["transaction_id"])

# Extra code for aesthetics: moving transaction_id to the left
cols = ['transaction_id'] + bank_names
solo_df = solo_df[cols]

# All cases which were actually investigated
# solo_df_yes = solo_df[solo_df[bank_names].sum(axis=1) >= 1]

# # Uncomment the following code to display ALL solo decision variables:
#solo_df_disp = solo_df.style.set_caption('Investigation Decisions').format(precision=0)
#display(solo_df_disp)


# Uncomment the following code to display all the cases INVESTIGATED in a nicely-formatted style:
# solo_df_yes_disp = solo_df_yes.style.set_caption('Cases Investigated').format(precision=0)
# display(solo_df_yes_disp)

In [22]:
sum(prob2.getSolution(ext))

array([114., 113.,  99.,  96.,  89.])

## Day Two

In [23]:
# Perfect info on day one
dayone["is_scam"] = dayone["transaction_id"].isin(fraud["transaction_id"])
dayone["is_scam"] = dayone["is_scam"].astype(int)

In [24]:
# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

# Index Sets
n_banks = 5
n_daily_cases = np.shape(dayone)[0]
Cases = range(n_daily_cases)
Banks = range(n_banks)

# Problem set-up
prob_p_1 = xp.problem('prob_p_1')

# Decision Variable


# z forces the 50:50 split
z = np.array([xp.var(vartype=xp.binary, name='z_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)

# no. of external investigators hired by bank b
ext = np.array([xp.var(vartype=xp.binary, name='ext_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)

# indicator variable for when a case is shared
shared = np.array([xp.var(vartype=xp.binary, name='shared_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)
solo = np.array([xp.var(vartype=xp.binary, name='solo_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)
m_1 = 14000
transact_w = xp.var(vartype = xp.continuous, name='transact_w')
cust_w = xp.var(vartype = xp.continuous, name='cust_w')
amount_w = xp.var(vartype = xp.continuous, name='amount_w')
thres = xp.var(vartype = xp.continuous, name='thres')

aux = np.array([xp.var(vartype = xp.integer, name='aux_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)
aux2 = np.array([xp.var(vartype = xp.integer, name='aux2_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)


prob_p_1.addVariable(z, ext, shared, solo, transact_w, cust_w, amount_w, thres, aux, aux2)


In [25]:
# Constraints

prob_sum = transact_w + cust_w + amount_w == 1
invest1 = [thres - dayone["transac_prob"][i] * transact_w - dayone["customer_prob"][i] * cust_w - dayone["Amount"][i] * amount_w <= m_1*(1-aux[i]) for i in Cases]
invest2 = [z[i] <= aux[i] for i in Cases]
invest3 = [dayone["transac_prob"][i] * transact_w + dayone["customer_prob"][i] * cust_w + dayone["Amount"][i] * amount_w - thres <= m_1*(1-aux2[i]) for i in Cases]
invest4= [1 - z[i] <= aux2[i] for i in Cases]

prob_p_1.addConstraint(prob_sum, invest1, invest2, invest3, invest4)

    
for i in Cases:
    if (dayone["transac_prob"][i] <= 0.4) | (dayone["customer_prob"][i] <= 0.4):
        for b in Banks:
            prob_p_1.addConstraint(shared[i,b] == 0)
            prob_p_1.addConstraint(solo[i,b] == 0)
            prob_p_1.addConstraint(ext[i,b] == 0)

for i in Cases:
    for b in Banks:
        if (dayone["bank_from"][i] != bank_names[b]) & (dayone["bank_to"][i] != bank_names[b]):
            prob_p_1.addConstraint(shared[i,b] == 0)
            prob_p_1.addConstraint(solo[i,b] == 0)
            prob_p_1.addConstraint(ext[i,b] == 0)
        elif (dayone["bank_to"][i] == "Intrnl") & (dayone["bank_from"][i] == bank_names[b]):
            prob_p_1.addConstraint(shared[i,b] == 0)
            prob_p_1.addConstraint(solo[i,b] == z[i])
            prob_p_1.addConstraint(ext[i,b] <= solo[i,b])
        elif (dayone["bank_to"][i] == bank_names[b]) & (dayone["bank_from"][i] == bank_names[b]):
            prob_p_1.addConstraint(shared[i,b] == 0)
            prob_p_1.addConstraint(solo[i,b] == z[i])
            prob_p_1.addConstraint(ext[i,b] <= solo[i,b])
        else:
            prob_p_1.addConstraint(shared[i,b] == z[i]) # this forces the 50:50 split
            prob_p_1.addConstraint(solo[i,b] == 0)
            prob_p_1.addConstraint(ext[i,b] <= shared[i,b])

for b in Banks:
    prob_p_1.addConstraint(
        sum(shared[i,b] * time[dayone["priority"][i]-1]/2 for i in Cases) +
         sum(solo[i,b] * time[dayone["priority"][i]-1] for i in Cases) <= 
         teamsize[b] + sum(ext[i,b] * time[dayone["priority"][i]-1] for i in Cases))


In [26]:
prob_p_1.setObjective(xp.Sum(z[i] * dayone["is_scam"][i] * dayone["Amount"][i] for i in Cases) - xp.Sum(ext[i,b] * ext_cost[dayone["priority"][i] - 1] for i in Cases for b in Banks), 
                   sense = xp.maximize)


In [27]:
prob_p_1.solve()

FICO Xpress v9.2.2, Hyper, solve started 12:33:45, Nov 20, 2023
Heap usage: 22MB (peak 22MB, 14MB system)
Maximizing MILP noname using up to 8 threads and up to 15GB memory, with these control settings:
OUTPUTLOG = 1
Original problem has:
     45597 rows        30676 cols        94240 elements     30672 entities
Presolved problem has:
      3946 rows         2291 cols        15744 elements      2287 entities
LP relaxation tightened
Presolve finished in 0 seconds
Heap usage: 26MB (peak 48MB, 14MB system)

Coefficient range                    original                 solved        
  Coefficients   [min,max] : [ 1.00e-01,  1.40e+04] / [ 2.50e-03,  4.69e+01]
  RHS and bounds [min,max] : [ 1.00e+00,  1.40e+04] / [ 9.53e-03,  9.38e+01]
  Objective      [min,max] : [ 2.21e+01,  2.45e+02] / [ 4.00e+01,  2.45e+02]
Autoscaling applied Curtis-Reid scaling

Symmetric problem: generators: 2, support set: 12
 Number of orbits: 6, largest orbit: 2
 Row orbits: 8, row support: 16
Will try to keep bra

(<SolveStatus.COMPLETED: 3>, <SolStatus.OPTIMAL: 1>)

In [28]:
thres = prob_p_1.getSolution(thres)
transact_w = prob_p_1.getSolution(transact_w)
cust_w = prob_p_1.getSolution(cust_w)
amount_w = prob_p_1.getSolution(amount_w)
print(transact_w, ",", cust_w, ",", amount_w, ",", thres)

print(f'The objective function value is {prob_p_1.getObjVal()}')

ext_df = pd.DataFrame(data = prob_p_1.getSolution(ext), index = Cases, columns = bank_names)
shared_df = pd.DataFrame(data = prob_p_1.getSolution(shared), index = Cases, columns = bank_names)
solo_df = pd.DataFrame(data = prob_p_1.getSolution(solo), index = Cases, columns = bank_names)
z_df = pd.DataFrame(data = prob_p_1.getSolution(z), index = Cases, columns = ["investigated"])

0.21222147622462378 , 0.7877785237753763 , 0.0 , 0.7860114766499344
The objective function value is 369.65


In [29]:
# Investigated cases in day one based on perfect information and calculated probabilities
# test = pd.concat([dayone,z_df], axis = 1)
# test[(test["investigated"] == 1) & (test["is_scam"] == 1)]

In [30]:
# Using probabilities for day 2:
daytwo = cleaned[cleaned.date == "2023-10-02"]
daytwo = daytwo.reset_index(drop = True) 

In [31]:
def calc(row):
    return int(row["transac_prob"] * transact_w + row["customer_prob"] * cust_w + row["Amount"] * amount_w >= thres)

In [32]:
daytwo["investigated"] = daytwo.apply(calc, axis = 1)
daytwo[(daytwo["investigated"] == 1)]

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,bank_from,transac_prob,description_prob,priority,home_bank,customer_prob,investigated
165,2604,Utilities Payment - Gas and Electric,95.0,Utilities,2023-10-02,October,13834,spending,paid_out,bank_B,bank_B,0.65,0.22,2,bank_B,0.86,1
210,2654,Restaurant Dinner - Date Night,90.0,Dining Out,2023-10-02,October,14951,spending,paid_out,bank_B,bank_A,0.46,0.2,2,bank_A,0.89,1
213,2657,Utilities Payment - Electricity,80.0,Utilities,2023-10-02,October,14951,spending,paid_out,bank_D,bank_A,0.52,0.63,1,bank_A,0.89,1
327,2798,Internet Service Payment - Provider Name,70.0,Utilities,2023-10-02,October,16194,spending,paid_out,bank_C,bank_B,0.53,0.64,1,bank_B,0.88,1
1022,3653,Transfer to Vacation Fund,350.0,Transfers,2023-10-02,October,13834,spending,paid_out,bank_D,bank_B,0.56,0.15,3,bank_B,0.86,1
1034,3667,Utilities Payment - Water and Sewer,60.75,Utilities,2023-10-02,October,11970,spending,paid_out,bank_D,bank_E,0.56,0.3,1,bank_E,0.86,1


# Full model

In [6]:
cleaned["is_scam"] = cleaned["transaction_id"].isin(fraud["transaction_id"]) # Making boolean of True or False if its in the fraud database
cleaned["is_scam"] = cleaned["is_scam"].astype(int) # Converting boolean into 0 and 1

one_hot = pd.get_dummies(cleaned['category']) # Turn the categories into dummies
one_hot = one_hot.astype(int) # Change dummies into integers
full = cleaned.join(one_hot) # Add Dummies to full dataset
full = full.reset_index(drop = True) # Reset index of full dataset

full["day"] = (full["date"] - full["date"][0]).dt.days # "day" indicates which day


#full = full[full["day"] <= 30] # Uncomment to run the code on a subset of the model

diction = {1: 0.25, # To convert priority into time spent investigating
           2: 0.5,
           3: 1,
           4: 2
}

full["time_spent"] = full["priority"] # Temporary
full = full.replace({"time_spent":diction}) # Replace priority with time spent using dictionary
full

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned["is_scam"] = cleaned["transaction_id"].isin(fraud["transaction_id"]) # Making boolean of True or False if its in the fraud database
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned["is_scam"] = cleaned["is_scam"].astype(int) # Converting boolean into 0 and 1


Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,...,Loan Payment,Online Shopping,Personal care,Shopping,Streaming Services,Transfers,Transportation,Utilities,day,time_spent
0,2,Online Retailer - Electronics Purchase,199.99,Online Shopping,2023-10-01,October,10298,spending,paid_out,bank_A,...,0,1,0,0,0,0,0,0,0,1.00
1,3,Utilities Payment - Internet Service,60.00,Utilities,2023-10-01,October,10298,spending,paid_out,bank_C,...,0,0,0,0,0,0,0,1,0,0.25
2,4,Rent Payment,1500.00,Housing,2023-10-01,October,14507,spending,paid_out,bank_D,...,0,0,0,0,0,0,0,0,0,1.00
3,6,Online Retailer - Electronics Purchase,299.99,Online Shopping,2023-10-01,October,14507,spending,paid_out,bank_E,...,0,1,0,0,0,0,0,0,0,1.00
4,7,Asos - Online Fashion Shopping,55.00,Shopping,2023-10-01,October,14507,spending,paid_out,bank_B,...,0,0,0,1,0,0,0,0,0,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238152,315177,Home Improvement Store - DIY Supplies,75.40,Home Improvement,2024-07-31,July,11908,spending,paid_out,bank_D,...,0,0,0,0,0,0,0,0,304,0.25
238153,315178,Grocery Shopping - Super Value Mart,55.25,Groceries,2024-07-31,July,15026,spending,paid_out,bank_C,...,0,0,0,0,0,0,0,0,304,0.25
238154,315179,JCPenney - Discount Fashion,40.00,Shopping,2024-07-31,July,11908,spending,paid_out,bank_D,...,0,0,0,1,0,0,0,0,304,0.25
238155,315180,Gas Station - Fuel Purchase,40.45,Transportation,2024-07-31,July,10291,spending,paid_out,bank_D,...,0,0,0,0,0,0,1,0,304,0.25


In [7]:
# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

n_priority = len(time)
Priorities = range(n_priority)

# Index Sets
n_days = len(pd.unique(full["date"]))
Days = range(n_days)
n_banks = 5
Banks = range(n_banks)
n_daily_cases = np.shape(full)[0]
Cases = range(n_daily_cases)

Categories = range(np.shape(one_hot)[1])

# Now we want to find the time taken to investigate any given case, based on their priority and based on whether it is shared.
temp = np.zeros((n_daily_cases, 6))
temp = pd.DataFrame(temp, columns = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E', 'Intrnl']) 

# for index, row in full.iterrows():
#     if (row["bank_from"] == row["bank_to"]):
#         temp[row]

# for i in range(n_daily_cases):
#     if (full["bank_from"][i] == full["bank_to"][i]):
#             temp[full["bank_from"][i]][i] = time[full["priority"][i] - 1]
#     elif full["bank_to"][i] == "Intrnl":
#             temp[full["bank_from"][i]][i] = time[full["priority"][i] - 1]
#     else:
#             temp[full["bank_from"][i]][i] = time[full["priority"][i] - 1]/2
#             temp[full["bank_to"][i]][i] = time[full["priority"][i] - 1]/2
# temp
# full = full.join(temp)
# temp

bank_to_df = temp.reset_index()
bank_to_df.update(full.pivot(columns = 'bank_to', values = 'time_spent')) # Pivot to get the time spent for each bank_to
bank_to_df = bank_to_df.set_index('index').rename_axis(None)
bank_to_df

bank_from_df = temp.reset_index()
bank_from_df.update(full.pivot(columns = 'bank_from', values = 'time_spent')) # Pivot to get the time spent for each bank_from
bank_from_df = bank_from_df.set_index('index').rename_axis(None)

ultimate = bank_to_df/2 + bank_from_df/2 # Halve and add them: this forces the 50:50 split for sharing cases

ultimate.loc[ultimate["Intrnl"] > 0, :] *= 2 # Multiply the time taken by 2 for any bank which investigates alone when bank_to is International

full = full.join(ultimate) # Combine with full dataset
full

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,...,Transportation,Utilities,day,time_spent,bank_A,bank_B,bank_C,bank_D,bank_E,Intrnl
0,2,Online Retailer - Electronics Purchase,199.99,Online Shopping,2023-10-01,October,10298,spending,paid_out,bank_A,...,0,0,0,1.00,0.500,0.000,0.000,0.000,0.500,0.0
1,3,Utilities Payment - Internet Service,60.00,Utilities,2023-10-01,October,10298,spending,paid_out,bank_C,...,0,1,0,0.25,0.000,0.000,0.125,0.000,0.125,0.0
2,4,Rent Payment,1500.00,Housing,2023-10-01,October,14507,spending,paid_out,bank_D,...,0,0,0,1.00,0.000,0.000,0.000,0.500,0.500,0.0
3,6,Online Retailer - Electronics Purchase,299.99,Online Shopping,2023-10-01,October,14507,spending,paid_out,bank_E,...,0,0,0,1.00,0.000,0.000,0.000,0.000,1.000,0.0
4,7,Asos - Online Fashion Shopping,55.00,Shopping,2023-10-01,October,14507,spending,paid_out,bank_B,...,0,0,0,0.25,0.000,0.125,0.000,0.000,0.125,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238152,315177,Home Improvement Store - DIY Supplies,75.40,Home Improvement,2024-07-31,July,11908,spending,paid_out,bank_D,...,0,0,304,0.25,0.125,0.000,0.000,0.125,0.000,0.0
238153,315178,Grocery Shopping - Super Value Mart,55.25,Groceries,2024-07-31,July,15026,spending,paid_out,bank_C,...,0,0,304,0.25,0.000,0.125,0.125,0.000,0.000,0.0
238154,315179,JCPenney - Discount Fashion,40.00,Shopping,2024-07-31,July,11908,spending,paid_out,bank_D,...,0,0,304,0.25,0.125,0.000,0.000,0.125,0.000,0.0
238155,315180,Gas Station - Fuel Purchase,40.45,Transportation,2024-07-31,July,10291,spending,paid_out,bank_D,...,1,0,304,0.25,0.000,0.000,0.000,0.250,0.000,0.0


In [8]:
# Problem set-up
prob_full = xp.problem('prob_full')
prob_full.setControl('outputlog', 0) # suppress output

weight_f = np.array([xp.var(vartype = xp.continuous, name = 'weight_{0}'.format(i+1), lb = -xp.infinity) 
                    for i in Categories], dtype = xp.npvar) # now with negative weights possible. Later defined to be <= 0.9

transact_w_f = xp.var(vartype = xp.continuous, name='transact_w_f') # 0 to 1
cust_w_f = xp.var(vartype = xp.continuous, name='cust_w_f') # 0 to 1
amount_w_f = xp.var(vartype = xp.continuous, name='amount_w_f')

combined = np.array([xp.var(vartype = xp.continuous, name = 'combined_{0}'.format(i+1))
              for i in Cases], dtype = xp.npvar) # 0 to 1

internal = np.array([xp.var(vartype = xp.continuous, name = 'internal_{0}_{1}_{2}'.format(i+1, j+1, k+1))
              for i in Days for j in Banks for k in Priorities], dtype = xp.npvar).reshape(n_days, n_banks, n_priority) # Expected no. of days of internal investigatins
external = np.array([xp.var(vartype = xp.continuous, name = 'external_{0}_{1}_{2}'.format(i+1, j+1, k+1))
              for i in Days for j in Banks for k in Priorities], dtype = xp.npvar).reshape(n_days, n_banks, n_priority) # Expected no. of external investigators of that priority.

prob_full.addVariable(weight_f, cust_w_f, transact_w_f, amount_w_f, combined, internal, external) 

#### Constraints

# Combined weights
combined_cons = [combined[i] == xp.Sum(weight_f[c] * full.iloc[i, 17+c] for c in Categories) + transact_w_f * full["transac_prob"][i] + cust_w_f * full["customer_prob"][i] + amount_w_f * full["Amount"][i] for i in Cases]
probab_invst_cons = [combined[i] <= 1 for i in Cases] # 0 <= Combined weights <= 1

prob_full.addConstraint(combined_cons, probab_invst_cons) 

Using the license file found in your Xpress installation. If you want to use this license and no longer want to see this message, use the following code before using the xpress module:
  xpress.init('C:/xpressmp/bin/xpauth.xpr')


In [9]:
for i in Cases:
    for b in Banks:
        full[bank_names[b]][i] = full[bank_names[b]][i] * combined[i] # Multiplying decision variable with time spent.
full

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full[bank_names[b]][i] = full[bank_names[b]][i] * combined[i] # Multiplying decision variable with time spent.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full[bank_names[b]][i] = full[bank_names[b]][i] * combined[i] # Multiplying decision variable with time spent.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full[bank_names[b]][i] = full[bank_names[b]][i] * combined[i] # Multiplying decision variable with time spent.
A value is trying to be set on a copy of a 

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,...,Transportation,Utilities,day,time_spent,bank_A,bank_B,bank_C,bank_D,bank_E,Intrnl
0,2,Online Retailer - Electronics Purchase,199.99,Online Shopping,2023-10-01,October,10298,spending,paid_out,bank_A,...,0,0,0,1.00,0.5*combined_1,0.0,0.0,0.0,0.5*combined_1,0.0
1,3,Utilities Payment - Internet Service,60.00,Utilities,2023-10-01,October,10298,spending,paid_out,bank_C,...,0,1,0,0.25,0.0,0.0,0.125*combined_2,0.0,0.125*combined_2,0.0
2,4,Rent Payment,1500.00,Housing,2023-10-01,October,14507,spending,paid_out,bank_D,...,0,0,0,1.00,0.0,0.0,0.0,0.5*combined_3,0.5*combined_3,0.0
3,6,Online Retailer - Electronics Purchase,299.99,Online Shopping,2023-10-01,October,14507,spending,paid_out,bank_E,...,0,0,0,1.00,0.0,0.0,0.0,0.0,combined_4,0.0
4,7,Asos - Online Fashion Shopping,55.00,Shopping,2023-10-01,October,14507,spending,paid_out,bank_B,...,0,0,0,0.25,0.0,0.125*combined_5,0.0,0.0,0.125*combined_5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238152,315177,Home Improvement Store - DIY Supplies,75.40,Home Improvement,2024-07-31,July,11908,spending,paid_out,bank_D,...,0,0,304,0.25,0.125*combined_238153,0.0,0.0,0.125*combined_238153,0.0,0.0
238153,315178,Grocery Shopping - Super Value Mart,55.25,Groceries,2024-07-31,July,15026,spending,paid_out,bank_C,...,0,0,304,0.25,0.0,0.125*combined_238154,0.125*combined_238154,0.0,0.0,0.0
238154,315179,JCPenney - Discount Fashion,40.00,Shopping,2024-07-31,July,11908,spending,paid_out,bank_D,...,0,0,304,0.25,0.125*combined_238155,0.0,0.0,0.125*combined_238155,0.0,0.0
238155,315180,Gas Station - Fuel Purchase,40.45,Transportation,2024-07-31,July,10291,spending,paid_out,bank_D,...,1,0,304,0.25,0.0,0.0,0.0,0.25*combined_238156,0.0,0.0


In [10]:
# Find sum by day by bank by priority
temptemptemp = full.groupby(["priority", "day"])

In [11]:
A_df = pd.DataFrame({'bank_A': temptemptemp["bank_A"].sum()}).reset_index()
B_df = pd.DataFrame({'bank_B': temptemptemp["bank_B"].sum()}).reset_index()
C_df = pd.DataFrame({'bank_C': temptemptemp["bank_C"].sum()}).reset_index()
D_df = pd.DataFrame({'bank_D': temptemptemp["bank_D"].sum()}).reset_index()
E_df = pd.DataFrame({'bank_E': temptemptemp["bank_E"].sum()}).reset_index()

A_df["bank_B"] = B_df["bank_B"]
A_df["bank_C"] = C_df["bank_C"]
A_df["bank_D"] = D_df["bank_D"]
A_df["bank_E"] = E_df["bank_E"]
exp_work_df = A_df
exp_work_df

Unnamed: 0,priority,day,bank_A,bank_B,bank_C,bank_D,bank_E
0,1,0,0.125 combined_7 +0.125 combined_8 +0.125 com...,0.125 combined_5 +0.125 combined_24 +0.125 co...,0.125 combined_2 +0.125 combined_13 +0.25 com...,0.125 combined_27 +0.125 combined_28 +0.125 c...,0.125 combined_2 +0.125 combined_5 +0.25 comb...
1,1,1,0.25 combined_1707 +0.125 combined_1711 +0.12...,0.125 combined_1705 +0.25 combined_1712 +0.12...,0.25 combined_1709 +0.125 combined_1713 +0.12...,0.125 combined_1705 +0.125 combined_1711 +0.1...,0.125 combined_1715 +0.125 combined_1718 +0.1...
2,1,2,0.125 combined_2840 +0.25 combined_2841 +0.12...,0.125 combined_2843 +0.125 combined_2845 +0.1...,0.125 combined_2843 +0.125 combined_2852 +0.1...,0.125 combined_2840 +0.25 combined_2842 +0.12...,0.125 combined_2844 +0.125 combined_2845 +0.1...
3,1,3,0.125 combined_3985 +0.125 combined_3987 +0.1...,0.125 combined_3987 +0.125 combined_3990 +0.1...,0.125 combined_3993 +0.125 combined_3994 +0.1...,0.25 combined_3999 +0.125 combined_4001 +0.12...,0.125 combined_3985 +0.125 combined_3990 +0.1...
4,1,4,0.25 combined_4904 +0.125 combined_4907 +0.12...,0.125 combined_4905 +0.125 combined_4907 +0.1...,0.25 combined_4906 +0.125 combined_4910 +0.12...,0.125 combined_4908 +0.125 combined_4909 +0.1...,0.125 combined_4905 +0.125 combined_4909 +0.1...
...,...,...,...,...,...,...,...
1215,4,300,2*combined_234933,2 combined_234773 +2 combined_234854 +2 combi...,2 combined_234643 +2 combined_234787 +2 combi...,2*combined_234947,2 combined_234713 +2 combined_235157
1216,4,301,2*combined_236002,2 combined_235554 +2 combined_235618 +2 combi...,0.0,0.0,2 combined_235449 +2 combined_235874 +2 combi...
1217,4,302,2 combined_236320 +2 combined_236478 +2 combi...,2 combined_236055 +2 combined_236277,2 combined_236212 +2 combined_236548,2 combined_236378 +2 combined_236601,0.0
1218,4,303,2 combined_237360 +2 combined_237387 +2 combi...,2 combined_236853 +2 combined_237208,0.0,2*combined_236823,0.0


In [42]:
# bigarray = np.empty(len(time), n_days, n_banks)
# bigarray[:] = np.nan
# bigarray

# for d in Days:
#     for p in range(len(time)):
#         for b in Banks:
#             bigarray[p,d,b] = full.groupby(["priority", "day"])[bank_names[b]].sum()[p,d]
# bigarray


In [12]:
external[1,1,1] * time[1] + internal[1,1,1]

  internal_2_2_2 +0.5 external_2_2_2

In [13]:
for d in Days:
    for b in Banks:
        for p in Priorities:
            prob_full.addConstraint(exp_work_df.iloc[(p-1+1)*(n_days) + d, b+2] <= internal[d,b,p] + (external[d,b,p] * time[p])) # Time constraint for expected investigation

for d in Days:
    for b in Banks:
        prob_full.addConstraint(xp.Sum(internal[d,b,p] for p in Priorities) == teamsize[b]) # All internal investigation days for each bank tally to their team size.

for c in Categories:
    prob_full.addConstraint(weight_f[c] <= 0.9) # Upper bound on weights as 0.9

In [14]:
# for each day, for each bank, for each priority, find the expected investigation time. Then for each day for each bank, minus off the investigators you have starting from priority 1.
# whatever is remaining is allocated to expected no. of external investigators



# for p in range(len(time)):
#     subset = full[full["priority"] == p+1]
#     for d in Days:
#         for b in Banks:
#             bigarray[d,b,p] = xp.Sum(d[i] * full[bank_names[b]][i] for i in Cases if full["day"][i] = d)



# expected cost of external investigator, summed for each day for each bank

prob_full.setObjective(xp.Sum(external[d,b,p] * ext_cost[p] for d in Days for b in Banks for p in Priorities) + xp.Sum((1-combined[i]) * full["is_scam"][i] * full["Amount"][i] for i in Cases), sense = xp.minimize)

prob_full.solve()


(<SolveStatus.COMPLETED: 3>, <SolStatus.OPTIMAL: 1>)

In [44]:
# dext = np.array([xp.var(vartype = xp.continuous, name = 'dext_{0}'.format(i+1))
#               for i in Cases], dtype = xp.npvar)
# dext_cons = [dext[i] <= d[i] for i in Cases]

# prob_full.addVariable(dext)
# prob_full.addConstraint(dext_cons)

# prob_full.addConstraint(exp_work_df.iloc[(p-1)*(305) + d, b+2] <= internal[d,b,p] + external[d,b,p])

In [15]:
print(f'The objective function value is {prob_full.getObjVal()}')

transact_sol = prob_full.getSolution(transact_w_f)
cust_sol = prob_full.getSolution(cust_w_f)
weight_sol = prob_full.getSolution(weight_f)
amount_sol = prob_full.getSolution(amount_w_f)
print("The weights on the categories are: ", weight_sol)
print("The weights on the transaction probabilities are: ", transact_sol)
print("The weights on the customer probabilities are: ", cust_sol)
print("The weights on the transaction value are: ", amount_sol)

print(prob_full.getSolution(combined))
print(prob_full.getSolution(internal))
print(prob_full.getSolution(external))

The objective function value is 114313.7307303506
The weights on the categories are:  [-0.01595293 -0.016353   -0.01814749 -0.01585081 -0.02177066  0.9
 -0.016353   -0.01647201  0.9        -0.01871962 -0.03957168  0.9
 -0.02398831 -0.01598544  0.9         0.9        -0.0161288  -0.020519
 -0.01646832 -0.00377622]
The weights on the transaction probabilities are:  0.007167777446210277
The weights on the customer probabilities are:  0.07521355923675928
The weights on the transaction value are:  1.739449808859619e-05
[2.99387920e-02 3.62724297e-02 1.69304500e-02 ... 9.27344519e-01
 1.25670155e-02 4.24848297e-04]
[[[3.12564031 2.38657556 2.48778414 0.        ]
  [3.56071254 1.0382375  3.60638391 3.79466605]
  [4.67658997 0.69980906 4.57361107 0.0499899 ]
  [3.30800423 1.39390621 4.43200696 0.8660826 ]
  [3.36222626 1.3143402  5.14837771 0.17505583]]

 [[2.782071   4.0870727  0.95918188 0.17167442]
  [3.47432613 1.39208967 0.87155292 6.26203128]
  [2.30003755 1.25965765 1.6614528  4.778852 

# Next day

In [16]:
customer2 = pd.read_excel('231114_Customer_Base_2nd_batch.xlsx')
fraud2 = pd.read_excel('231114_Fraud_Cases_2nd_batch.xlsx')
transact2 = pd.read_excel('231114_Transactions_input_2nd_batch.xlsx')
# Merging Transaction dataset with Customer dataset by customer_id
full2 = transact2.merge(customer2, how = 'left', on='customer_id')
# Remove all cash withdrawals, and all "paid_in" data from the dataset
cleaned2 = full2[(full2.category != "Cash Withdrawal") & (full2.In_or_Out != "paid_in")]
cleaned2["is_scam"] = cleaned2["transaction_id"].isin(fraud2["transaction_id"])
cleaned2["is_scam"] = cleaned2["is_scam"].astype(int)

one_hot2 = pd.get_dummies(cleaned2['category'])
one_hot2 = one_hot2.astype(int)
full2 = cleaned2.join(one_hot2)
full2 = full2.reset_index(drop = True)

full2["day"] = (full2["date"] - full2["date"][0]).dt.days

full2 = full2[full2["day"] == 0]

diction = {1: 0.25,
           2: 0.5,
           3: 1,
           4: 2
}

n_categories2 = np.shape(one_hot2)[1]
Categories2 = range(n_categories2)

full2["time_spent"] = full2["priority"]
full2 = full2.replace({"time_spent":diction})
full2

n_daily_cases2 = np.shape(full2)[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned2["is_scam"] = cleaned2["transaction_id"].isin(fraud2["transaction_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned2["is_scam"] = cleaned2["is_scam"].astype(int)


In [17]:
weight_sol_proc = np.delete(weight_sol, [-4], None)

# new_combined = np.zeros(n_daily_cases2)
#for i in range(n_daily_cases2):
#    new_combined[i] = cust_sol * full2["customer_prob"][i] + transact_sol * full2["transac_prob"][i] + amount_sol * full2["Amount"] + sum(weight_sol_proc[c] * full2.iloc[i, 17+c] for c in Categories2)
#new_combined

vec = [amount_sol, transact_sol, cust_sol]
vec_mult_frame = full2.iloc[:,[2,11,15]] * vec
weight_sol_proc_mult_frame = full2.iloc[:,17:36] * weight_sol_proc
joined_mult_frame = vec_mult_frame.join(weight_sol_proc_mult_frame)

combined_day2 = joined_mult_frame.sum(axis = 1)
combined_day2

# np.set_printoptions(threshold=np.inf)

0       0.035531
1       0.024702
2       0.019705
3       0.024887
4       0.024166
          ...   
1758    0.921813
1759    0.019871
1760    0.004764
1761    0.009338
1762    0.018733
Length: 1763, dtype: float64

In [18]:
pd.set_option('display.max_columns', None)
full2

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to_x,bank_from,transac_prob,description_prob,priority,bank_to_y,customer_prob,is_scam,Bank Fees,Charity,Credit Card Payment,Dining Out,Electronics,Entertainment,Groceries,Healthcare,Holiday,Home Improvement,Housing,Investment,Loan Payment,Online Shopping,Personal care,Shopping,Transfers,Transportation,Utilities,day,time_spent
0,315183,Utilities Payment - Gas and Electric,85.30,Utilities,2024-08-01,August,10003,spending,paid_out,bank_D,bank_A,0.45,0.35,2,bank_A,0.46,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.50
1,315184,Rent payment,1500.00,Housing,2024-08-01,August,10003,spending,paid_out,bank_E,bank_A,0.50,0.64,3,bank_A,0.46,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1.00
2,315186,Grocery Shopping - Fresh Mart,52.93,Groceries,2024-08-01,August,10025,spending,paid_out,bank_A,bank_A,0.39,0.61,1,bank_A,0.43,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.25
3,315187,Transfer to Investment Account,454.35,Transfers,2024-08-01,August,10025,spending,paid_out,bank_D,bank_A,0.72,0.23,3,bank_A,0.43,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1.00
4,315189,Rent payment,1500.00,Housing,2024-08-01,August,10025,spending,paid_out,bank_A,bank_A,0.74,0.11,3,bank_A,0.43,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1758,317648,Asos - Men's Clothing,41.16,Shopping,2024-08-01,August,19949,spending,paid_out,bank_B,bank_A,0.32,0.16,1,bank_A,0.25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.25
1759,317650,Mobile Phone Bill - Service Provider,51.84,Utilities,2024-08-01,August,19949,spending,paid_out,bank_E,bank_A,0.55,0.16,1,bank_A,0.25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.25
1760,317651,Rent Payment - Apt #203,1200.00,Housing,2024-08-01,August,19949,spending,paid_out,bank_E,bank_A,0.65,0.11,3,bank_A,0.25,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1.00
1761,317653,Rent payment,1500.00,Housing,2024-08-01,August,19951,spending,paid_out,bank_E,bank_C,0.77,0.59,3,bank_C,0.23,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1.00


In [19]:
full2["day"] = (full2["date"] - full2["date"][0]).dt.days
print(full2.iloc[0,[2,11,15]])
full2.iloc[0,17:36] # 19 categories, so index 17 until 35 (36 is not included)

Amount           85.3
transac_prob     0.45
customer_prob    0.46
Name: 0, dtype: object


Bank Fees              0
Charity                0
Credit Card Payment    0
Dining Out             0
Electronics            0
Entertainment          0
Groceries              0
Healthcare             0
Holiday                0
Home Improvement       0
Housing                0
Investment             0
Loan Payment           0
Online Shopping        0
Personal care          0
Shopping               0
Transfers              0
Transportation         0
Utilities              1
Name: 0, dtype: object

In [20]:
one_hot

Unnamed: 0,Bank Fees,Charity,Credit Card Payment,Dining Out,Electronics,Entertainment,Groceries,Healthcare,Holiday,Home Improvement,Housing,Investment,Loan Payment,Online Shopping,Personal care,Shopping,Streaming Services,Transfers,Transportation,Utilities
1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
315176,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
315177,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
315178,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
315179,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [21]:
combined_day2.loc[combined_day2 > 1] = 1
combined_day2.loc[combined_day2 < 0] = 0
combined_day2

0       0.035531
1       0.024702
2       0.019705
3       0.024887
4       0.024166
          ...   
1758    0.921813
1759    0.019871
1760    0.004764
1761    0.009338
1762    0.018733
Length: 1763, dtype: float64

In [23]:
# sample from combined_day2
random.seed(4)
sample = np.random.binomial(1,combined_day2)
to_investigate = [i for i in range(len(sample)) if sample[i]==1]
full2['to_investigate'] = sample
# full2.loc[full2["to_investigate"] == 1] # Uncomment to see all cases investigated

#Scams detected:
full2[(full2["to_investigate"] == 1) & (full2["is_scam"] == 1)]

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to_x,bank_from,transac_prob,description_prob,priority,bank_to_y,customer_prob,is_scam,Bank Fees,Charity,Credit Card Payment,Dining Out,Electronics,Entertainment,Groceries,Healthcare,Holiday,Home Improvement,Housing,Investment,Loan Payment,Online Shopping,Personal care,Shopping,Transfers,Transportation,Utilities,day,time_spent,to_investigate
288,315587,Ski Chalet Rental - Alpine Vacation,7500.0,Holiday,2024-08-01,August,11226,spending,paid_out,bank_A,bank_B,0.38,0.62,3,bank_B,0.67,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1.0,1
428,315784,Invoice Settlement - Payment Account Amendment,1083.8,Shopping,2024-08-01,August,11970,spending,paid_out,bank_C,bank_E,0.56,0.8,3,bank_E,0.86,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1.0,1
566,315977,Invoice Payment Change - Supplier Details Update,1182.2,Shopping,2024-08-01,August,12689,spending,paid_out,bank_C,bank_B,0.64,0.84,3,bank_B,0.54,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1.0,1
863,316383,V&I visa check fees,138.8,Utilities,2024-08-01,August,14338,spending,paid_out,bank_E,bank_E,0.41,0.48,2,bank_E,0.89,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.5,1
991,316560,ClothesOnline additional posting payment,32.7,Shopping,2024-08-01,August,15075,spending,paid_out,bank_E,bank_E,0.24,0.52,1,bank_E,0.7,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.25,1
1196,316853,Invoice Payment Change - Supplier Details Update,1587.85,Shopping,2024-08-01,August,16194,spending,paid_out,bank_E,bank_B,0.61,0.7,3,bank_B,0.88,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1.0,1
1259,316941,Invoice Payment Revision - Vendor Account,163.9,Shopping,2024-08-01,August,16615,spending,paid_out,bank_A,bank_C,0.68,0.8,3,bank_C,0.81,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1.0,1
1619,317451,Sport event tickets,24.45,Entertainment,2024-08-01,August,18769,spending,paid_out,bank_B,bank_E,0.3,0.84,1,bank_E,0.8,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.25,1


In [55]:
full2.loc[(full2["Amount"] <= 200) & (full2["to_investigate"] == 1) & (full2["priority"] == 4)]

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to_x,bank_from,transac_prob,description_prob,priority,bank_to_y,customer_prob,is_scam,Bank Fees,Charity,Credit Card Payment,Dining Out,Electronics,Entertainment,Groceries,Healthcare,Holiday,Home Improvement,Housing,Investment,Loan Payment,Online Shopping,Personal care,Shopping,Transfers,Transportation,Utilities,day,time_spent,to_investigate,bank_A,bank_B,bank_C,bank_D,bank_E,Intrnl
608,316034,Amazon.com - Online Shopping,93.87,Online Shopping,2024-08-01,August,12816,spending,paid_out,Intrnl,bank_B,0.66,0.37,4,bank_B,0.61,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,2.0,1,0.0,2.0,0.0,0.0,0.0,2.0


In [24]:
# Amount gained from detecting scams
print("Gain from the detected scams is: ", sum(full2[(full2["to_investigate"] == 1) & (full2["is_scam"] == 1)]["Amount"]))

# Actual loss from undetected scams
print("Loss from scams undetected is: ", sum(full2[(full2["to_investigate"] == 0) & (full2["is_scam"] == 1)]["Amount"]))

Gain from the detected scams is:  11713.7
Loss from scams undetected is:  855.0


In [25]:
# Now we want to find the time taken to investigate any given case, based on their priority and based on whether it is shared.

# First, we create the matrix of time spent by each bank.
temp2 = np.zeros((np.shape(full2)[0], 6))
temp2 = pd.DataFrame(temp2, columns = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E', 'Intrnl']) 
bank_to_df = temp2.reset_index()
bank_to_df.update(full2.pivot(columns = 'bank_to_x', values = 'time_spent')) # Pivot to get the time spent for each bank_to
bank_to_df = bank_to_df.set_index('index').rename_axis(None)
bank_to_df
bank_from_df = temp2.reset_index()
bank_from_df.update(full2.pivot(columns = 'bank_from', values = 'time_spent')) # Pivot to get the time spent for each bank_from
bank_from_df = bank_from_df.set_index('index').rename_axis(None)
ultimate2 = bank_to_df/2 + bank_from_df/2 # Halve and add them: this forces the 50:50 split for sharing cases
ultimate2.loc[ultimate2["Intrnl"] > 0, :] *= 2 # Multiply the time taken by 2 for any bank which investigates alone when bank_to is International
full2 = full2.join(ultimate2) # Combine with full dataset
full2

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to_x,bank_from,transac_prob,description_prob,priority,bank_to_y,customer_prob,is_scam,Bank Fees,Charity,Credit Card Payment,Dining Out,Electronics,Entertainment,Groceries,Healthcare,Holiday,Home Improvement,Housing,Investment,Loan Payment,Online Shopping,Personal care,Shopping,Transfers,Transportation,Utilities,day,time_spent,to_investigate,bank_A,bank_B,bank_C,bank_D,bank_E,Intrnl
0,315183,Utilities Payment - Gas and Electric,85.30,Utilities,2024-08-01,August,10003,spending,paid_out,bank_D,bank_A,0.45,0.35,2,bank_A,0.46,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.50,0,0.250,0.000,0.00,0.25,0.000,0.0
1,315184,Rent payment,1500.00,Housing,2024-08-01,August,10003,spending,paid_out,bank_E,bank_A,0.50,0.64,3,bank_A,0.46,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1.00,0,0.500,0.000,0.00,0.00,0.500,0.0
2,315186,Grocery Shopping - Fresh Mart,52.93,Groceries,2024-08-01,August,10025,spending,paid_out,bank_A,bank_A,0.39,0.61,1,bank_A,0.43,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.25,0,0.250,0.000,0.00,0.00,0.000,0.0
3,315187,Transfer to Investment Account,454.35,Transfers,2024-08-01,August,10025,spending,paid_out,bank_D,bank_A,0.72,0.23,3,bank_A,0.43,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1.00,0,0.500,0.000,0.00,0.50,0.000,0.0
4,315189,Rent payment,1500.00,Housing,2024-08-01,August,10025,spending,paid_out,bank_A,bank_A,0.74,0.11,3,bank_A,0.43,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1.00,0,1.000,0.000,0.00,0.00,0.000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1758,317648,Asos - Men's Clothing,41.16,Shopping,2024-08-01,August,19949,spending,paid_out,bank_B,bank_A,0.32,0.16,1,bank_A,0.25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0.25,1,0.125,0.125,0.00,0.00,0.000,0.0
1759,317650,Mobile Phone Bill - Service Provider,51.84,Utilities,2024-08-01,August,19949,spending,paid_out,bank_E,bank_A,0.55,0.16,1,bank_A,0.25,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0.25,0,0.125,0.000,0.00,0.00,0.125,0.0
1760,317651,Rent Payment - Apt #203,1200.00,Housing,2024-08-01,August,19949,spending,paid_out,bank_E,bank_A,0.65,0.11,3,bank_A,0.25,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1.00,0,0.500,0.000,0.00,0.00,0.500,0.0
1761,317653,Rent payment,1500.00,Housing,2024-08-01,August,19951,spending,paid_out,bank_E,bank_C,0.77,0.59,3,bank_C,0.23,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1.00,0,0.000,0.000,0.50,0.00,0.500,0.0


In [47]:
# Find sum by bank by priority
temptemptemp2 = full2.loc[full2["to_investigate"]==1].groupby(["priority", "day"])
A_df = pd.DataFrame({'bank_A': temptemptemp2["bank_A"].sum()}).reset_index()
B_df = pd.DataFrame({'bank_B': temptemptemp2["bank_B"].sum()}).reset_index()
C_df = pd.DataFrame({'bank_C': temptemptemp2["bank_C"].sum()}).reset_index()
D_df = pd.DataFrame({'bank_D': temptemptemp2["bank_D"].sum()}).reset_index()
E_df = pd.DataFrame({'bank_E': temptemptemp2["bank_E"].sum()}).reset_index()

A_df["bank_B"] = B_df["bank_B"]
A_df["bank_C"] = C_df["bank_C"]
A_df["bank_D"] = D_df["bank_D"]
A_df["bank_E"] = E_df["bank_E"]
exp_work_df2 = A_df
print(exp_work_df2)
subtract_exp_work_df2 = exp_work_df2

# Subtract from low priority first
for b in Banks:
    remaining = teamsize[b]
    for p in Priorities:
        if (remaining > 0):
            if (subtract_exp_work_df2.iloc[p,2+b] - remaining < 0):
                remaining = remaining - subtract_exp_work_df2.iloc[p,2+b]
                subtract_exp_work_df2.iloc[p,2+b] = 0
            else: 
                subtract_exp_work_df2.iloc[p,2+b] = subtract_exp_work_df2.iloc[p,2+b] - remaining
                remaining = 0
print(subtract_exp_work_df2)
spending = 0
for p in Priorities:
    spending += sum(np.ceil(subtract_exp_work_df2.iloc[p, 2:7] / time[p])) * ext_cost[p]
spending

# Amount gained from detecting scams
print("Gain from the detected scams is: ", sum(full2[(full2["to_investigate"] == 1) & (full2["is_scam"] == 1)]["Amount"]))

# Actual loss from undetected scams
print("Loss from scams undetected is: ", sum(full2[(full2["to_investigate"] == 0) & (full2["is_scam"] == 1)]["Amount"]))

# Total loss after accounting for external investigators
print("Total loss + spending: ",  sum(full2[(full2["to_investigate"] == 0) & (full2["is_scam"] == 1)]["Amount"]) + spending)

   priority  day  bank_A  bank_B  bank_C  bank_D  bank_E
0         1    0   5.375    6.25   5.875   4.375   6.125
1         2    0   1.750    2.25   3.250   1.750   2.500
2         3    0   8.000    9.50   5.500   7.000  12.000
3         4    0   0.000    2.00   0.000   0.000   0.000
   priority  day  bank_A  bank_B  bank_C  bank_D  bank_E
0         1    0   0.000     0.0   0.000   0.000   0.000
1         2    0   0.000     0.0   0.000   0.000   0.000
2         3    0   7.125     6.0   4.625   3.125  10.625
3         4    0   0.000     2.0   0.000   0.000   0.000
Gain from the detected scams is:  11713.7
Loss from scams undetected is:  855.0
Total loss + spending:  4405.0


In [43]:
# Pre-processing step: don't bother hiring external investigators to investigate if those cases are more than the amount needed to investigate.


0.5

In [None]:
# How many cases each bank investigates

# Now we want to find the time taken to investigate any given case, based on their priority and based on whether it is shared.
finding_hours = np.zeros((n_daily_cases, 6))
finding_hours = pd.DataFrame(temp, columns = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E', 'Intrnl']) 

# for index, row in full.iterrows():
#     if (row["bank_from"] == row["bank_to"]):
#         temp[row]

# for i in range(n_daily_cases):
#     if (full["bank_from"][i] == full["bank_to"][i]):
#             temp[full["bank_from"][i]][i] = time[full["priority"][i] - 1]
#     elif full["bank_to"][i] == "Intrnl":
#             temp[full["bank_from"][i]][i] = time[full["priority"][i] - 1]
#     else:
#             temp[full["bank_from"][i]][i] = time[full["priority"][i] - 1]/2
#             temp[full["bank_to"][i]][i] = time[full["priority"][i] - 1]/2
# temp
# full = full.join(temp)
# temp

bank_to_df = temp.reset_index()
bank_to_df.update(full.pivot(columns = 'bank_to', values = 'time_spent')) # Pivot to get the time spent for each bank_to
bank_to_df = bank_to_df.set_index('index').rename_axis(None)
bank_to_df

bank_from_df = temp.reset_index()
bank_from_df.update(full.pivot(columns = 'bank_from', values = 'time_spent')) # Pivot to get the time spent for each bank_from
bank_from_df = bank_from_df.set_index('index').rename_axis(None)

ultimate = bank_to_df/2 + bank_from_df/2 # Halve and add them: this forces the 50:50 split for sharing cases

ultimate.loc[ultimate["Intrnl"] > 0, :] *= 2 # Multiply the time taken by 2 for any bank which investigates alone when bank_to is International

full = full.join(ultimate) # Combine with full dataset
full

# How many external investigators

## Test with random day on first dataset

In [172]:
test_subset = full[full["day"] == 70]
test_subset.iloc[0,1:2]
test_vec = [amount_sol, transact_sol, cust_sol]
test_vec_mult_frame = test_subset.iloc[:,[2,11,15]] * vec
test_weight_sol_proc_mult_frame = test_subset.iloc[:,17:37] * weight_sol
test_joined_mult_frame = test_vec_mult_frame.join(test_weight_sol_proc_mult_frame)

test_combined_day2 = test_joined_mult_frame.sum(axis = 1)
test_combined_day2.loc[test_combined_day2 > 1] = 1
test_combined_day2.loc[test_combined_day2 < 0] = 0
test_sample = np.random.binomial(1,test_combined_day2)
len([i for i in range(len(test_sample)) if test_sample[i]==1])

76

# Perfect Loss

In [None]:
# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

# Index Sets
n_banks = 5
n_daily_cases = np.shape(full)[0]
Cases = range(n_daily_cases)
Banks = range(n_banks)

# Problem set-up
prob_perf_f = xp.problem('perf_f')

# Decision Variable


# z forces the 50:50 split
z = np.array([xp.var(vartype=xp.binary, name='z_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)

# no. of external investigators hired by bank b
ext = np.array([xp.var(vartype=xp.binary, name='ext_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)

# indicator variable for when a case is shared
shared = np.array([xp.var(vartype=xp.binary, name='shared_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)
solo = np.array([xp.var(vartype=xp.binary, name='solo_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)

prob_perf_f.addVariable(z, ext, shared, solo)

# Constraints
for i in Cases:
    for b in Banks:
        if (full["bank_from"][i] != bank_names[b]) & (full["bank_to"][i] != bank_names[b]):
            prob_perf_f.addConstraint(shared[i,b] == 0)
            prob_perf_f.addConstraint(solo[i,b] == 0)
            prob_perf_f.addConstraint(ext[i,b] == 0)
        elif (full["bank_to"][i] == "Intrnl") & (full["bank_from"][i] == bank_names[b]):
            prob_perf_f.addConstraint(shared[i,b] == 0)
            prob_perf_f.addConstraint(solo[i,b] == z[i])
            prob_perf_f.addConstraint(ext[i,b] <= solo[i,b])
        elif (full["bank_to"][i] == bank_names[b]) & (full["bank_from"][i] == bank_names[b]):
            prob_perf_f.addConstraint(shared[i,b] == 0)
            prob_perf_f.addConstraint(solo[i,b] == z[i])
            prob_perf_f.addConstraint(ext[i,b] <= solo[i,b])
        else:
            prob_perf_f.addConstraint(shared[i,b] == z[i]) # this forces the 50:50 split
            prob_perf_f.addConstraint(solo[i,b] == 0)
            prob_perf_f.addConstraint(ext[i,b] <= shared[i,b])

for b in Banks:
    prob_perf_f.addConstraint(
        sum(shared[i,b] * time[full["priority"][i]-1]/2 for i in Cases) +
         sum(solo[i,b] * time[full["priority"][i]-1] for i in Cases) <= 
         teamsize[b] + sum(ext[i,b] * time[full["priority"][i]-1] for i in Cases))


# Objective function: maximising gain of amount investigated

prob_perf_f.setObjective(xp.Sum(full["is_scam"] * full["Amount"][i] * (1-z[i]) for i in Cases) +
                   xp.Sum(ext[i,b] * ext_cost[full["priority"][i] - 1] for i in Cases for b in Banks), 
                   sense = xp.minimize)

# prob_perf_f.write("problem2 dayone","lp")

prob_perf_f.solve()
print(f'The objective function value is {prob_perf_f.getObjVal()}') 