# Data Cleaning and Exploration

In [1]:
import xpress as xp
import pandas as pd
import numpy as np
customer = pd.read_excel('231013_Customer_Base.xlsx')
fraud = pd.read_excel('231013_Fraud_Cases.xlsx')
transact = pd.read_excel('231013_Transactions_Input.xlsx')
# Merging Transaction dataset with Customer dataset by customer_id
full = transact.merge(customer, how = 'left', on='customer_id')

#### some code used for understanding the data
# transact.groupby('category')["category"].count().sort_values(ascending = False)
# print(full.iloc[:2])

# full[full.transac_prob.isna()].groupby('category')["category"].count().sort_values(ascending = False)
#### From this, we know that some of the transfers have type 'income'. These do not have transaction probabilities.

# full[full.transac_prob.isna()].groupby('In_or_Out')["category"].count().sort_values(ascending = False)
#### More generally, all Transfers, Income and Interest (and only these categories) are classified as paid_in. 
#### So an easy way to subset is to get rid of all paid_in.
#### Also, get rid of cash withdrawals.

# Remove all cash withdrawals, and all "paid_in" data from the dataset
cleaned = full[(full.category != "Cash Withdrawal") & (full.In_or_Out != "paid_in")]

In [2]:
# Subsetting to only the first day
dayone = cleaned[cleaned.date == "2023-10-01"]
dayone = dayone.reset_index(drop = True) # reset the index, so the indexing is 0, 1, 2, ... 
# drop = False would have made the old index into a new column in the dataframe. We don't need that, so drop = True.
# Resetting the indexing is needed so that the index of the decision variable corresponds to the index of the dataframe; this is useful for subsequent analysis
dayone

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,bank_from,transac_prob,description_prob,priority,home_bank,customer_prob
0,2,Online Retailer - Electronics Purchase,199.99,Online Shopping,2023-10-01,October,10298,spending,paid_out,bank_A,bank_E,0.78,0.25,3,bank_E,0.49
1,3,Utilities Payment - Internet Service,60.00,Utilities,2023-10-01,October,10298,spending,paid_out,bank_C,bank_E,0.30,0.22,1,bank_E,0.49
2,4,Rent Payment,1500.00,Housing,2023-10-01,October,14507,spending,paid_out,bank_D,bank_E,0.57,0.53,3,bank_E,0.35
3,6,Online Retailer - Electronics Purchase,299.99,Online Shopping,2023-10-01,October,14507,spending,paid_out,bank_E,bank_E,0.75,0.65,3,bank_E,0.35
4,7,Asos - Online Fashion Shopping,55.00,Shopping,2023-10-01,October,14507,spending,paid_out,bank_B,bank_E,0.37,0.31,1,bank_E,0.35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1699,2410,Credit Card Payment - American Express **** 5678,175.00,Credit Card Payment,2023-10-01,October,17647,spending,paid_out,Intrnl,bank_D,0.66,0.36,4,bank_D,0.70
1700,2412,Home Improvement - Paint and Supplies,110.75,Home Improvement,2023-10-01,October,17031,spending,paid_out,bank_E,bank_A,0.43,0.46,2,bank_A,0.58
1701,2413,Rent Payment,1200.00,Housing,2023-10-01,October,17699,spending,paid_out,bank_D,bank_A,0.64,0.38,3,bank_A,0.66
1702,2414,Credit Card Payment - Mastercard **** 6789,150.50,Credit Card Payment,2023-10-01,October,17390,spending,paid_out,Intrnl,bank_D,0.66,0.68,4,bank_D,0.45


In [3]:
# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

# Index Sets
n_banks = 5
n_daily_cases = np.shape(dayone)[0]
Cases = range(n_daily_cases)
Banks = range(n_banks)

# Problem set-up
prob = xp.problem('fraud')

# Decision Variable
invstg = np.array([xp.var(vartype=xp.binary, name='invstg_{0}_{1}'.format(i+1, j+1))
                    for i in Cases for j in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)

print(invstg)

prob.addVariable(invstg)

Using the license file found in your Xpress installation. If you want to use this license and no longer want to see this message, use the following code before using the xpress module:
  xpress.init('C:/xpressmp/bin/xpauth.xpr')
[[invstg_1_1 invstg_1_2 invstg_1_3 invstg_1_4 invstg_1_5]
 [invstg_2_1 invstg_2_2 invstg_2_3 invstg_2_4 invstg_2_5]
 [invstg_3_1 invstg_3_2 invstg_3_3 invstg_3_4 invstg_3_5]
 ...
 [invstg_1702_1 invstg_1702_2 invstg_1702_3 invstg_1702_4 invstg_1702_5]
 [invstg_1703_1 invstg_1703_2 invstg_1703_3 invstg_1703_4 invstg_1703_5]
 [invstg_1704_1 invstg_1704_2 invstg_1704_3 invstg_1704_4 invstg_1704_5]]


### Simplified constraints: no external investigators, only bank_from can investigate

In [4]:
# Constraints

for b in Banks:
    prob.addConstraint(
        sum(invstg[i,b] * time[dayone["priority"][i]-1] for i in Cases) <= teamsize[b])

for i in Cases:
    for b in Banks:
        if dayone["bank_from"][i] != bank_names[b]:
            invstg[i,b] = 0



In [5]:
# Objective function: minimising loss of non-investigated cases

prob.setObjective(xp.Sum(dayone["transac_prob"][i] * dayone["customer_prob"][i] * dayone["Amount"][i] * (xp.Sum(invstg[i,b] for b in Banks)) for i in Cases), 
                  sense = xp.maximize)

In [6]:
prob.solve()

FICO Xpress v9.2.2, Hyper, solve started 12:11:43, Nov 20, 2023
Heap usage: 2745KB (peak 2825KB, 232KB system)
Maximizing MILP noname using up to 8 threads and up to 15GB memory, with these control settings:
OUTPUTLOG = 1
Original problem has:
         5 rows         8520 cols         8520 elements      8520 entities
Presolved problem has:
         5 rows          358 cols          358 elements       358 entities
LP relaxation tightened
Presolve finished in 0 seconds
Heap usage: 5905KB (peak 8906KB, 232KB system)

Coefficient range                    original                 solved        
  Coefficients   [min,max] : [ 2.50e-01,  2.00e+00] / [ 1.25e-01,  1.00e+00]
  RHS and bounds [min,max] : [ 1.00e+00,  1.20e+01] / [ 1.00e+00,  6.00e+00]
  Objective      [min,max] : [ 2.02e-01,  9.05e+02] / [ 1.30e+01,  9.05e+02]
Autoscaling applied standard scaling

Will try to keep branch and bound tree memory usage below 8.6GB
Fixed 273 (of 273) columns in 4 (of 4) subproblems

 *** Solution foun

(<SolveStatus.COMPLETED: 3>, <SolStatus.OPTIMAL: 1>)

In [7]:
print(f'The objective function value is {prob.getObjVal()}') 

The objective function value is 36104.670000000006


In [8]:
invstg_df = pd.DataFrame(data = prob.getSolution(invstg), index = Cases, columns = bank_names)

# This dataframe shows which bank investigated which case.

# NOTE: The indexing on this dataframe is not the transaction_id! It corresponds to the index of the original dayone df.
# Hence for extra clarity, I append a column of the actual transaction ID to the investigated cases df.
invstg_df = invstg_df.join(dayone["transaction_id"])

# Extra code for aesthetics: moving transaction_id to the left
cols = ['transaction_id'] + bank_names
invstg_df = invstg_df[cols]

# All cases which were actually investigated
invstg_df_yes = invstg_df[invstg_df[bank_names].sum(axis=1) == 1]

# # Uncomment the following code to display ALL invstg_df decision variables:
# invstg_df_disp = invstg_df.style.set_caption('Investigation Decisions').format(precision=0)
# display(invstg_df_disp)


# Uncomment the following code to display all the cases INVESTIGATED in a nicely-formatted style:
# invstg_df_yes_disp = invstg_df_yes.style.set_caption('Cases Investigated').format(precision=0)
# display(invstg_df_yes_disp)

### Naive model success rate, and comparison with Perfect Information case 

In [9]:
# Actual money lost from fraud: 
d1frauds = dayone.loc[dayone["transaction_id"].isin(fraud["transaction_id"])] # find all the rows in 'dayone' which are frauds by matching transaction_id

d1frauds = d1frauds.loc[~d1frauds["transaction_id"].isin(invstg_df_yes["transaction_id"])] # filter out all the rows which were investigated

# print out all the frauds that were caught:
d1frauds.loc[d1frauds["transaction_id"].isin(invstg_df_yes["transaction_id"])]
# An empty data frame is printed. I.e. NONE of the actual fraud cases were caught.

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,bank_from,transac_prob,description_prob,priority,home_bank,customer_prob


In [10]:
money_lost = d1frauds["Amount"].sum(axis=0)
money_lost

# Under perfect information, you would have investigated and found £1418.75 of scams

1418.7500000000002

## Model 2: GOALS:
### - Day one: minimising the expected loss
### - Subsequent days: minimising the actual loss of all previous days
### - Constraints: 50/50 split for all investigations involving two banks, external investigators included, international tasks completed within one day (by having multiple people work on it)
### - Decision variable: Weights on probabilities

## Currently, everything below has the correct constraints, but only minimises expected loss on day one, and has decision variable as binary. Hence, what needs to be done is (1) do the model for subsequent days, and (2) change the decision variable to be based on some updated weights on the probabilities

In [11]:
# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

# Index Sets
n_banks = 5
n_daily_cases = np.shape(dayone)[0]
Cases = range(n_daily_cases)
Banks = range(n_banks)

# Problem set-up
prob2 = xp.problem('fraud2')

# Decision Variable


# z forces the 50:50 split
z = np.array([xp.var(vartype=xp.binary, name='z_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)

# no. of external investigators hired by bank b
ext = np.array([xp.var(vartype=xp.binary, name='ext_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)



# print(invstg)

# indicator variable for when a case is shared
shared = np.array([xp.var(vartype=xp.binary, name='shared_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)
solo = np.array([xp.var(vartype=xp.binary, name='solo_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)

prob2.addVariable(z, ext, shared, solo)


In [12]:
# Constraints
for i in Cases:
    for b in Banks:
        if (dayone["bank_from"][i] != bank_names[b]) & (dayone["bank_to"][i] != bank_names[b]):
            prob2.addConstraint(shared[i,b] == 0)
            prob2.addConstraint(solo[i,b] == 0)
            prob2.addConstraint(ext[i,b] == 0)
        elif (dayone["bank_to"][i] == "Intrnl") & (dayone["bank_from"][i] == bank_names[b]):
            prob2.addConstraint(shared[i,b] == 0)
            prob2.addConstraint(solo[i,b] == z[i])
            prob2.addConstraint(ext[i,b] <= solo[i,b])
        elif (dayone["bank_to"][i] == bank_names[b]) & (dayone["bank_from"][i] == bank_names[b]):
            prob2.addConstraint(shared[i,b] == 0)
            prob2.addConstraint(solo[i,b] == z[i])
            prob2.addConstraint(ext[i,b] <= solo[i,b])
        else:
            prob2.addConstraint(shared[i,b] == z[i]) # this forces the 50:50 split
            prob2.addConstraint(solo[i,b] == 0)
            prob2.addConstraint(ext[i,b] <= shared[i,b])

for b in Banks:
    prob2.addConstraint(
        sum(shared[i,b] * time[dayone["priority"][i]-1]/2 for i in Cases) +
         sum(solo[i,b] * time[dayone["priority"][i]-1] for i in Cases) <= 
         teamsize[b] + sum(ext[i,b] * time[dayone["priority"][i]-1] for i in Cases))


In [13]:
# Objective function: maximising gain of amount investigated

prob2.setObjective(xp.Sum(dayone["transac_prob"][i] * dayone["customer_prob"][i] * dayone["Amount"][i] * z[i] for i in Cases) - 
                   xp.Sum(ext[i,b] * ext_cost[dayone["priority"][i] - 1] for i in Cases for b in Banks), 
                   sense = xp.maximize)

prob2.write("problem2 dayone","lp")

prob2.solve()

FICO Xpress v9.2.2, Hyper, solve started 12:11:46, Nov 20, 2023
Heap usage: 14MB (peak 14MB, 5577KB system)
Maximizing MILP noname using up to 8 threads and up to 15GB memory, with these control settings:
OUTPUTLOG = 1
Original problem has:
     25565 rows        27264 cols        57166 elements     27264 entities
Presolved problem has:
      2643 rows         4246 cols        10841 elements      4246 entities
LP relaxation tightened
Presolve finished in 0 seconds
Heap usage: 18MB (peak 36MB, 5577KB system)

Coefficient range                    original                 solved        
  Coefficients   [min,max] : [ 1.25e-01,  2.00e+00] / [ 6.25e-02,  1.00e+00]
  RHS and bounds [min,max] : [ 1.00e+00,  1.20e+01] / [ 1.00e+00,  2.70e+01]
  Objective      [min,max] : [ 2.02e-01,  9.05e+02] / [ 2.02e-01,  9.05e+02]
Autoscaling applied standard scaling

Symmetric problem: generators: 12, support set: 72
 Number of orbits: 36, largest orbit: 2
 Row orbits: 24, row support: 48
Will try to keep

(<SolveStatus.COMPLETED: 3>, <SolStatus.OPTIMAL: 1>)

In [14]:
print(f'The objective function value is {prob2.getObjVal()}') 

The objective function value is 171803.1344750016


In [15]:
ext_df = pd.DataFrame(data = prob2.getSolution(ext), index = Cases, columns = bank_names)
shared_df = pd.DataFrame(data = prob2.getSolution(shared), index = Cases, columns = bank_names)
solo_df = pd.DataFrame(data = prob2.getSolution(solo), index = Cases, columns = bank_names)
z_df = pd.DataFrame(data = prob2.getSolution(z), index = Cases)

# This dataframe shows which bank SHARED investigations.

# NOTE: The indexing on this dataframe is not the transaction_id! It corresponds to the index of the original dayone df.
# Hence for extra clarity, I append a column of the actual transaction ID to the investigated cases df.
shared_df = shared_df.join(dayone["transaction_id"])

# Extra code for aesthetics: moving transaction_id to the left
cols = ['transaction_id'] + bank_names
shared_df = shared_df[cols]

# All cases which were actually investigated
shared_df_yes = shared_df[shared_df[bank_names].sum(axis=1) >= 1]

# # Uncomment the following code to display ALL decision variables:
#shared_df_disp = shared_df.style.set_caption('Investigation Decisions').format(precision=0)
#display(shared_df_disp)

# Uncomment the following code to display all the cases INVESTIGATED in a nicely-formatted style:
# shared_df_yes_disp = shared_df_yes.style.set_caption('Cases Investigated').format(precision=0)
# display(shared_df_yes_disp)

# This dataframe shows which bank did a SOLO investigation
solo_df = solo_df.join(dayone["transaction_id"])

# Extra code for aesthetics: moving transaction_id to the left
cols = ['transaction_id'] + bank_names
solo_df = solo_df[cols]

# All cases which were actually investigated
# solo_df_yes = solo_df[solo_df[bank_names].sum(axis=1) >= 1]

# # Uncomment the following code to display ALL solo decision variables:
#solo_df_disp = solo_df.style.set_caption('Investigation Decisions').format(precision=0)
#display(solo_df_disp)


# Uncomment the following code to display all the cases INVESTIGATED in a nicely-formatted style:
# solo_df_yes_disp = solo_df_yes.style.set_caption('Cases Investigated').format(precision=0)
# display(solo_df_yes_disp)

In [16]:
sum(prob2.getSolution(ext))

array([114., 113.,  99.,  96.,  89.])

## Day Two

In [17]:
# Perfect info on day one
dayone["is_scam"] = dayone["transaction_id"].isin(fraud["transaction_id"])
dayone["is_scam"] = dayone["is_scam"].astype(int)

In [18]:
# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

# Index Sets
n_banks = 5
n_daily_cases = np.shape(dayone)[0]
Cases = range(n_daily_cases)
Banks = range(n_banks)

# Problem set-up
prob_p_1 = xp.problem('prob_p_1')

# Decision Variable


# z forces the 50:50 split
z = np.array([xp.var(vartype=xp.binary, name='z_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)

# no. of external investigators hired by bank b
ext = np.array([xp.var(vartype=xp.binary, name='ext_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)

# indicator variable for when a case is shared
shared = np.array([xp.var(vartype=xp.binary, name='shared_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)
solo = np.array([xp.var(vartype=xp.binary, name='solo_{0}_{1}'.format(i+1, b+1))
                    for i in Cases for b in Banks], dtype=xp.npvar).reshape(n_daily_cases, n_banks)
m_1 = 14000
transact_w = xp.var(vartype = xp.continuous, name='transact_w')
cust_w = xp.var(vartype = xp.continuous, name='cust_w')
amount_w = xp.var(vartype = xp.continuous, name='amount_w')
thres = xp.var(vartype = xp.continuous, name='thres')

aux = np.array([xp.var(vartype = xp.integer, name='aux_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)
aux2 = np.array([xp.var(vartype = xp.integer, name='aux2_{0}'.format(i+1))
                    for i in Cases], dtype=xp.npvar).reshape(n_daily_cases)


prob_p_1.addVariable(z, ext, shared, solo, transact_w, cust_w, amount_w, thres, aux, aux2)


In [19]:
# Constraints

prob_sum = transact_w + cust_w + amount_w == 1
invest1 = [thres - dayone["transac_prob"][i] * transact_w - dayone["customer_prob"][i] * cust_w - dayone["Amount"][i] * amount_w <= m_1*(1-aux[i]) for i in Cases]
invest2 = [z[i] <= aux[i] for i in Cases]
invest3 = [dayone["transac_prob"][i] * transact_w + dayone["customer_prob"][i] * cust_w + dayone["Amount"][i] * amount_w - thres <= m_1*(1-aux2[i]) for i in Cases]
invest4= [1 - z[i] <= aux2[i] for i in Cases]

prob_p_1.addConstraint(prob_sum, invest1, invest2, invest3, invest4)

    
for i in Cases:
    if (dayone["transac_prob"][i] <= 0.4) | (dayone["customer_prob"][i] <= 0.4):
        for b in Banks:
            prob_p_1.addConstraint(shared[i,b] == 0)
            prob_p_1.addConstraint(solo[i,b] == 0)
            prob_p_1.addConstraint(ext[i,b] == 0)

for i in Cases:
    for b in Banks:
        if (dayone["bank_from"][i] != bank_names[b]) & (dayone["bank_to"][i] != bank_names[b]):
            prob_p_1.addConstraint(shared[i,b] == 0)
            prob_p_1.addConstraint(solo[i,b] == 0)
            prob_p_1.addConstraint(ext[i,b] == 0)
        elif (dayone["bank_to"][i] == "Intrnl") & (dayone["bank_from"][i] == bank_names[b]):
            prob_p_1.addConstraint(shared[i,b] == 0)
            prob_p_1.addConstraint(solo[i,b] == z[i])
            prob_p_1.addConstraint(ext[i,b] <= solo[i,b])
        elif (dayone["bank_to"][i] == bank_names[b]) & (dayone["bank_from"][i] == bank_names[b]):
            prob_p_1.addConstraint(shared[i,b] == 0)
            prob_p_1.addConstraint(solo[i,b] == z[i])
            prob_p_1.addConstraint(ext[i,b] <= solo[i,b])
        else:
            prob_p_1.addConstraint(shared[i,b] == z[i]) # this forces the 50:50 split
            prob_p_1.addConstraint(solo[i,b] == 0)
            prob_p_1.addConstraint(ext[i,b] <= shared[i,b])

for b in Banks:
    prob_p_1.addConstraint(
        sum(shared[i,b] * time[dayone["priority"][i]-1]/2 for i in Cases) +
         sum(solo[i,b] * time[dayone["priority"][i]-1] for i in Cases) <= 
         teamsize[b] + sum(ext[i,b] * time[dayone["priority"][i]-1] for i in Cases))


In [20]:
prob_p_1.setObjective(xp.Sum(z[i] * dayone["is_scam"][i] * dayone["Amount"][i] for i in Cases) - xp.Sum(ext[i,b] * ext_cost[dayone["priority"][i] - 1] for i in Cases for b in Banks), 
                   sense = xp.maximize)


In [21]:
prob_p_1.solve()

FICO Xpress v9.2.2, Hyper, solve started 12:11:51, Nov 20, 2023
Heap usage: 22MB (peak 22MB, 14MB system)
Maximizing MILP noname using up to 8 threads and up to 15GB memory, with these control settings:
OUTPUTLOG = 1
Original problem has:
     45597 rows        30676 cols        94240 elements     30672 entities
Presolved problem has:
      3946 rows         2291 cols        15744 elements      2287 entities
LP relaxation tightened
Presolve finished in 0 seconds
Heap usage: 26MB (peak 48MB, 14MB system)

Coefficient range                    original                 solved        
  Coefficients   [min,max] : [ 1.00e-01,  1.40e+04] / [ 2.50e-03,  4.69e+01]
  RHS and bounds [min,max] : [ 1.00e+00,  1.40e+04] / [ 9.53e-03,  9.38e+01]
  Objective      [min,max] : [ 2.21e+01,  2.45e+02] / [ 4.00e+01,  2.45e+02]
Autoscaling applied Curtis-Reid scaling

Symmetric problem: generators: 2, support set: 12
 Number of orbits: 6, largest orbit: 2
 Row orbits: 8, row support: 16
Will try to keep bra

(<SolveStatus.COMPLETED: 3>, <SolStatus.OPTIMAL: 1>)

In [22]:
thres = prob_p_1.getSolution(thres)
transact_w = prob_p_1.getSolution(transact_w)
cust_w = prob_p_1.getSolution(cust_w)
amount_w = prob_p_1.getSolution(amount_w)
print(transact_w, ",", cust_w, ",", amount_w, ",", thres)

print(f'The objective function value is {prob_p_1.getObjVal()}')

ext_df = pd.DataFrame(data = prob_p_1.getSolution(ext), index = Cases, columns = bank_names)
shared_df = pd.DataFrame(data = prob_p_1.getSolution(shared), index = Cases, columns = bank_names)
solo_df = pd.DataFrame(data = prob_p_1.getSolution(solo), index = Cases, columns = bank_names)
z_df = pd.DataFrame(data = prob_p_1.getSolution(z), index = Cases, columns = ["investigated"])

0.21222147622462378 , 0.7877785237753763 , 0.0 , 0.7860114766499344
The objective function value is 369.65


In [23]:
# Investigated cases in day one based on perfect information and calculated probabilities
# test = pd.concat([dayone,z_df], axis = 1)
# test[(test["investigated"] == 1) & (test["is_scam"] == 1)]

In [24]:
# Using probabilities for day 2:
daytwo = cleaned[cleaned.date == "2023-10-02"]
daytwo = daytwo.reset_index(drop = True) 

In [25]:
def calc(row):
    return int(row["transac_prob"] * transact_w + row["customer_prob"] * cust_w + row["Amount"] * amount_w >= thres)

In [26]:
daytwo["investigated"] = daytwo.apply(calc, axis = 1)
daytwo[(daytwo["investigated"] == 1)]

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,bank_from,transac_prob,description_prob,priority,home_bank,customer_prob,investigated
165,2604,Utilities Payment - Gas and Electric,95.0,Utilities,2023-10-02,October,13834,spending,paid_out,bank_B,bank_B,0.65,0.22,2,bank_B,0.86,1
210,2654,Restaurant Dinner - Date Night,90.0,Dining Out,2023-10-02,October,14951,spending,paid_out,bank_B,bank_A,0.46,0.2,2,bank_A,0.89,1
213,2657,Utilities Payment - Electricity,80.0,Utilities,2023-10-02,October,14951,spending,paid_out,bank_D,bank_A,0.52,0.63,1,bank_A,0.89,1
327,2798,Internet Service Payment - Provider Name,70.0,Utilities,2023-10-02,October,16194,spending,paid_out,bank_C,bank_B,0.53,0.64,1,bank_B,0.88,1
1022,3653,Transfer to Vacation Fund,350.0,Transfers,2023-10-02,October,13834,spending,paid_out,bank_D,bank_B,0.56,0.15,3,bank_B,0.86,1
1034,3667,Utilities Payment - Water and Sewer,60.75,Utilities,2023-10-02,October,11970,spending,paid_out,bank_D,bank_E,0.56,0.3,1,bank_E,0.86,1


# Full model

In [27]:
cleaned["is_scam"] = cleaned["transaction_id"].isin(fraud["transaction_id"])
cleaned["is_scam"] = cleaned["is_scam"].astype(int)

one_hot = pd.get_dummies(cleaned['category'])
one_hot = one_hot.astype(int)
full = cleaned.join(one_hot)
full = full.reset_index(drop = True) 

full["day"] = (full["date"] - full["date"][0]).dt.days
# full = full[full["day"] <= 100]

diction = {1: 0.25,
           2: 0.5,
           3: 1,
           4: 2
}

# for i in range(np.shape(full)[0]):
full["time_spent"] = full["priority"]
full = full.replace({"time_spent":diction})
full

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned["is_scam"] = cleaned["transaction_id"].isin(fraud["transaction_id"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned["is_scam"] = cleaned["is_scam"].astype(int)


Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,...,Loan Payment,Online Shopping,Personal care,Shopping,Streaming Services,Transfers,Transportation,Utilities,day,time_spent
0,2,Online Retailer - Electronics Purchase,199.99,Online Shopping,2023-10-01,October,10298,spending,paid_out,bank_A,...,0,1,0,0,0,0,0,0,0,1.00
1,3,Utilities Payment - Internet Service,60.00,Utilities,2023-10-01,October,10298,spending,paid_out,bank_C,...,0,0,0,0,0,0,0,1,0,0.25
2,4,Rent Payment,1500.00,Housing,2023-10-01,October,14507,spending,paid_out,bank_D,...,0,0,0,0,0,0,0,0,0,1.00
3,6,Online Retailer - Electronics Purchase,299.99,Online Shopping,2023-10-01,October,14507,spending,paid_out,bank_E,...,0,1,0,0,0,0,0,0,0,1.00
4,7,Asos - Online Fashion Shopping,55.00,Shopping,2023-10-01,October,14507,spending,paid_out,bank_B,...,0,0,0,1,0,0,0,0,0,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238152,315177,Home Improvement Store - DIY Supplies,75.40,Home Improvement,2024-07-31,July,11908,spending,paid_out,bank_D,...,0,0,0,0,0,0,0,0,304,0.25
238153,315178,Grocery Shopping - Super Value Mart,55.25,Groceries,2024-07-31,July,15026,spending,paid_out,bank_C,...,0,0,0,0,0,0,0,0,304,0.25
238154,315179,JCPenney - Discount Fashion,40.00,Shopping,2024-07-31,July,11908,spending,paid_out,bank_D,...,0,0,0,1,0,0,0,0,304,0.25
238155,315180,Gas Station - Fuel Purchase,40.45,Transportation,2024-07-31,July,10291,spending,paid_out,bank_D,...,0,0,0,0,0,0,1,0,304,0.25


In [28]:
# Data
teamsize = [8, 12, 10, 10, 10]
bank_names = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E']

time = [0.25, 0.5, 1, 2]
ext_cost = [40, 60, 100, 150]

n_priority = len(time)
Priorities = range(n_priority)
# Index Sets
n_days = len(pd.unique(full["date"]))
Days = range(n_days)
n_banks = 5
Banks = range(n_banks)

n_daily_cases = np.shape(full)[0]
Cases = range(n_daily_cases)

Categories = range(np.shape(one_hot)[1])

temp = np.zeros((n_daily_cases, 6))
temp = pd.DataFrame(temp, columns = ['bank_A', 'bank_B', 'bank_C', 'bank_D', 'bank_E', 'Intrnl'])

# for index, row in full.iterrows():
#     if (row["bank_from"] == row["bank_to"]):
#         temp[row]


# for i in range(n_daily_cases):
#     if (full["bank_from"][i] == full["bank_to"][i]):
#             temp[full["bank_from"][i]][i] = time[full["priority"][i] - 1]
#     elif full["bank_to"][i] == "Intrnl":
#             temp[full["bank_from"][i]][i] = time[full["priority"][i] - 1]
#     else:
#             temp[full["bank_from"][i]][i] = time[full["priority"][i] - 1]/2
#             temp[full["bank_to"][i]][i] = time[full["priority"][i] - 1]/2
# temp
# full = full.join(temp)
# temp

bank_to_df = temp.reset_index()
bank_to_df.update(full.pivot(columns = 'bank_to', values = 'time_spent'))
bank_to_df = bank_to_df.set_index('index').rename_axis(None)
bank_to_df

bank_from_df = temp.reset_index()
bank_from_df.update(full.pivot(columns = 'bank_from', values = 'time_spent'))
bank_from_df = bank_from_df.set_index('index').rename_axis(None)

ultimate = bank_to_df/2 + bank_from_df/2
ultimate[ultimate["Intrnl"] > 0]

ultimate.loc[ultimate["Intrnl"] > 0, :] *= 2

full = full.join(ultimate)
full

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,...,Transportation,Utilities,day,time_spent,bank_A,bank_B,bank_C,bank_D,bank_E,Intrnl
0,2,Online Retailer - Electronics Purchase,199.99,Online Shopping,2023-10-01,October,10298,spending,paid_out,bank_A,...,0,0,0,1.00,0.500,0.000,0.000,0.000,0.500,0.0
1,3,Utilities Payment - Internet Service,60.00,Utilities,2023-10-01,October,10298,spending,paid_out,bank_C,...,0,1,0,0.25,0.000,0.000,0.125,0.000,0.125,0.0
2,4,Rent Payment,1500.00,Housing,2023-10-01,October,14507,spending,paid_out,bank_D,...,0,0,0,1.00,0.000,0.000,0.000,0.500,0.500,0.0
3,6,Online Retailer - Electronics Purchase,299.99,Online Shopping,2023-10-01,October,14507,spending,paid_out,bank_E,...,0,0,0,1.00,0.000,0.000,0.000,0.000,1.000,0.0
4,7,Asos - Online Fashion Shopping,55.00,Shopping,2023-10-01,October,14507,spending,paid_out,bank_B,...,0,0,0,0.25,0.000,0.125,0.000,0.000,0.125,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238152,315177,Home Improvement Store - DIY Supplies,75.40,Home Improvement,2024-07-31,July,11908,spending,paid_out,bank_D,...,0,0,304,0.25,0.125,0.000,0.000,0.125,0.000,0.0
238153,315178,Grocery Shopping - Super Value Mart,55.25,Groceries,2024-07-31,July,15026,spending,paid_out,bank_C,...,0,0,304,0.25,0.000,0.125,0.125,0.000,0.000,0.0
238154,315179,JCPenney - Discount Fashion,40.00,Shopping,2024-07-31,July,11908,spending,paid_out,bank_D,...,0,0,304,0.25,0.125,0.000,0.000,0.125,0.000,0.0
238155,315180,Gas Station - Fuel Purchase,40.45,Transportation,2024-07-31,July,10291,spending,paid_out,bank_D,...,1,0,304,0.25,0.000,0.000,0.000,0.250,0.000,0.0


In [37]:
full.iloc[1][0]

3

In [29]:
# Problem set-up
prob_full = xp.problem('prob_full')
prob_full.setControl('outputlog', 0) # suppress output

#### Decision Variables forces the 50:50 split

weight_f = np.array([xp.var(vartype = xp.continuous, name = 'weight_{0}'.format(i+1)) 
                    for i in Categories], dtype = xp.npvar)

transact_w_f = xp.var(vartype = xp.continuous, name='transact_w_f')
cust_w_f = xp.var(vartype = xp.continuous, name='cust_w_f')
thres = xp.var(vartype = xp.continuous, name='thres_f')

d = np.array([xp.var(vartype = xp.continuous, name = 'd_{0}'.format(i+1))
              for i in Cases], dtype = xp.npvar)
di = np.array([xp.var(vartype = xp.continuous, name = 'di_{0}'.format(i+1))
              for i in Cases], dtype = xp.npvar)
internal = np.array([xp.var(vartype = xp.binary, name = 'internal_{0}_{1}_{2}'.format(i+1, j+1, k+1))
              for i in Days for j in Banks for k in Priorities], dtype = xp.npvar).reshape(n_days, n_banks, n_priority)
external = np.array([xp.var(vartype = xp.binary, name = 'external_{0}_{1}_{2}'.format(i+1, j+1, k+1))
              for i in Days for j in Banks for k in Priorities], dtype = xp.npvar).reshape(n_days, n_banks, n_priority)

z = np.array([xp.var(vartype = xp.continuous, name = 'z_{0}'.format(i+1))
              for i in Cases], dtype = xp.npvar)

prob_full.addVariable(weight_f, cust_w_f, transact_w_f, thres, d, internal, external)

#### Constraints
d_cons = [d[i] == xp.Sum(weight_f[c] * full.iloc[i, 17+c] for c in Categories) + transact_w_f * full["transac_prob"][i] + cust_w_f * full["customer_prob"][i] for i in Cases]
probab_invst_cons = [d[i] <= 1 for i in Cases]
external_cons = [external[d,b,p] <= internal[d,b,p] for d in Days for b in Banks for p in Priorities]
internal_cons = [internal[d,b,p] <= d[i]]
prob_full.addConstraint(thres <= 1)
prob_full.addConstraint(thres - d[i] <= 1 - z[i] for i in Cases)
prob_full.addConstraint(internal[full.iloc[i]["day"], full.iloc[i]["day"] ] <= z[i] for i in Cases)


prob_full.addConstraint(d_cons, probab_invst_cons, external_cons, internal_cons)

In [30]:
for i in Cases:
    for b in Banks:
        full[bank_names[b]][i] = full[bank_names[b]][i] * d[i]
full

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full[bank_names[b]][i] = full[bank_names[b]][i] * d[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full[bank_names[b]][i] = full[bank_names[b]][i] * d[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  full[bank_names[b]][i] = full[bank_names[b]][i] * d[i]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ful

Unnamed: 0,transaction_id,description,Amount,category,date,month,customer_id,type,In_or_Out,bank_to,...,Transportation,Utilities,day,time_spent,bank_A,bank_B,bank_C,bank_D,bank_E,Intrnl
0,2,Online Retailer - Electronics Purchase,199.99,Online Shopping,2023-10-01,October,10298,spending,paid_out,bank_A,...,0,0,0,1.00,0.5*d_1,0.0,0.0,0.0,0.5*d_1,0.0
1,3,Utilities Payment - Internet Service,60.00,Utilities,2023-10-01,October,10298,spending,paid_out,bank_C,...,0,1,0,0.25,0.0,0.0,0.125*d_2,0.0,0.125*d_2,0.0
2,4,Rent Payment,1500.00,Housing,2023-10-01,October,14507,spending,paid_out,bank_D,...,0,0,0,1.00,0.0,0.0,0.0,0.5*d_3,0.5*d_3,0.0
3,6,Online Retailer - Electronics Purchase,299.99,Online Shopping,2023-10-01,October,14507,spending,paid_out,bank_E,...,0,0,0,1.00,0.0,0.0,0.0,0.0,d_4,0.0
4,7,Asos - Online Fashion Shopping,55.00,Shopping,2023-10-01,October,14507,spending,paid_out,bank_B,...,0,0,0,0.25,0.0,0.125*d_5,0.0,0.0,0.125*d_5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238152,315177,Home Improvement Store - DIY Supplies,75.40,Home Improvement,2024-07-31,July,11908,spending,paid_out,bank_D,...,0,0,304,0.25,0.125*d_238153,0.0,0.0,0.125*d_238153,0.0,0.0
238153,315178,Grocery Shopping - Super Value Mart,55.25,Groceries,2024-07-31,July,15026,spending,paid_out,bank_C,...,0,0,304,0.25,0.0,0.125*d_238154,0.125*d_238154,0.0,0.0,0.0
238154,315179,JCPenney - Discount Fashion,40.00,Shopping,2024-07-31,July,11908,spending,paid_out,bank_D,...,0,0,304,0.25,0.125*d_238155,0.0,0.0,0.125*d_238155,0.0,0.0
238155,315180,Gas Station - Fuel Purchase,40.45,Transportation,2024-07-31,July,10291,spending,paid_out,bank_D,...,1,0,304,0.25,0.0,0.0,0.0,0.25*d_238156,0.0,0.0


In [31]:
temptemptemp = full.groupby(["priority", "day"])

In [32]:
A_df = pd.DataFrame({'bank_A': temptemptemp["bank_A"].sum()}).reset_index()
B_df = pd.DataFrame({'bank_B': temptemptemp["bank_B"].sum()}).reset_index()
C_df = pd.DataFrame({'bank_C': temptemptemp["bank_C"].sum()}).reset_index()
D_df = pd.DataFrame({'bank_D': temptemptemp["bank_D"].sum()}).reset_index()
E_df = pd.DataFrame({'bank_E': temptemptemp["bank_E"].sum()}).reset_index()

A_df["bank_B"] = B_df["bank_B"]
A_df["bank_C"] = C_df["bank_C"]
A_df["bank_D"] = D_df["bank_D"]
A_df["bank_E"] = E_df["bank_E"]
exp_work_df = A_df
exp_work_df

Unnamed: 0,priority,day,bank_A,bank_B,bank_C,bank_D,bank_E
0,1,0,0.125 d_7 +0.125 d_8 +0.125 d_9 +0.125 d_11 +...,0.125 d_5 +0.125 d_24 +0.125 d_25 +0.125 d_29...,0.125 d_2 +0.125 d_13 +0.25 d_18 +0.125 d_22 ...,0.125 d_27 +0.125 d_28 +0.125 d_29 +0.125 d_3...,0.125 d_2 +0.125 d_5 +0.25 d_6 +0.125 d_7 +0....
1,1,1,0.25 d_1707 +0.125 d_1711 +0.125 d_1713 +0.12...,0.125 d_1705 +0.25 d_1712 +0.125 d_1718 +0.12...,0.25 d_1709 +0.125 d_1713 +0.125 d_1715 +0.12...,0.125 d_1705 +0.125 d_1711 +0.125 d_1722 +0.1...,0.125 d_1715 +0.125 d_1718 +0.125 d_1719 +0.1...
2,1,2,0.125 d_2840 +0.25 d_2841 +0.125 d_2844 +0.12...,0.125 d_2843 +0.125 d_2845 +0.125 d_2850 +0.1...,0.125 d_2843 +0.125 d_2852 +0.125 d_2857 +0.1...,0.125 d_2840 +0.25 d_2842 +0.125 d_2857 +0.12...,0.125 d_2844 +0.125 d_2845 +0.125 d_2849 +0.1...
3,1,3,0.125 d_3985 +0.125 d_3987 +0.125 d_3995 +0.2...,0.125 d_3987 +0.125 d_3990 +0.125 d_3993 +0.1...,0.125 d_3993 +0.125 d_3994 +0.125 d_4003 +0.1...,0.25 d_3999 +0.125 d_4001 +0.125 d_4002 +0.12...,0.125 d_3985 +0.125 d_3990 +0.125 d_3998 +0.1...
4,1,4,0.25 d_4904 +0.125 d_4907 +0.125 d_4908 +0.12...,0.125 d_4905 +0.125 d_4907 +0.125 d_4913 +0.1...,0.25 d_4906 +0.125 d_4910 +0.125 d_4915 +0.12...,0.125 d_4908 +0.125 d_4909 +0.125 d_4912 +0.1...,0.125 d_4905 +0.125 d_4909 +0.125 d_4910 +0.1...
...,...,...,...,...,...,...,...
1215,4,300,2*d_234933,2 d_234773 +2 d_234854 +2 d_235227,2 d_234643 +2 d_234787 +2 d_235139,2*d_234947,2 d_234713 +2 d_235157
1216,4,301,2*d_236002,2 d_235554 +2 d_235618 +2 d_236028,0.0,0.0,2 d_235449 +2 d_235874 +2 d_235971
1217,4,302,2 d_236320 +2 d_236478 +2 d_236512 +2 d_23667...,2 d_236055 +2 d_236277,2 d_236212 +2 d_236548,2 d_236378 +2 d_236601,0.0
1218,4,303,2 d_237360 +2 d_237387 +2 d_237400,2 d_236853 +2 d_237208,0.0,2*d_236823,0.0


In [33]:
# bigarray = np.empty(len(time), n_days, n_banks)
# bigarray[:] = np.nan
# bigarray

# for d in Days:
#     for p in range(len(time)):
#         for b in Banks:
#             bigarray[p,d,b] = full.groupby(["priority", "day"])[bank_names[b]].sum()[p,d]
# bigarray


In [34]:
for d in Days:
    for b in Banks:
        for p in Priorities:
            prob_full.addConstraint(exp_work_df.iloc[(p-1)*(305) + d, b+2] <= internal[d,b,p] + external[d,b,p])

# for d in Days:
#     for b in Banks:
#         prob_full.addConstraint(xp.Sum(internal[d,b,p] for p in Priorities) == teamsize[b])

In [35]:
# for each day, for each bank, for each priority, find the expected investigation time. Then for each day for each bank, minus off the investigators you have starting from priority 1.
# whatever is remaining is allocated to expected no. of external investigators



# for p in range(len(time)):
#     subset = full[full["priority"] == p+1]
#     for d in Days:
#         for b in Banks:
#             bigarray[d,b,p] = xp.Sum(d[i] * full[bank_names[b]][i] for i in Cases if full["day"][i] = d)



# expected cost of external investigator, summed for each day for each bank

# prob_full.setObjective(xp.Sum() + xp.Sum((1-d[i]) * full["is_scam"] * full["Amount"] for i in Cases), sense = xp.maximize)

# prob_full.solve()

# transact_w = prob_full.getSolution(transact_w)
# cust_w = prob_full.getSolution(cust_w)