In [3]:
import pandas as pd
import numpy as np
import tqdm

In [4]:
! ls 

discrete_optimisation.ipynb        marketing_campaign_estimations.csv
generate_data.ipynb                net_earnings_eu.tsv
iso_codes.txt                      population.tsv


In [5]:
# Load the campaign estimates; the file is read as tab-separated despite its .csv extension.
df = pd.read_csv(
    'marketing_campaign_estimations.csv',
    sep='\t',
)

In [6]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,country,channel,users,cs_contacts,marketing_spending,revenue
0,Austria,social networks,2483,113,1348442.81,5947157.45
1,Belgium,social networks,3318,173,2800790.16,9100510.86
2,Bulgaria,social networks,2168,115,130578.64,626140.08
3,Switzerland,social networks,2681,180,1781765.79,10498340.23
4,Cyprus,social networks,263,0,73398.04,274979.65


In [7]:
# Label every row as "<country> - <channel>".
df['segment'] = df['country'] + ' - ' + df['channel']

In [8]:
# Number of rows in the segment column.
len(df.segment)

62

In [9]:
# Sum of the cs_contacts column over all rows.
df.cs_contacts.sum()

9489

In [10]:
# Preview the data again, now including the segment column.
df.head()

Unnamed: 0,country,channel,users,cs_contacts,marketing_spending,revenue,segment
0,Austria,social networks,2483,113,1348442.81,5947157.45,Austria - social networks
1,Belgium,social networks,3318,173,2800790.16,9100510.86,Belgium - social networks
2,Bulgaria,social networks,2168,115,130578.64,626140.08,Bulgaria - social networks
3,Switzerland,social networks,2681,180,1781765.79,10498340.23,Switzerland - social networks
4,Cyprus,social networks,263,0,73398.04,274979.65,Cyprus - social networks


In [11]:
# Number of rows per channel.
df.channel.value_counts()

channel
social networks    31
influencers        31
Name: count, dtype: int64

In [12]:
# Per-channel totals of the four numeric metrics.
metric_columns = ['users', 'marketing_spending', 'revenue', 'cs_contacts']
total_stats_df = df.groupby('channel', as_index=False)[metric_columns].sum()

In [13]:
# Express the two monetary totals in millions.
# Note: this rescales total_stats_df in place, so re-running the cell divides again.
money_columns = ['marketing_spending', 'revenue']
total_stats_df[money_columns] = total_stats_df[money_columns] / 10**6

total_stats_df

Unnamed: 0,channel,users,marketing_spending,revenue,cs_contacts
0,influencers,35735,18.076456,57.520957,1725
1,social networks,168030,45.769147,147.954766,7764


### Brute force 

In [14]:
# Number of subsets of 62 items (2**62), expressed in units of 10**18.
2**62/10**18

4.611686018427388

In [15]:
import itertools

# Restrict the brute-force search to the first 15 segments.
segments = list(df.segment.values)[:15]
print('number of segments: ', len(segments))

# Enumerate every subset of those segments, from the empty subset up to all of them,
# ordered by subset size.
combinations = [
    subset
    for subset_size in range(len(segments) + 1)
    for subset in itertools.combinations(segments, subset_size)
]

print('number of combinations: ', len(combinations))

number of segments:  15
number of combinations:  32768


In [16]:
def _combination_totals(chosen):
    """Return the summed metrics of the rows of df whose segment is in `chosen`."""
    subset = df[df.segment.isin(chosen)]
    return {
        'selected_segments': ', '.join(chosen),
        'users': subset.users.sum(),
        'cs_contacts': subset.cs_contacts.sum(),
        'marketing_spending': subset.marketing_spending.sum(),
        'revenue': subset.revenue.sum(),
    }


# One record per combination; tqdm shows progress over the whole enumeration.
tmp = [_combination_totals(chosen) for chosen in tqdm.tqdm(combinations)]

100%|███████████████████████████████████| 32768/32768 [00:04<00:00, 6829.19it/s]


In [17]:
# Years needed to evaluate 2**62 combinations at 7000 combinations per second
# (roughly the rate the progress bar reported for the 15-segment run).
2**62 / 7000 / 3600 / 24 / 365

20890800.619824003

In [18]:
# Turn the list of per-combination records into a DataFrame (one row per combination).
res_df = pd.DataFrame(tmp)

In [19]:
# Rank the evaluated combinations from highest to lowest total revenue.
res_df.sort_values('revenue', ascending = False)

Unnamed: 0,selected_segments,users,cs_contacts,marketing_spending,revenue
32767,"Austria - social networks, Belgium - social ne...",88318,4063,27467731.24,89218064.93
32762,"Austria - social networks, Belgium - social ne...",88055,4063,27394333.20,88943085.28
32758,"Austria - social networks, Belgium - social ne...",87921,4054,27354050.29,88919771.04
32698,"Austria - social networks, Belgium - social ne...",87658,4054,27280652.25,88644791.39
32764,"Austria - social networks, Belgium - social ne...",86150,3948,27337152.60,88591924.85
...,...,...,...,...,...
3,Bulgaria - social networks,2168,115,130578.64,626140.08
69,"Cyprus - social networks, Estonia - social net...",660,9,187078.99,573273.54
9,Estonia - social networks,397,9,113680.95,298293.89
5,Cyprus - social networks,263,0,73398.04,274979.65


### Naive approach

In [20]:
# Greedy heuristic: rank segments by revenue per unit of spend and keep those whose
# running spend total stays within the 30,000,000 budget.
df['revenue_per_spend'] = df['revenue'] / df['marketing_spending']
df = df.sort_values(by='revenue_per_spend', ascending=False)  # df stays in this order afterwards
df['spend_cumulative'] = df['marketing_spending'].cumsum()

within_budget = df['spend_cumulative'] <= 30000000
# Revenue of the kept segments, in millions.
df.loc[within_budget, 'revenue'].sum() / 1000000

107.91584679

In [21]:
# Number of segments whose running spend total is within the 30,000,000 budget.
len(df.loc[df['spend_cumulative'] <= 30000000])

48

### Merging segments

In [22]:
# Each row's revenue as a percentage of the total revenue.
total_revenue = df['revenue'].sum()
df['share_of_revenue'] = df['revenue'] / total_revenue * 100

In [23]:
# ! pip install matplotlib

In [24]:
from matplotlib import pyplot

In [34]:
# df.share_of_cs_contacts.hist(bins = 10)

In [35]:
# The ten rows with the smallest share of revenue.
df.sort_values('share_of_revenue').head(10)

Unnamed: 0,country,channel,users,cs_contacts,marketing_spending,revenue,segment,revenue_per_spend,spend_cumulative,share_of_revenue
33,Bulgaria,influencers,75,3,11178.75,38001.0,Bulgaria - influencers,3.399396,16989049.5,0.018494
49,Lithuania,influencers,88,3,17395.84,52408.4,Lithuania - influencers,3.012697,29234846.71,0.025506
57,Romania,influencers,89,4,17801.78,72497.62,Romania - influencers,4.072493,5254066.33,0.035283
45,Hungary,influencers,106,8,20981.64,74415.18,Hungary - influencers,3.546681,9422126.91,0.036216
21,Malta,social networks,144,5,19540.8,74854.08,Malta - social networks,3.830656,6180321.5,0.03643
60,Slovakia,influencers,109,8,40242.8,123362.93,Slovakia - influencers,3.065466,27174990.14,0.060038
39,Estonia,influencers,103,5,38026.57,131537.18,Estonia - influencers,3.459086,10264324.63,0.064016
52,Malta,influencers,123,8,46033.98,152977.56,Malta - influencers,3.323144,17035083.48,0.07445
59,Slovenia,influencers,100,5,44757.0,169291.0,Slovenia - influencers,3.782447,6651751.81,0.08239
16,Iceland,social networks,109,1,64267.49,202152.49,Iceland - social networks,3.145486,20698459.62,0.098383


In [36]:
# Keep the segment label where the revenue share is at least 0.1 (percent);
# every other row is pooled under the single label 'other'.
df['segment_group'] = np.where(df.share_of_revenue >= 0.1, df.segment, 'other')

In [37]:
# Combined revenue share (in percent) of the rows pooled into 'other'.
is_other = df['segment_group'] == 'other'
df.loc[is_other, 'share_of_revenue'].sum()

0.5312050617653221

In [38]:
# (rows, columns) of the subset labelled 'other'.
df[df.segment_group == 'other'].shape

(10, 11)

In [39]:
# Number of distinct segment_group labels.
df.segment_group.nunique()

53

In [40]:
# Years needed to evaluate 2**52 combinations at 7000 combinations per second.
# NOTE(review): segment_group.nunique() in the previous cell is 53 — confirm whether
# the exponent here should be 53 rather than 52.
2**52 / 7000 / 3600 / 24 / 365

20401.17248029688

In [41]:
# Overall ratio of cs_contacts to users across all rows.
df.cs_contacts.sum()/df.users.sum()

0.0465683507962604

In [42]:
# Per-row ratio of cs_contacts to users.
df['contacts_per_user'] = df['cs_contacts'] / df['users']

In [43]:
# (rows, columns) of the subset with at most 0.045 contacts per user.
df[df.contacts_per_user <= 0.045].shape

(23, 12)

In [44]:
# Keep the segment label where contacts per user exceed 0.042; all remaining rows are
# pooled under the single label 'good'. This replaces any existing segment_group column.
df['segment_group'] = np.where(df.contacts_per_user > 0.042, df.segment, 'good')

In [45]:
# Number of distinct segment_group labels after the regrouping.
df.segment_group.nunique()

43

In [46]:
# Years needed to evaluate 2**41 combinations at 7000 combinations per second.
# NOTE(review): segment_group.nunique() in the previous cell is 43 — confirm that
# 41 is the intended exponent.
2**41 / 7000 / 3600 / 24 / 365

9.96151000014496

### Linear optimisations

In [47]:
! pip install pulp


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [48]:
# Preview the data in its current row order, with all derived columns.
df.head()

Unnamed: 0,country,channel,users,cs_contacts,marketing_spending,revenue,segment,revenue_per_spend,spend_cumulative,share_of_revenue,segment_group,contacts_per_user
27,Sweden,social networks,3088,108,1075828.32,6952261.44,Sweden - social networks,6.462241,1075828.32,3.383495,good,0.034974
3,Switzerland,social networks,2681,180,1781765.79,10498340.23,Switzerland - social networks,5.892099,2857594.11,5.109285,Switzerland - social networks,0.067139
18,Lithuania,social networks,730,46,106492.4,528038.2,Lithuania - social networks,4.958459,2964086.51,0.256983,Lithuania - social networks,0.063014
2,Bulgaria,social networks,2168,115,130578.64,626140.08,Bulgaria - social networks,4.795119,3094665.15,0.304727,Bulgaria - social networks,0.053044
29,Slovakia,social networks,1702,46,274056.04,1309586.88,Slovakia - social networks,4.778537,3368721.19,0.637344,good,0.027027


In [49]:
from pulp import *

In [50]:
# Set up an empty maximisation problem named "Marketing_campaign".
problem = LpProblem(name="Marketing_campaign", sense=LpMaximize)

In [51]:
# Decision variables: one binary variable per row of df, keyed by row position.
items = range(len(df))
selected = LpVariable.dicts("Selected", items, cat="Binary")

In [52]:
# Objective function: maximise the total revenue of the selected rows.
# The revenue values are extracted once (by row position) instead of rebuilding the
# list for every term of the sum.
revenue_values = list(df['revenue'].values)
problem += lpSum(selected[i] * revenue_values[i] for i in items)

In [53]:
# Constraint: the total marketing spending of the selected rows must not exceed 30,000,000.
# The spending values are extracted once instead of rebuilding the list for every term.
spending_values = list(df['marketing_spending'].values)
problem += lpSum(selected[i] * spending_values[i] for i in items) <= 30000000

In [54]:
# Display the formulated problem (objective and constraints).
problem

Marketing_campaign:
MAXIMIZE
6952261.44*Selected_0 + 10498340.23*Selected_1 + 364652.47*Selected_10 + 360358.8*Selected_11 + 2543016.96*Selected_12 + 294875.7*Selected_13 + 74854.08*Selected_14 + 1623908.0*Selected_15 + 169291.0*Selected_16 + 274979.65*Selected_17 + 369810.24*Selected_18 + 2597360.42*Selected_19 + 528038.2*Selected_2 + 3134893.86*Selected_20 + 338584.17*Selected_21 + 1601526.83*Selected_22 + 1709970.66*Selected_23 + 74415.18*Selected_24 + 2524372.62*Selected_25 + 323604.61*Selected_26 + 131537.18*Selected_27 + 220807.6*Selected_28 + 988428.48*Selected_29 + 626140.08*Selected_3 + 239873.24*Selected_30 + 1636203.64*Selected_31 + 19785841.52*Selected_32 + 38001.0*Selected_33 + 152977.56*Selected_34 + 9100510.86*Selected_35 + 229397.46*Selected_36 + 883976.8*Selected_37 + 1424592.89*Selected_38 + 202152.49*Selected_39 + 1309586.88*Selected_4 + 936300.36*Selected_40 + 8605419.9*Selected_41 + 7887038.4*Selected_42 + 2450700.7*Selected_43 + 123362.93*Selected_44 + 324029.88*S

In [55]:
%time 
problem.solve()

CPU times: user 4 μs, sys: 1 μs, total: 5 μs
Wall time: 7.87 μs
Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/marie/Documents/github/llm_env/lib/python3.11/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/7v/1ln722x97kd8bchgxpmdkynw0000gn/T/7efa121da7e348b99e2af050e357d8a2-pulp.mps -max -timeMode elapsed -branch -printingOptions all -solution /var/folders/7v/1ln722x97kd8bchgxpmdkynw0000gn/T/7efa121da7e348b99e2af050e357d8a2-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 6 COLUMNS
At line 255 RHS
At line 257 BOUNDS
At line 320 ENDATA
Problem MODEL has 1 rows, 62 columns and 62 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 1.10213e+08 - 0.00 seconds
Cgl0004I processed model has 1 rows, 62 columns (62 integer (62 of which binary)) and 62 elements
Cbc0038I Initial state - 1 integers unsatisfied sum - 0.126572
Cbc0038I Solu

1

In [60]:
# Display the problem again after solving.
problem

Marketing_campaign:
MAXIMIZE
6952261.44*Selected_0 + 10498340.23*Selected_1 + 364652.47*Selected_10 + 360358.8*Selected_11 + 2543016.96*Selected_12 + 294875.7*Selected_13 + 74854.08*Selected_14 + 1623908.0*Selected_15 + 169291.0*Selected_16 + 274979.65*Selected_17 + 369810.24*Selected_18 + 2597360.42*Selected_19 + 528038.2*Selected_2 + 3134893.86*Selected_20 + 338584.17*Selected_21 + 1601526.83*Selected_22 + 1709970.66*Selected_23 + 74415.18*Selected_24 + 2524372.62*Selected_25 + 323604.61*Selected_26 + 131537.18*Selected_27 + 220807.6*Selected_28 + 988428.48*Selected_29 + 626140.08*Selected_3 + 239873.24*Selected_30 + 1636203.64*Selected_31 + 19785841.52*Selected_32 + 38001.0*Selected_33 + 152977.56*Selected_34 + 9100510.86*Selected_35 + 229397.46*Selected_36 + 883976.8*Selected_37 + 1424592.89*Selected_38 + 202152.49*Selected_39 + 1309586.88*Selected_4 + 936300.36*Selected_40 + 8605419.9*Selected_41 + 7887038.4*Selected_42 + 2450700.7*Selected_43 + 123362.93*Selected_44 + 324029.88*S

In [61]:
# Solved value of the decision variable for the first row.
selected[0].value()

1.0

In [62]:
# Copy the solved variable values into df; the variables were created in row order,
# so they line up with the rows by position.
df['selected'] = [variable.value() for variable in selected.values()]

In [63]:
# Number of selected rows (sum of the 0/1 flags).
df.selected.sum()

48.0

In [64]:
# Total revenue of the selected rows, in millions.
df.loc[df['selected'] == 1, 'revenue'].sum() / 10**6

110.16266221000001

In [65]:
# Total marketing spending of the selected rows, in millions.
df.loc[df['selected'] == 1, 'marketing_spending'].sum() / 10**6

29.99846894

In [66]:
# Ratio of cs_contacts to users within the selected rows.
chosen_rows = df[df.selected == 1]
chosen_rows.cs_contacts.sum() / chosen_rows.users.sum()

0.04463332965375134

In [67]:
def _selected_total(column):
    """Return the linear expression summing df[column] over the selected rows.

    The column values are extracted once per call rather than once per term.
    Reads the module-level `df`, `selected` and `items` at call time.
    """
    values = list(df[column].values)
    return lpSum(selected[i] * values[i] for i in items)


problem = LpProblem("Marketing_campaign", LpMaximize)
items = range(df.shape[0])  # one decision variable per row of df
selected = LpVariable.dicts("Selected", items, cat="Binary")

# Objective: maximise the total revenue of the selected rows.
problem += _selected_total('revenue')

# Constraints (added in the same order as before):
# at most 5000 cs_contacts in total,
problem += _selected_total('cs_contacts') <= 5000
# at most 30,000,000 of marketing spending,
problem += _selected_total('marketing_spending') <= 30*10**6
# and cs_contacts no greater than 0.042 times the users of the selection.
problem += _selected_total('cs_contacts') <= 0.042*_selected_total('users')

In [68]:
# Run the solver on the current problem; the returned status code is shown as the cell output.
problem.solve()

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/marie/Documents/github/llm_env/lib/python3.11/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/7v/1ln722x97kd8bchgxpmdkynw0000gn/T/f9a42c7f7808473d908ba10279b055ac-pulp.mps -max -timeMode elapsed -branch -printingOptions all -solution /var/folders/7v/1ln722x97kd8bchgxpmdkynw0000gn/T/f9a42c7f7808473d908ba10279b055ac-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 8 COLUMNS
At line 380 RHS
At line 384 BOUNDS
At line 447 ENDATA
Problem MODEL has 3 rows, 62 columns and 185 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 1.10164e+08 - 0.00 seconds
Cgl0004I processed model has 3 rows, 62 columns (62 integer (62 of which binary)) and 185 elements
Cbc0038I Initial state - 2 integers unsatisfied sum - 0.52483
Cbc0038I Pass   1: suminf.    0.52483 (2) obj. -1.10164e+08 iterations 0
Cbc

1

In [69]:
# Overwrite df['selected'] with the values from the latest solve (matched to rows by position).
df['selected'] = [variable.value() for variable in selected.values()]

In [70]:
# Number of selected rows in the latest solution.
df.selected.sum()

42.0

In [71]:
# Total revenue of the selected rows, in millions.
df.loc[df['selected'] == 1, 'revenue'].sum() / 10**6

109.86986429000002

In [72]:
# Total marketing spending of the selected rows, in millions.
df.loc[df['selected'] == 1, 'marketing_spending'].sum() / 10**6

29.998137170000003

In [73]:
# Ratio of cs_contacts to users within the selected rows.
chosen_rows = df[df.selected == 1]
chosen_rows.cs_contacts.sum() / chosen_rows.users.sum()

0.03765019714289057

In [74]:
problem = LpProblem("Marketing_campaign", LpMaximize)
items = range(df.shape[0])  # one decision variable per row of df
selected = LpVariable.dicts("Selected", items, cat="Binary")

# Extract the coefficient columns once instead of rebuilding a list for every term.
revenue_values = list(df['revenue'].values)
spending_values = list(df['marketing_spending'].values)

# Objective: maximise the total revenue of the selected rows.
problem += lpSum(selected[i] * revenue_values[i] for i in items)
# Constraint: at most 30,000,000 of marketing spending.
problem += lpSum(selected[i] * spending_values[i] for i in items) <= 30*10**6
# Constraint: at most 10 rows may be selected.
problem += lpSum(selected[i] for i in items) <= 10
problem.solve()

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/marie/Documents/github/llm_env/lib/python3.11/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/7v/1ln722x97kd8bchgxpmdkynw0000gn/T/b64f1c30cdad4b019f3a1c0c847eb1f5-pulp.mps -max -timeMode elapsed -branch -printingOptions all -solution /var/folders/7v/1ln722x97kd8bchgxpmdkynw0000gn/T/b64f1c30cdad4b019f3a1c0c847eb1f5-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 7 COLUMNS
At line 318 RHS
At line 321 BOUNDS
At line 384 ENDATA
Problem MODEL has 2 rows, 62 columns and 124 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 1.04561e+08 - 0.00 seconds
Cgl0004I processed model has 2 rows, 62 columns (62 integer (62 of which binary)) and 124 elements
Cbc0038I Initial state - 2 integers unsatisfied sum - 0.538397
Cbc0038I Pass   1: suminf.    0.24240 (1) obj. -1.04383e+08 iterations 2
Cb

1

In [75]:
# Overwrite df['selected'] with the values from the latest solve (matched to rows by position).
df['selected'] = [variable.value() for variable in selected.values()]

In [76]:
# Number of selected rows in the latest solution.
df.selected.sum()

10.0

In [82]:
# Total revenue divided by total users over all rows.
df['revenue'].sum()/df['users'].sum()

1008.3955672465829

In [83]:
# define the problem
problem_v4 = LpProblem("Marketing_campaign_v2", LpMaximize)

# decision variables: one binary variable per row of df
segments = range(df.shape[0])
selected = LpVariable.dicts("Selected", segments, cat="Binary")

# Coefficients and normalising totals, computed once rather than once per term.
revenue_values = list(df['revenue'].values)
users_values = list(df['users'].values)
spending_values = list(df['marketing_spending'].values)
revenue_total = df.revenue.sum()
users_total = df.users.sum()

# objective function: equally weighted sum of the selected share of total revenue
# and the selected share of total users.
# Iterates over `segments` (defined in this cell) instead of `items` left over from
# an earlier cell, so the cell no longer depends on that hidden state.
problem_v4 += 0.5*lpSum(selected[i] * revenue_values[i]/revenue_total for i in segments) \
  + 0.5*lpSum(selected[i] * users_values[i]/users_total for i in segments)

# constraints: at most 30,000,000 of marketing spending
problem_v4 += lpSum(selected[i] * spending_values[i] \
  for i in segments) <= 30*10**6

# run the optimisation and store the solution in df (matched to rows by position)
problem_v4.solve()
df['selected'] = list(map(lambda x: x.value(), selected.values()))

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /Users/marie/Documents/github/llm_env/lib/python3.11/site-packages/pulp/solverdir/cbc/osx/64/cbc /var/folders/7v/1ln722x97kd8bchgxpmdkynw0000gn/T/db5b66c674ae48dba80971d200f6feb8-pulp.mps -max -timeMode elapsed -branch -printingOptions all -solution /var/folders/7v/1ln722x97kd8bchgxpmdkynw0000gn/T/db5b66c674ae48dba80971d200f6feb8-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 6 COLUMNS
At line 255 RHS
At line 257 BOUNDS
At line 320 ENDATA
Problem MODEL has 1 rows, 62 columns and 62 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0.613382 - 0.00 seconds
Cgl0004I processed model has 1 rows, 62 columns (62 integer (62 of which binary)) and 62 elements
Cbc0038I Initial state - 1 integers unsatisfied sum - 0.476239
Cbc0038I Pass   1: suminf.    0.03291 (1) obj. -0.612507 iterations 1
Cbc0038I S

In [84]:
# Number of selected rows in the latest solution.
df.selected.sum()

41.0

In [88]:
# Total revenue and total users of the selected rows, shown as a pair.
chosen_rows = df[df.selected == 1]
chosen_rows.revenue.sum(), chosen_rows.users.sum()

(104363711.15, 146374)