In [1]:
import numpy as np
import pandas as pd
import json
import re
import torch
import opacus

# Plan for Ad Recommendation

- Logistic regression will take in a normalized vector in R41 (interests)
- Output a (normalized) vector in R39 (categories in the ad dataset)
- Train log reg using PyTorch with opacus privacy engine. Delta 1/6k (size of dataset)
- Trained by sending 41C1, 41C2, 41C3 combinations of interests and their corresponding ad categories (all normalized). (We will  construct this mapping manually — i.e map each interest to ad categories)
- Hopefully this shld generalize well to larger combinations of interests
- Idea for prediction: each user has an interests vector; initially evenly divided
- Run a forward pass of log reg on interests vector, we get a normalized categories vector
- We'll then sample from this normalized vector as a probability distribution over categories. Then select at random an image with this category and display it
- If user clicks on ad, bump corresponding interests vector entries by epsilon. 
When to retrain log reg? Have an ad click diff?


In [2]:
with open("annotations_images/image/Topics_List.txt", encoding="utf8", errors="ignore") as f:
    l= f.read().replace('\x00','')
    mapping = {}
    for item in l.split("\n"):
        if item:
            idx, content = item.split("\t")
            mapping[idx] = re.sub(r'\([^)]*\)', "", content)
mapping

{'1': '"Restaurants, cafe, fast food" ',
 '2': '"Chocolate, cookies, candy, ice cream" ',
 '3': '"Chips, snacks, nuts, fruit, gum, cereal, yogurt, soups" ',
 '4': '"Seasoning, condiments, ketchup" ',
 '5': '"Pet food" ',
 '6': '"Alcohol" ',
 '7': '"Coffee, tea" ',
 '8': '"Soda, juice, milk, energy drinks, water" ',
 '9': '"Cars, automobiles " ',
 '10': '"Electronics " ',
 '11': '"Phone, TV and internet service providers" ',
 '12': '"Financial services " ',
 '13': '"Education " ',
 '14': '"Security and safety services " ',
 '15': '"Software " ',
 '16': '"Other services " ',
 '17': '"Beauty products and cosmetics " ',
 '18': '"Healthcare and medications " ',
 '19': '"Clothing and accessories " ',
 '20': '"Baby products " ',
 '21': '"Games and toys " ',
 '22': '"Cleaning products " ',
 '23': '"Home improvements and repairs " ',
 '24': '"Home appliances " ',
 '25': '"Vacation and travel " ',
 '26': '"Media and arts " ',
 '27': '"Sports equipment and activities" ',
 '28': '"Shopping " ',
 '

In [3]:
# This is handmade
topic_mapping = {"1": [8, 9, 15, 24], "2": [8, 9, 24], 
                 "3": [8, 9, 24], "4": [], "5": [], "6": [8], "7": [8, 9, 24], 
                 "8": [8, 9, 24], "9":[11, 16, 27], "10": [21, 34, 11], "11":[21, 34, 11], 
                 "12": [28, 29, 13, 10], "13": [31, 40, 4, 5], "14": [34, 11],
                 "15": [34, 11], "16": [28, 39, 32, 31], "17": [5, 13, 18, 36, 37],
                 "18": [15, 11, 36], "19": [13, 14, 36], "20": [15,37, 23], "21": [15, 39, 25, 23],
                 "22": [15], "23": [15], "24": [15, 34], "25": [16, 18, 24, 38, 32], "26": [5, 39],
                 "27": [27], "28":[28, 15, 14, 13, 39], "29": [29], "30": [38, 16], "31": [31, 32], 
                 "32": [31], "33": [], "34": [24, 17], "35": [33], "36": [33], "37": [22], "38": [29, 28, 4], 
                 "39": [3, 19, 20, 30, 31, 32, 33, 35]
                }

In [27]:
pd.read_csv("results.csv").head()

Unnamed: 0,category
0,religion
1,divorce
2,parents
3,black voices
4,good news


In [5]:
inv_map = {}
for k,v in topic_mapping.items():
    for x in v:
        inv_map.setdefault(x,[]).append(k)

In [6]:
import itertools
keys = list(inv_map.keys())

In [7]:
combo1 = list(itertools.combinations(keys, 1))
combo2 = list(itertools.combinations(keys, 2))
combo3 = list(itertools.combinations(keys, 3))

In [8]:
X_train = []
Y_train = []

In [9]:
for combo in [combo1, combo2, combo3]:
    for item in combo:
        x_arr = np.zeros(41)
        y_arr = np.zeros(40)

        for idx in item:
            x_arr[idx] = 1
            for v in inv_map[idx]:
                y_arr[int(v)] += 1
        X_train.append(x_arr / np.linalg.norm(x_arr, ord=1))
        Y_train.append(y_arr / np.linalg.norm(y_arr, ord=1))

In [10]:
X = np.array(X_train).astype(np.float32)
Y = np.array(Y_train).astype(np.float32)

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1)

In [13]:
import torch.nn as nn
import torch.optim as optim
from opacus import PrivacyEngine
from torchvision import datasets, transforms

In [14]:
class AdModel(torch.nn.Module):
    def __init__(self):
        super(AdModel, self).__init__()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(41, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 64),
            torch.nn.ReLU(),
            torch.nn.Linear(64, 40),
            torch.nn.ReLU(),
        )

    def forward(self, x):
        return self.layers(x)

In [21]:
def train(model, train_loader, optimizer, privacy_engine, epoch):
    model.train()
    criterion = nn.HuberLoss()
    losses = []
    for _batch_idx, (data, target) in enumerate(tqdm(train_loader)):
        data, target = data, target
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())

   
    epsilon, best_alpha = privacy_engine.accountant.get_privacy_spent(
        delta=1e-5
    )
    print(f"Train Epoch: {epoch} \t"
        f"Loss: {np.mean(losses):.6f} "
        f"(ε = {epsilon:.2f}, δ = {1/6000}) for α = {best_alpha}")
    return
   


def test(model,test_loader):
    model.eval()
    criterion = nn.HuberLoss()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in tqdm(test_loader):
            data, target = data, target
            output = model(data)
            test_loss += criterion(output, target).item()  # sum up batch loss

    test_loss /= len(test_loader.dataset)

    print(f"\nTest set: Average loss: {test_loss}")

    return correct / len(test_loader.dataset)

In [22]:
from torch.utils.data import Dataset
class AdDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.Y[idx]

training_data = AdDataset(X_train, y_train)
testing_data = AdDataset(X_test, y_test)

In [23]:
train_loader = torch.utils.data.DataLoader(training_data,batch_size=128,pin_memory=True)
test_loader = torch.utils.data.DataLoader(testing_data,batch_size=128, shuffle=True,pin_memory=True)

In [24]:
from tqdm import tqdm
import torch.nn.functional as F

In [25]:
run_results = []
model = AdModel()

optimizer = optim.SGD(model.parameters(), lr=0.05, momentum=0.5)
privacy_engine = PrivacyEngine(secure_mode=False)
model, optimizer, train_loader = privacy_engine.make_private(
    module=model,
    optimizer=optimizer,
    data_loader=train_loader,
    noise_multiplier=1.0,
    max_grad_norm=1.0,
)

for epoch in range(1, 30):
    train(model, train_loader, optimizer, privacy_engine, epoch)
run_results.append(test(model,test_loader))

if len(run_results) > 1:
    print(
        "Accuracy averaged over {} runs: {:.2f}% ± {:.2f}%".format(
            len(run_results), np.mean(run_results) * 100, np.std(run_results) * 100
        )
    )

100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 220.69it/s]


Train Epoch: 1 	Loss: 0.002370 (ε = 1.65, δ = 0.00016666666666666666) for α = 7.3


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 239.16it/s]


Train Epoch: 2 	Loss: 0.002249 (ε = 1.90, δ = 0.00016666666666666666) for α = 7.0


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 242.66it/s]


Train Epoch: 3 	Loss: 0.002182 (ε = 2.11, δ = 0.00016666666666666666) for α = 6.7


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 241.86it/s]


Train Epoch: 4 	Loss: 0.002097 (ε = 2.31, δ = 0.00016666666666666666) for α = 6.6


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 246.54it/s]


Train Epoch: 5 	Loss: 0.002009 (ε = 2.49, δ = 0.00016666666666666666) for α = 6.4


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 247.29it/s]


Train Epoch: 6 	Loss: 0.001974 (ε = 2.66, δ = 0.00016666666666666666) for α = 6.3


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 243.25it/s]


Train Epoch: 7 	Loss: 0.001938 (ε = 2.82, δ = 0.00016666666666666666) for α = 6.1


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 246.11it/s]


Train Epoch: 8 	Loss: 0.001896 (ε = 2.97, δ = 0.00016666666666666666) for α = 6.0


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 245.14it/s]


Train Epoch: 9 	Loss: 0.001910 (ε = 3.12, δ = 0.00016666666666666666) for α = 5.9


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 241.83it/s]


Train Epoch: 10 	Loss: 0.001896 (ε = 3.26, δ = 0.00016666666666666666) for α = 5.8


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 244.58it/s]


Train Epoch: 11 	Loss: 0.001883 (ε = 3.40, δ = 0.00016666666666666666) for α = 5.7


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 243.22it/s]


Train Epoch: 12 	Loss: 0.001898 (ε = 3.53, δ = 0.00016666666666666666) for α = 5.6


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 247.66it/s]


Train Epoch: 13 	Loss: 0.001877 (ε = 3.66, δ = 0.00016666666666666666) for α = 5.5


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 241.85it/s]


Train Epoch: 14 	Loss: 0.001895 (ε = 3.79, δ = 0.00016666666666666666) for α = 5.4


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 242.33it/s]


Train Epoch: 15 	Loss: 0.001876 (ε = 3.91, δ = 0.00016666666666666666) for α = 5.3


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 243.58it/s]


Train Epoch: 16 	Loss: 0.001878 (ε = 4.03, δ = 0.00016666666666666666) for α = 5.2


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 248.01it/s]


Train Epoch: 17 	Loss: 0.001891 (ε = 4.15, δ = 0.00016666666666666666) for α = 5.1


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 243.29it/s]


Train Epoch: 18 	Loss: 0.001852 (ε = 4.26, δ = 0.00016666666666666666) for α = 5.1


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 247.83it/s]


Train Epoch: 19 	Loss: 0.001859 (ε = 4.38, δ = 0.00016666666666666666) for α = 5.0


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 248.34it/s]


Train Epoch: 20 	Loss: 0.001859 (ε = 4.49, δ = 0.00016666666666666666) for α = 4.9


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 247.38it/s]


Train Epoch: 21 	Loss: 0.001869 (ε = 4.59, δ = 0.00016666666666666666) for α = 4.9


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 242.20it/s]


Train Epoch: 22 	Loss: 0.001846 (ε = 4.70, δ = 0.00016666666666666666) for α = 4.8


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 243.62it/s]


Train Epoch: 23 	Loss: 0.001861 (ε = 4.81, δ = 0.00016666666666666666) for α = 4.7


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 243.71it/s]


Train Epoch: 24 	Loss: 0.001859 (ε = 4.91, δ = 0.00016666666666666666) for α = 4.7


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 245.32it/s]


Train Epoch: 25 	Loss: 0.001865 (ε = 5.01, δ = 0.00016666666666666666) for α = 4.6


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 248.44it/s]


Train Epoch: 26 	Loss: 0.001857 (ε = 5.11, δ = 0.00016666666666666666) for α = 4.6


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 247.01it/s]


Train Epoch: 27 	Loss: 0.001828 (ε = 5.21, δ = 0.00016666666666666666) for α = 4.5


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 248.60it/s]


Train Epoch: 28 	Loss: 0.001880 (ε = 5.30, δ = 0.00016666666666666666) for α = 4.5


100%|██████████████████████████████████████████| 47/47 [00:00<00:00, 250.20it/s]


Train Epoch: 29 	Loss: 0.001856 (ε = 5.40, δ = 0.00016666666666666666) for α = 4.5


100%|███████████████████████████████████████████| 6/6 [00:00<00:00, 1023.75it/s]


Test set: Average loss: 1.670921923405856e-05





In [26]:
# Example input:

v = np.zeros(41).astype('float32')
v[36] = 1
v[37] = 1
model.eval()
op = model(torch.from_numpy(v)).detach().numpy()

# Remove those with negative or low likelihood

op[op < 0.05] = 0

# Normalize to give probability distribution

eva = op / np.linalg.norm(op, ord=1)

[(i, v) for i, v in enumerate(eva) if v > 0]

# We have a distribution over ad categories now: we will sample an image in the prediction service using this

[(1, 0.22584109),
 (9, 0.20723513),
 (11, 0.16781266),
 (20, 0.22721857),
 (25, 0.17189254)]