#### Tested with :
- Linux Ubuntu
- Python 3.5
- Cuda 8
- Conda package for PyTorch

In [1]:
from __future__ import print_function
import argparse

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable

In [3]:
import numpy as np

In [4]:
PREFER_CUDA = True

In [5]:
use_cuda = PREFER_CUDA and torch.cuda.is_available()

In [6]:
if not(use_cuda == PREFER_CUDA):
    print('CUDA SETUP NOT AS EXCEPTED')
else:
    print('OK')

OK


In [7]:
# args.seed

nb_features = 250 # the number of possible features
dim_embed = 50

### Model assumption
- We assume all model variables are binary 0/1 valued
- We represent the X input vector has the sparse coding of its "1" indices
- Indices start at 1 so that we can reserve 0 for padding in the embeddings

In [8]:
# a batch of 2 samples of 3 indices each
X = Variable(torch.LongTensor([[11,20,4],[30,10,20],]))

In [9]:
# Stores the bias term
B = Variable(torch.randn((1)).type(torch.FloatTensor), requires_grad=True)

# Stores the weights for the linear terms
embeddingL = nn.Embedding(nb_features, 1, padding_idx=None, max_norm=None, norm_type=2)

# Stores the weights for the quadratic FM terms
embeddingQ = nn.Embedding(nb_features, dim_embed, padding_idx=None, max_norm=None, norm_type=2)

### The linear part

In [10]:
eL = embeddingL(X)
eL.size()

torch.Size([2, 3, 1])

In [11]:
logitL = eL.sum(dim=1)
logitL.size()

torch.Size([2, 1, 1])

### The Quadratic-FM part using the O(kn) formulation from Steffen Rendle

In [12]:
eQ = embeddingQ(X)
eQ.size()

torch.Size([2, 3, 50])

In [13]:
logitFM1 = eQ.mul(eQ).sum(1).sum(2)
logitFM1.size()

torch.Size([2, 1, 1])

In [14]:
z = eQ.sum(dim=1)# sum across features
z.size()

torch.Size([2, 1, 50])

In [15]:
z2 = z.mul(z) # element-wise product
z2.size()

torch.Size([2, 1, 50])

In [16]:
logitFM2 = z2.sum(dim=2) # sum across embedding dimensions
logitFM2.size()

torch.Size([2, 1, 1])

In [17]:
logitFM = (logitFM1 - logitFM2)*0.5
logitFM.size()

torch.Size([2, 1, 1])

### Total logit and binary prediction

In [18]:
logit = (logitL + logitFM).squeeze(dim=-1).squeeze(dim=-1)

logit+= B.expand(1, logit.size()[0]).transpose(0,1)

logit#.size()

Variable containing:
 16.1722
 -3.5943
[torch.FloatTensor of size 2]

In [19]:
pred = F.sigmoid(logit)
pred#.size()

Variable containing:
 1.0000
 0.0267
[torch.FloatTensor of size 2]