PBG training (see the project in: https://github.com/facebookresearch/PyTorch-BigGraph)

1. In "~/PyTorch-BigGraph/torchbiggraph/examples/configs/" , create the config file 'italy_config.py':

def get_torchbiggraph_config():
    """Return the PyTorch-BigGraph configuration for the Italy follow graph.

    The graph has a single entity type (``user_id``, one partition) and a
    single relation (``follow``) between users, with no relation operator.

    Returns:
        dict: the configuration dictionary built below.
    """
    config = dict(
        # I/O data
        entity_path="data/italy",
        edge_paths=["data/italy/italy"],
        checkpoint_path="model/italy",

        # Graph structure
        entities={
            'user_id': {'num_partitions': 1},
        },
        relations=[{
            'name': 'follow',
            'lhs': 'user_id',
            'rhs': 'user_id',
            'operator': 'none',
        }],

        # Scoring model
        dimension=100,
        global_emb=False,

        # Training
        num_epochs=30,
        lr=0.001,

        # Misc
        hogwild_delay=2,
    )

    # Without this return the function yields None and the tools that load
    # the config file have nothing to work with.
    return config


2. In "~/PyTorch-BigGraph/", convert the edge file (e.g., ids.edges) located in ~/PyTorch-BigGraph/data/italy/ into the PBG input format. The file has one edge per line, and each edge is a pair <uids,uidd>; column 0 is used as the left-hand side and column 1 as the right-hand side:

$ torchbiggraph_import_from_tsv --lhs-col=0 --rhs-col=1 torchbiggraph/examples/configs/italy_config.py data/italy/ids.edges

3. In "~/PyTorch-BigGraph/", do the training step:

$ torchbiggraph_train torchbiggraph/examples/configs/italy_config.py -p edge_paths=data/italy/italy

4. In "~/PyTorch-BigGraph/", export the embeddings to a file:

$ torchbiggraph_export_to_tsv --entities-output italy_embeddings.tsv --relation-types-output rel.tsv torchbiggraph/examples/configs/italy_config.py

 
 - replace each \t with a single space in italy_embeddings.tsv
 - insert a header line with "num_rows num_cols" at the top of the file

Libraries

In [1]:
import csv
import numpy as np
import time
import mrpt
import math
from random import sample
from gensim.models.keyedvectors import KeyedVectors

Input reading (embeddings and ground truth)

- labeled_accounts.csv: <user_id,screen_name,label>
- usernames: <user_id,screen_name>

In [31]:
# Embeddings exported from PBG, loaded through gensim's word2vec-format reader.
vectors_file_vec = 'encodings/italy_embeddings.tsv' # replace by interactions_italy.tsv if we have interactions data

# Upper bound on the number of vectors read from the embeddings file.
cantidad = 195863
vectors = KeyedVectors.load_word2vec_format(vectors_file_vec, limit=cantidad)

# screen_name -> label, taken from columns 1 and 2 of each row
# (<user_id,screen_name,label>).
with open('bot_detection/Italy/labeled_accounts.csv', mode='r') as infile:
    ground_truth = {rows[1]: rows[2] for rows in csv.reader(infile)}

# Build both directions of the space-separated <user_id screen_name> table in
# a single pass instead of parsing the same file twice:
#   usernames: screen_name -> user_id
#   userids:   user_id     -> screen_name
usernames = dict()
userids = dict()
with open('bot_detection/Italy/usernames', mode='r') as infile:
    for rows in csv.reader(infile, delimiter=' '):
        usernames[rows[1]] = rows[0]
        userids[rows[0]] = rows[1]

# Keep only the labeled accounts whose screen name appears in the usernames table.
gt = {u: label for u, label in ground_truth.items() if u in usernames}


Indexing

In [32]:
# Size the matrix from what was actually loaded rather than from the
# hard-coded limit and dimension: if fewer than `cantidad` vectors are read,
# a matrix pre-filled with random numbers would leave noise rows in the index
# that have no entry in i_v.
n, d, k = len(vectors.vocab), vectors.vector_size, 30
target_recall = 0.95

# Every row is overwritten below, so no initial fill is needed.
data = np.empty((n, d), dtype=np.float32)

# v_i: vocabulary key -> row number in `data`; i_v: the inverse mapping.
v_i = dict()
i_v = dict()
for i, v in enumerate(vectors.vocab):
    data[i, :] = vectors[v]
    v_i[v] = i
    i_v[i] = v

index = mrpt.MRPTIndex(data)

# Autotune the index with the requested recall level and neighbour count.
index.build_autotune_sample(target_recall, k)


80/20 held-out partition for label propagation (1 fold)

In [33]:
# NOTE(review): `x` (the number of accounts to sample) is not defined in this
# cell; it must be an integer set before running it. The detector cell later
# rebinds `x` to a list, so re-running this cell afterwards will fail — confirm
# the intended value and define it here.
# Cover every position of `gt`; the previous range stopped one short, so the
# last account could never be sampled.
ints = list(range(len(gt)))
sampling = sample(ints, x) #80%-x instances
# Set membership keeps the split linear in the number of accounts.
held_out = set(sampling)

# Sampled positions go to `test`; all remaining accounts go to `val`.
val = dict()
test = dict()
for i, u in enumerate(gt):
    if i in held_out:
        test[u] = gt[u]
    else:
        val[u] = gt[u]


Bot detector (label propagation + performance measures)

In [34]:
# Parameters of this label-propagation run.
cutoff = 0.65
minConf = 0.55
lr = 0.05

# Vocabulary keys that already carry a label, and the size of the network.
labeled = set()
netsize = len(vectors.vocab)
file = open('bot_sensitivity/report_bot_detection_' + str(cutoff) + '_' + str(minConf) + '.txt', "w")  # write mode

start = time.time()
x = []
y = []
# Per-account label and confidence, keyed by the value looked up in `userids`.
labels = dict()
confidences = dict()
# Running totals: imputed bots/humans and overall bots/humans.
Totimpbot = 0
Totimphum = 0
totbots = 0
tothums = 0

# Seed the propagation with the accounts present in `val`, at confidence 1.
gtr = 0
for uid in vectors.vocab:
    u = userids[str(uid)]
    if u not in val or uid in labeled:
        continue
    labeled.add(uid)
    ulab = val[u]
    labels[u] = ulab
    confidences[u] = 1
    if ulab == 'bot':
        totbots += 1
    else:
        tothums += 1
    gtr += 1

# Initial minimum support: the first confidence floor reached by minConf
# decides the value; below every floor it is 6.
for conf_floor, support in ((0.8, 10), (0.6, 9), (0.4, 8), (0.2, 7)):
    if minConf >= conf_floor:
        minSupp = support
        break
else:
    minSupp = 6

# First propagation stage: sweep the distance threshold downwards from 1.0 and
# label accounts as 'bot' only. Labels assigned during a pass are written to
# `labels` immediately, so they are visible to accounts visited later in the
# same pass.
print('Cutoff:', str(cutoff),', MinConf:', str(minConf))    

# NOTE(review): presumably lowered so that the `threshold > cutoff` test still
# admits the pass at the nominal cutoff despite float drift — confirm.
cutoff -= 0.01
threshold = 1.0 # initial value
file.write("<--- First stage ---> \n")
while (threshold > cutoff): # first stage of bot detection
    file.write("<--- Threshold at "+str(threshold)+" ---> \n")
    # Per-pass counters; imphum is never incremented in this stage.
    impbot = 0
    imphum = 0
    for uid in vectors.vocab:
        u = userids[str(uid)]
        if uid not in labeled:
            # Collect the candidate neighbours returned by the index that lie
            # closer than the current threshold, excluding the account itself.
            knn = set()
            sims = dict()
            q = vectors[uid]
            li = index.ann(q)
            for i in li:
                vid = i_v[i]
                if vid in vectors.vocab:
                    dis = vectors.distance(uid,vid)
                    sim = 1 - dis
                else:
                    # Fallback for a key missing from the vocabulary.
                    dis = 1
                    sim = 0
                v = userids[vid]
                if (dis < threshold and vid != uid):
                    knn.add(v)
                    sims[v] = sim
            # Tally the labeled neighbours: counts per class, plus
            # similarity-times-confidence totals per class. accSim sums the
            # similarity of every labeled neighbour.
            numHums = 0
            numBots = 0
            accSim = 0
            confb = 0
            confh = 0
            for v in knn:
                if v in labels:
                    ilab = labels[v]
                    conf = confidences[v]
                    sim = sims[v]
                    accSim += sim
                    if (ilab == 'bot'): 
                        numBots += 1
                        confb += sim*conf
                    elif (ilab == 'human'): 
                        numHums += 1
                        confh += sim*conf
            # Label as bot when bots outnumber humans, exceed the minimum
            # support, and the normalised margin exceeds minConf.
            if (numBots > numHums and numBots > minSupp):
                conf = (confb-confh)/accSim
                if conf > minConf:
                    labeled.add(uid)
                    labels[u] = 'bot'
                    confidences[u] = conf
                    file.write('bot detected, uid: '+ str(uid) +', numBots: '+str(numBots)+', numHums: '+str(numHums)+', conf: %.3f' % conf+'\n')
                    impbot += 1
                    Totimpbot += 1
                    totbots += 1
            
    print('ImpBot at %5.2f' % threshold, ' - Bots: ', impbot, ' - Humans: ', imphum)
    file.write('ImpBot: '+str(impbot)+' ImpHum: '+str(imphum)+'\n')
    # Tighten the threshold and relax the required support for the next pass:
    # by 0.5 once the new threshold is below 0.7, by 1 otherwise.
    threshold -= lr
    if threshold < 0.7:
        minSupp -= 0.5
    else:
        minSupp -= 1

        
# Second propagation stage: sweep the distance threshold upwards from 0.25 in
# steps of `lr`, labeling both humans and bots, until every vocabulary entry is
# labeled or the threshold exceeds 1.0. As in the first stage, labels assigned
# during a pass are visible to accounts visited later in the same pass.
threshold = 0.25 # initial value for second stage
file.write("<--- Second stage ---> \n")
while (len(labeled) < netsize and threshold <= 1.0): 
    file.write("<--- Threshold at "+str(threshold)+" ---> \n")
    # Per-pass counters. gtr counts seed labels added in this pass; it is not
    # read anywhere in the visible code.
    gtr = 0
    impbot = 0
    imphum = 0
    for uid in vectors.vocab:
        u = userids[str(uid)]
        # Seed any `val` account that is still unlabeled, at confidence 1.
        if (u in val and uid not in labeled): 
            labeled.add(uid)
            ulab = val[u]
            labels[u] = ulab
            confidences[u] = 1
            gtr += 1
        if uid not in labeled:
            # Collect the candidate neighbours returned by the index that lie
            # closer than the current threshold, excluding the account itself.
            knn = set()
            sims = dict()
            q = vectors[uid]
            li = index.ann(q)
            for i in li:
                vid = i_v[i]
                if vid in vectors.vocab:
                    dis = vectors.distance(uid,vid)
                    sim = 1 - dis
                else:
                    # Fallback for a key missing from the vocabulary.
                    dis = 1
                    sim = 0
                v = userids[vid]
                if (dis < threshold and vid != uid):
                    knn.add(v)
                    sims[v] = sim
            # Tally the labeled neighbours: counts per class, plus
            # similarity-times-confidence totals per class. accSim sums the
            # similarity of every labeled neighbour.
            numHums = 0
            numBots = 0
            accSim = 0
            confb = 0
            confh = 0
            for v in knn:
                if v in labels:
                    ilab = labels[v]
                    conf = confidences[v]
                    sim = sims[v]
                    accSim += sim
                    if (ilab == 'bot'): 
                        numBots += 1
                        confb += sim*conf
                    elif (ilab == 'human'): 
                        numHums += 1
                        confh += sim*conf
            # Bot branch: same vote as in the first stage, but a bot label is
            # only assigned once the threshold has reached the cutoff.
            if (numBots > numHums and numBots > minSupp):
                conf = (confb-confh)/accSim
                if conf > minConf and threshold >= cutoff:
                    labeled.add(uid)
                    labels[u] = 'bot'
                    confidences[u] = conf
                    file.write('bot detected, uid: '+ str(uid) +', numBots: '+str(numBots)+', numHums: '+str(numHums)+', conf: %.3f' % conf+'\n')
                    impbot += 1
                    Totimpbot += 1
                    totbots += 1
            # Human branch: humans must outnumber bots and number more than 2
            # (a fixed support, independent of minSupp).
            elif (numHums > numBots and numHums > 2):
                conf = (confh-confb)/accSim
                if conf > minConf:
                    labeled.add(uid)
                    labels[u] = 'human'
                    confidences[u] = conf
                    imphum += 1
                    Totimphum += 1
                    tothums += 1
    
    print('ImpBot at %5.2f' % threshold, ' - Bots: ', impbot, ' - Humans: ', imphum)
    file.write('ImpBot: '+str(impbot)+' ImpHum: '+str(imphum)+'\n')
    # Loosen the threshold for the next pass. The required bot support is
    # unchanged while the new threshold is below the cutoff, grows by 0.5
    # between the cutoff and 0.7, and by 1 from 0.7 upwards.
    threshold += lr
    if threshold < 0.7 and threshold >= cutoff:
        minSupp += 0.5
    elif threshold >= 0.7:
        minSupp += 1

end = time.time()

# Share of bots among all imputed labels. When nothing was imputed the rate
# is reported as 0 instead of raising ZeroDivisionError.
totimps = Totimpbot + Totimphum
botrate = Totimpbot/totimps if totimps else 0.0

# NOTE(review): rates above 0.2 are reported as 0; the reason is not evident
# from this file — confirm this is intended.
if botrate > 0.2: botrate = 0

file.write('Totals; ImpBot: '+ str(Totimpbot) + ', ImpHum: ' + str(Totimphum)+'\n')
file.write('Bot-rate: %.3f' % botrate +'\n')
file.write("Running time: "+ str(end - start)+'\n')
print('Totals; ImpBot: ', Totimpbot, ', ImpHum: ' , Totimphum)



def _ratio(num, den):
    """Return num/den, or 0.0 when the denominator is zero."""
    return num / den if den else 0.0


# Confusion counts over the held-out accounts that received a label.
# First letter: predicted class, second letter: nominal class (b=bot, h=human).
bb = 0
bh = 0
hb = 0
hh = 0
for u in test:
    nom = test[u]
    if u not in labels:
        continue
    pre = labels[u]
    if pre == 'bot' and nom == 'bot':
        bb += 1
    elif pre == 'bot' and nom == 'human':
        bh += 1
    elif pre == 'human' and nom == 'bot':
        hb += 1
    elif pre == 'human' and nom == 'human':
        hh += 1

# 'bot' is the positive class.
tp = bb
fn = hb
fp = bh
tn = hh

# Each metric falls back to 0.0 when its denominator is zero (for example when
# one class never appears among the labeled held-out accounts) instead of
# raising ZeroDivisionError.
TPR = _ratio(tp, tp+fn)
TNR = _ratio(tn, tn+fp)
PPV = _ratio(tp, tp+fp)
NPV = _ratio(tn, tn+fn)
ACC = _ratio(tp+tn, tp+tn+fp+fn)
F1 = _ratio(2*tp, 2*tp+fn+fp)
MCC = _ratio(tp*tn-fp*fn, math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn)))

# NOTE(review): `cutoff` was lowered by 0.01 before the first stage, so the
# value written here is 0.01 below the one used in the report file name —
# confirm this is intended.
file.write('Cutoff: ' + str(cutoff) + ', MinConf: ' + str(minConf) + '\n') 
file.write('TPR: %.3f' % TPR + ', TNR: %.3f' % TNR + ', PPV: %.3f' % PPV + ', NPV: %.3f' % NPV + '\n')
file.write('ACC: %.3f' % ACC + ', F1: %.3f' % F1 + ', MCC: %.3f' % MCC + '\n')
file.close()
print('ACC: %.3f' % ACC,', F1: %.3f' % F1,', MCC: %.3f' % MCC)

Cutoff: 0.65 , MinConf: 0.55
ImpBot at  1.00  - Bots:  96  - Humans:  0
ImpBot at  0.95  - Bots:  17  - Humans:  0
ImpBot at  0.90  - Bots:  8  - Humans:  0
ImpBot at  0.85  - Bots:  15  - Humans:  0
ImpBot at  0.80  - Bots:  19  - Humans:  0
ImpBot at  0.75  - Bots:  212  - Humans:  0
ImpBot at  0.70  - Bots:  2311  - Humans:  0
ImpBot at  0.65  - Bots:  8895  - Humans:  0
ImpBot at  0.25  - Bots:  0  - Humans:  0
ImpBot at  0.30  - Bots:  0  - Humans:  0
ImpBot at  0.35  - Bots:  0  - Humans:  4
ImpBot at  0.40  - Bots:  0  - Humans:  151
ImpBot at  0.45  - Bots:  0  - Humans:  1690
ImpBot at  0.50  - Bots:  0  - Humans:  8842
ImpBot at  0.55  - Bots:  0  - Humans:  19661
ImpBot at  0.60  - Bots:  0  - Humans:  25731
ImpBot at  0.65  - Bots:  3673  - Humans:  22439
ImpBot at  0.70  - Bots:  2820  - Humans:  14730
ImpBot at  0.75  - Bots:  1141  - Humans:  30484
ImpBot at  0.80  - Bots:  726  - Humans:  37573
ImpBot at  0.85  - Bots:  474  - Humans:  4580
ImpBot at  0.90  - Bots:  31 