In [1]:
import pickle
import numpy as np
import time
from os.path import abspath

In [2]:
labels = ['pos', 'neg', 'neutral']
pickle_in = open("hashtag.pickle","rb")
[nodes, PI, SI] = pickle.load(pickle_in)
num_nodes = len(nodes)
print("Number of nodes = {}".format(num_nodes))

Number of nodes = 1055


In [3]:
class Timer:    
    """ Timer """
    def __enter__(self):
        self.start = time.clock() # start
        return self

    def __exit__(self, *args):
        self.end = time.clock() # end
        self.i = self.end - self.start # time taken

In [4]:
def get_psi(SI):
    PSI = np.zeros((num_nodes, num_nodes))
    for i in range(num_nodes):
        for j in range(num_nodes):
            try:
                PSI[i][j] = SI[nodes[i], nodes[j]]
                PSI[j][i] = SI[nodes[i], nodes[j]]
            except:
                try:
                    PSI[i][j] = SI[nodes[j], nodes[i]]
                    PSI[j][i] = SI[nodes[j], nodes[i]]
                except:
                    None
    return PSI

In [5]:
def get_phi(PI):
    PHI = np.zeros((num_nodes, len(labels)))
    for node in range(num_nodes):
        for label in range(len(labels)):
            try:
                PHI[node][label] = PI[labels[label], nodes[node]]
            except:
                PHI[node][label] = 0.0
    return PHI   

In [6]:
def getMessage(i, PHIi, PSIi, label_j, labels, neighbours, messages):
    m = { label: np.zeros(messages[label].shape[0]) for label in labels } # initialise
    zeros = np.zeros(len(PSIi))

    for neighbour in neighbours:
        S = set(neighbours) - {neighbour}
        for label in labels: # compute contribution to each neighbour from all other neighbours of i
            m[label][neighbour] = np.prod([ messages[label][k,i] for k in S ]) + .00000001

    return sum( np.multiply(PHIi[i] * (PSIi if label_i == label_j else zeros), m[label_i]) for i, label_i in enumerate(labels))

In [7]:
def getScores(i, PHIi, labels, neighbours, messages):
    scores = { label: PHIi[label_i] * np.prod([ messages[label][j,i] for j in neighbours ]) for label_i, label in enumerate(labels) }
    # normalise scores
    alpha = sum( scores[label] for label in labels )
    if alpha:
        for label in labels:
            scores[label] *= 1 / alpha

    return scores

In [8]:
def getLabels(scores):
    return [ max(score, key=score.get) for score in scores ]

In [9]:
def LBP(labels, nodes, PI, SI):
    with Timer() as t: # do some extra initialisation
        PSI = get_psi(SI) # Convert the SI value to matrix representation
        PHI = get_phi(PI) # Convert sentiment probablity to matrix representation
        messages = { label: (PSI > 0).astype(float) for label in labels } # Initialize the message values to 1 for co-occuring nodes 
        neighbours = [ list(np.nonzero(PSI[i])[0]) for i in range(num_nodes) ] # extract neighbours for all nodes
        print('PSI = {}, PHI = {}'.format(PHI.shape, PSI.shape))
    print("\nStarting propagation on {} node hashtag graph".format(num_nodes))

    loops = 0 # loop variable t
    while True:
        with Timer() as t: # compute messages
            loops += 1
            old_messages = { label: messages[label].copy() for label in labels } # archive messages
#             print('1')
            for i in range(num_nodes): # for each node
                for label in labels: # compute message from node i to its neighbours
                    messages[label][i,:] = getMessage(i, PHI[i,:].flatten(), PSI[i,:].flatten(), label, labels, neighbours[i], old_messages)
#                 print('2')
                alpha = [ alpha for alpha in map(lambda x : 1/x if x else 1, np.sum([ messages[label][i,:] for label in labels ], axis=0)) ] # compute normaliser
                for label in labels:
                    messages[label][i,:] *= alpha # normalise messages
#         print('3')
        print("[{:.3f}s] Loop {} completed.\tMessage change: {}".format(t.i, loops, np.sum([ abs(old_messages[label] - messages[label]) for label in labels ])))

        if np.product([ np.allclose(old_messages[label], messages[label]) for label in labels ]) or loops > 10: # halt if messages stop changing
            break

    print("Propagation complete.\n")
    
    with Timer() as t: # compute final labels
        scores = [ getScores(i, PHI[i,:], labels, neighbours[i], messages) for i in range(num_nodes)]
        results = getLabels(scores)

    print("[{:.3f}s] Final labels computed. Objective value: {}".format(t.i, sum( max(score.values()) for score in scores )))

    return results, scores

In [10]:
def printResults(labels, nodes, results, scores):
    filename = 'LBP_ouput.results'
    with open(filename, 'w') as fo:
        fo.write("ID\tNode\tLabel\t{}\n".format('\t'.join(labels)))
        for i in range(len(nodes)):
            fo.write("{}\t{}\t{}\t{}\n".format(i, nodes[i], results[i], '\t'.join(str(scores[i][label]) for label in labels)))
            
    print("Results written to '{}'.".format(abspath(filename)))

In [11]:
results, scores = LBP(labels, nodes, PI, SI) # run propagation
printResults(labels, nodes, results, scores) # print results

  after removing the cwd from sys.path.


PSI = (1055, 3), PHI = (1055, 1055)

Starting propagation on 1055 node hashtag graph


  


[208.611s] Loop 1 completed.	Message change: 459308.00000000023
[215.800s] Loop 2 completed.	Message change: 74.64686523136525
[259.132s] Loop 3 completed.	Message change: 3.3754105792103863
[219.430s] Loop 4 completed.	Message change: 0.0018554937039891636
[220.667s] Loop 5 completed.	Message change: 5.317366202033022e-05
Propagation complete.

[0.438s] Final labels computed. Objective value: 1000.6665139425836
Results written to '/home/hadoop/Documents/BD_Project/MeTooAnalysis-master/twitter/ouput.results'.
