In [None]:
import numpy as np
import networkx as nx
import pickle
import community
from operator import itemgetter
from scipy import integrate
from matplotlib import pyplot as plt
%matplotlib inline
import copy
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report

## Run configuration.
## [abo | gun | blm]
campaign = 'blm'
## [followers | friends]
connection_type = 'followers'
## [2018 | 2020]
year = 2018


## Read the connection list.
## NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
connection_list_path = "data/social_media/{}/ea_{}_{}.pkl".format(campaign, connection_type, str(year))
with open(connection_list_path, 'rb') as connection_file:
    connection_list = pickle.load(connection_file)
print("{} list read!".format(connection_type))

## Read the connection graph.
## nx.read_gpickle was removed in NetworkX 3.0; it was a thin wrapper around
## pickle.load, so reading the file with pickle works on old and new versions.
filtered_graph_file_path = 'data/social_media/{}/graph_edges/ea_filtered_graph_edge_list_{}.gpickle'.format(campaign, str(year))
with open(filtered_graph_file_path, 'rb') as graph_file:
    G = pickle.load(graph_file)
print('Filtered G read!')
print("# of nodes in G:", G.number_of_nodes())
print("# of edges in G:", G.number_of_edges())

## Disabled step, kept for reference: add the nodes not having any common
## followers to the graph.
# diff_users = list(set(connection_list.keys()).difference(set(G.nodes)))
# print("# of diff users:", len(diff_users))
# for user in diff_users:
#     G.add_node(user)
#
# print("# of nodes in G after addition:", G.number_of_nodes())
# print("# of edges in G after addition:", G.number_of_edges())

## Drop nodes without any edge; materialise the iterator first because the
## graph is mutated while removing.
G.remove_nodes_from(list(nx.isolates(G)))
print("# of nodes in G after removal of isolates:", G.number_of_nodes())
print("# of edges in G after removal of isolates:", G.number_of_edges())


In [None]:
## Split the users that are present in the graph into labeled and unlabeled ones.
node_list = list(G.nodes)
print("len(node_list):", len(node_list))

## Set copy of the node list so that the membership tests below are O(1)
## instead of a linear scan of the list for every user.
node_set = set(node_list)

## NOTE(review): `ea_type` is not defined anywhere in the visible notebook; it
## has to be set (e.g. next to `campaign` in the configuration cell) before this
## cell runs, otherwise a NameError is raised on a fresh kernel.
## NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
users_leanings_path = 'data/from_anu/v1/{}_{}_users_leanings_labels.pkl'.format(campaign, ea_type)
with open(users_leanings_path, 'rb') as leanings_file:
    users_leanings = pickle.load(leanings_file)

## A leaning of -1 marks a user without a known label.
labeled_users = [user_id for user_id in users_leanings if users_leanings[user_id] != -1 and user_id in node_set]
unlabeled_users = [user_id for user_id in users_leanings if users_leanings[user_id] == -1 and user_id in node_set]
## Labeled users come first: later cells rely on this ordering when they index
## the rows of the propagation matrices.
all_users = labeled_users + unlabeled_users
print('len(all_users):', len(all_users))

labeled_labels = [users_leanings[user_id] for user_id in labeled_users]
print('len(labeled_users):', len(labeled_users))
print('len(labeled_labels):', len(labeled_labels))


## First, apply 10-fold CV to check that the label propagation works.

In [None]:
## 10-fold stratified cross-validation of the label propagation on the labeled
## users: in every fold the test users are treated as unlabeled and their
## propagated label is compared with the known one.
pred_results = []
true_labels = []

## Propagation hyper-parameters (loop invariant, so computed once).
alpha = 0.85
mu = (1/alpha) - 1
beta = mu/(1+mu)

skf = StratifiedKFold(n_splits=10, random_state=None, shuffle=False)
for train_index, test_index in skf.split(labeled_users, labeled_labels):
    ## Plain comprehensions instead of itemgetter(*idx): itemgetter returns a
    ## bare item (not a tuple) when a fold holds exactly one index.
    fold_train_labeled_users = [labeled_users[i] for i in train_index]
    fold_train_labeled_labels = [labeled_labels[i] for i in train_index]
    fold_test_labeled_users = [labeled_users[i] for i in test_index]
    fold_test_labeled_labels = [labeled_labels[i] for i in test_index]

    ## Row order of every matrix below: train users, then test users, then
    ## unlabeled users.
    fold_all_users = fold_train_labeled_users + fold_test_labeled_users + unlabeled_users
    print(len(fold_all_users), len(all_users))

    ## One-hot initial labels; only the training users get a non-zero row.
    fold_init_labels = np.zeros(shape=(len(fold_all_users), 2))
    for i, user_id in enumerate(fold_train_labeled_users):
        fold_init_labels[i][users_leanings[user_id]] = 1.

    ## to_numpy_matrix was removed in NetworkX 3.0; use a plain ndarray and
    ## explicit matrix products (@) instead of np.matrix semantics.
    W_fold = nx.to_numpy_array(G, nodelist=fold_all_users)
    print(W_fold.shape, np.sum(W_fold))

    ## Apply label propagation by paulmorio: F = beta * (I - alpha*S)^-1 * Y
    ## with S = D^-1/2 * W * D^-1/2.
    ## The matrix only covers users found in users_leanings, so a user can end
    ## up without any edge in it; such rows get a zero normalisation factor
    ## instead of 0 ** -0.5 = inf, which would turn S into NaNs.
    degrees_fold = W_fold.sum(axis=1)
    inv_sqrt_degrees_fold = np.zeros_like(degrees_fold, dtype=float)
    has_edges_fold = degrees_fold > 0
    inv_sqrt_degrees_fold[has_edges_fold] = np.power(degrees_fold[has_edges_fold], -0.5)
    D_fold = np.diag(inv_sqrt_degrees_fold)
    S_fold = D_fold @ W_fold @ D_fold

    ## Propagation with regularization. Solving the linear system gives the
    ## same result as multiplying by the explicit inverse, but is cheaper and
    ## numerically more stable.
    predictions_score_fold = beta * np.linalg.solve(np.identity(W_fold.shape[0]) - alpha*S_fold, fold_init_labels)

    ## The test users occupy the rows right after the training users.
    n_train = len(fold_train_labeled_users)
    n_test = len(fold_test_labeled_users)
    fold_test_scores = predictions_score_fold[n_train:n_train + n_test]
    pred_results.extend(np.argmax(fold_test_scores, axis=1).tolist())
    true_labels.extend(fold_test_labeled_labels)


## Overall and per-class share of correctly predicted labels (0 = left, 1 = right).
left = 0
right = 0
counter = 0
for predicted_label, true_label in zip(pred_results, true_labels):
    if predicted_label == true_label:
        if true_label == 0:
            left += 1
        elif true_label == 1:
            right += 1
        counter += 1

n_left_labels = len([a for a in labeled_labels if a == 0])
n_right_labels = len([a for a in labeled_labels if a == 1])

print('acc:', float(counter)/len(true_labels))
print('left acc:', float(left)/n_left_labels)
print('right acc:', float(right)/n_right_labels)
print(n_left_labels)
print(n_right_labels)

print(classification_report(true_labels, pred_results))


## Second, apply label propagation to the unknown users to assign labels and probabilities.

In [None]:
## Propagate the known labels over the whole graph and export a left/right
## score for every user.

## One-hot initial labels; rows follow all_users (labeled users first), so only
## the first len(labeled_users) rows are non-zero.
init_labels = np.zeros(shape=(len(all_users), 2))
for i, user_id in enumerate(labeled_users):
    init_labels[i][users_leanings[user_id]] = 1.

## to_numpy_matrix was removed in NetworkX 3.0; use a plain ndarray and
## explicit matrix products (@) instead of np.matrix semantics.
W = nx.to_numpy_array(G, nodelist=all_users)
print(W.shape, np.sum(W))

alpha = 0.85
mu = (1/alpha) - 1
beta = mu/(1+mu)

## S = D^-1/2 * W * D^-1/2. The matrix only covers users found in
## users_leanings, so a user can end up without any edge in it; such rows get a
## zero normalisation factor instead of 0 ** -0.5 = inf (which would turn S
## into NaNs).
degrees = W.sum(axis=1)
inv_sqrt_degrees = np.zeros_like(degrees, dtype=float)
has_edges = degrees > 0
inv_sqrt_degrees[has_edges] = np.power(degrees[has_edges], -0.5)
D = np.diag(inv_sqrt_degrees)
S = D @ W @ D

## Propagation with regularization: F = beta * (I - alpha*S)^-1 * Y, computed
## by solving the linear system rather than forming the inverse.
predictions_score = beta * np.linalg.solve(np.identity(W.shape[0]) - alpha*S, init_labels)

pred_results = np.asarray(predictions_score)
pred_labels = np.argmax(pred_results, axis=1)

## Users whose two scores are exactly equal get no label (-1).
tied_scores = pred_results[:, 0] == pred_results[:, 1]
pred_labels[tied_scores] = -1
cnt = int(np.sum(tied_scores))
print('#inferred equally: {}'.format(cnt))


## Sanity check: share of the labeled users whose propagated label equals
## their known label.
counter = 0
for i in range(len(labeled_labels)):
    if pred_labels[i] == labeled_labels[i]:
        counter += 1

print("{} remained same!".format(float(counter)/len(labeled_users)))

## Collect the scores: labeled users keep their known label as a hard 0/1
## score, unlabeled users get the propagated scores.
inferred_scores = {}
for i, user_id in enumerate(all_users):
    leaning = users_leanings[user_id]
    if leaning != -1:
        if leaning == 0:
            inferred_scores[user_id] = {'left': 1, 'right': 0}
        elif leaning == 1:
            inferred_scores[user_id] = {'left': 0, 'right': 1}
    else:
        inferred_scores[user_id] = {'left': pred_results[i][0], 'right': pred_results[i][1]}


## Users of the connection list that are not in the graph cannot receive a
## propagated score: known labels are kept, unknown ones get a zero score.
diff_users = list(set(connection_list.keys()).difference(set(G.nodes)))
for user_id in diff_users:
    if users_leanings[user_id] == 0:
        inferred_scores[user_id] = {'left': 1, 'right': 0}
    elif users_leanings[user_id] == 1:
        inferred_scores[user_id] = {'left': 0, 'right': 1}
    elif users_leanings[user_id] == -1:
        inferred_scores[user_id] = {'left': 0, 'right': 0}


## Write inside a context manager so the file is flushed and closed.
inferred_scores_path = 'data/social_media/{}/ea_users_inferred_leanings_scores.pkl'.format(campaign)
with open(inferred_scores_path, 'wb') as scores_file:
    pickle.dump(inferred_scores, scores_file)

