In [1]:
from sklearn.decomposition import NMF
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import dok_matrix
from math import log10
import numpy as np
import networkx as nx
import pickle

In [2]:
# Optional cap on the number of posts that are analysed (speeds up
# experimentation); set to None to keep every post.
post_num = 10000

# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
with open("preprocessed_bitcoin.pkl", 'rb') as f:
    data = pickle.load(f)

# Vocabulary, plus the reverse lookup from each word to its position in it.
voca = data['voca']
voca2idx = {word: idx for idx, word in enumerate(voca)}

# NOTE(review): only data['posts'] is shortened here; data['user_posts'] is
# left untouched, so per-user analyses still see every post -- confirm that
# this is intended.
if post_num is not None:
    data['posts'] = data['posts'][:post_num]

In [3]:
# Score every user with the HITS algorithm on the user network; the second
# value returned by nx.hits is kept as the per-user score.  The PageRank
# variant is left here for reference:
# user_score = nx.pagerank(data['user_network'], tol=1e-8, max_iter=200)
hubs, user_score = nx.hits(data['user_network'], max_iter=500)
total_user_num = len(data['user_network'].nodes())
top_users = sorted(user_score, key=user_score.get, reverse=True)

# Write one TSV row per user, best-ranked first.  The context manager makes
# sure the file is flushed and closed even if one of the per-user lookups
# below raises (e.g. a user missing from one of the count dictionaries).
with open('hits_user_score_post_num.tsv', 'w') as fwrite:
    fwrite.write("id\tlog id\tuser\tscore\tpost num\tget comment num\twrite comment num\n")
    for i, user in enumerate(top_users):
        fwrite.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
            i + 1, log10(i+1), user, user_score[user], data['user_posts_num'][user],
            data['get_comment_num'][user], data['write_comment_num'][user]))
# The total number of posts differs from (top-user posts + low-user posts)
# because some posts belong to users who never interacted with anyone.

In [22]:
# Number of users that received a score from nx.hits.
len(user_score)

4441

In [4]:
# Construct the term-frequency matrix for the entire community (one row per
# post, one column per vocabulary word); topic modelling is run on it in the
# following cell.
total_len = len(data['posts'])

tdm = dok_matrix((total_len, len(voca)), dtype=np.float32)
for i, post in enumerate(data['posts']):

    # `i` already counts the posts processed so far, so it doubles as the
    # progress counter.
    if i % 30000 == 0:
        print(f"\r{i/total_len * 100:.2f}% done", end='')

    for word in post:
        tdm[i, voca2idx[word]] += 1

# Terminate the progress line with a newline so that the shape printed
# below is not glued onto it.
print(f"\r{100:.2f}% done")

# Normalise with sklearn's normalize() using its default settings.
tdm = normalize(tdm)
# compressed sparse row matrix, where row operations are rendered more efficient
# in our case documents are stored in the rows
tdm = tdm.tocsr()
print(tdm.shape)

100.00% done(10000, 29362)


In [5]:
# Number of topics; the same K is used for every NMF model in this notebook
# so that the resulting topic matrices can be compared one-to-one.
K = 10
# random_state is pinned so that re-running the notebook reproduces the same
# topics (the solver/initialisation may otherwise vary between runs).
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W` / `alpha_H` -- check against the installed version when upgrading.
nmf = NMF(n_components=K, alpha=0.1, max_iter=500, random_state=0)
nmf.fit(tdm)
# Topic-by-word weights for the whole community.
H_total = nmf.components_



In [6]:
# Top-user analysis.
# Walk the users from the highest to the lowest HITS score while accumulating
# each user's share of the total score.  `top_index` is the rank at which the
# accumulated share first exceeds 0.8; the users ranked *before* that position
# form the "top" group (the user that crosses the threshold is not included).
ranked_users = sorted(user_score, key=user_score.get, reverse=True)
score_sum = sum(user_score.values())
print('sum', score_sum)

acc_sum = 0
top_index = 0
for i, top_user in enumerate(ranked_users):
    print(top_user)
    acc_sum += user_score[top_user]/score_sum
    if acc_sum > 0.8:
        top_index = i
        break

# Slice the ranking computed in this cell rather than a `top_users` list left
# over from an earlier cell, so the cell gives the same result when re-run.
top_users = ranked_users[:top_index]
print("n top users ->", len(top_users))

sum 0.9999999999999993
Cointelegraph
Bitcoin
CoinDesk
Bakkt
CNBC
zerohedge
crypto
WhalePanda
CryptoBull
cameron
loomdart
n top users -> 10


In [12]:
# sorted_users = [(key, user_score[key]) for key in sorted(user_score, key=user_score.get, reverse=True)]

AtlasView({'CoinSpice': {}, 'Garfoldd': {}})

In [13]:
# Flatten the posts of all top users into one list, user by user in
# `top_users` order.
user_posts = [
    post
    for user in top_users
    for post in data['user_posts'][user]
]

In [14]:
# Term-frequency matrix for the posts currently held in `user_posts`:
# one row per post, one column per vocabulary word.
n_docs, n_terms = len(user_posts), len(voca)
tdm = dok_matrix((n_docs, n_terms), dtype=np.float32)
for row, doc in enumerate(user_posts):
    for token in doc:
        col = voca2idx[token]
        tdm[row, col] += 1

# Normalise, then make sure the result is stored in CSR form.
tdm = normalize(tdm).tocsr()
print(tdm.shape)

(5174, 29362)


In [15]:
# Topic model for the top users' posts.  random_state is pinned so that
# re-running the notebook reproduces the same topics.
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W` / `alpha_H` -- check against the installed version when upgrading.
nmf = NMF(n_components=K, alpha=0.1, max_iter=500, random_state=0)
nmf.fit(tdm)
# Topic-by-word weights for the top-user group.
H_top = nmf.components_



In [16]:
# Low-ranked user analysis.
# The low group is everyone who is not among the first `top_index` users of
# the descending ranking, listed from the lowest score upwards.  It is built
# as an explicit complement because the previous `[:-top_index]` slice
# returns an empty list when top_index == 0, and with tied scores it is not
# guaranteed to exclude exactly the users selected as "top".
ranked_desc = sorted(user_score, key=user_score.get, reverse=True)
top_group = set(ranked_desc[:top_index])
low_users = [
    user
    for user in sorted(user_score, key=user_score.get, reverse=False)
    if user not in top_group
]

# Flatten the posts of all low-ranked users into one list.
user_posts = []
for user in low_users:
    user_posts.extend(data['user_posts'][user])

In [17]:
# Count word occurrences for every post in `user_posts`
# (rows are posts, columns are vocabulary entries).
matrix_shape = (len(user_posts), len(voca))
tdm = dok_matrix(matrix_shape, dtype=np.float32)
for post_idx, tokens in enumerate(user_posts):
    for token in tokens:
        tdm[post_idx, voca2idx[token]] += 1

# Normalise first, then convert the normalised matrix to CSR.
normalized = normalize(tdm)
tdm = normalized.tocsr()
print(tdm.shape)

(62632, 29362)


In [18]:
# Topic model for the low-ranked users' posts.  random_state is pinned so
# that re-running the notebook reproduces the same topics.
# NOTE(review): newer scikit-learn releases replace `alpha` with
# `alpha_W` / `alpha_H` -- check against the installed version when upgrading.
nmf = NMF(n_components=K, alpha=0.1, max_iter=500, random_state=0)
nmf.fit(tdm)
# Topic-by-word weights for the low-user group.
H_low = nmf.components_



In [19]:
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity
from scipy.optimize import linear_sum_assignment


def _print_topic(label, component, n_words=20):
    """Print `label`, then the `n_words` highest-weighted words of `component`.

    `component` is one row of an NMF components matrix; its positions index
    into `voca`.
    """
    print(label)
    for i in component.argsort()[::-1][:n_words]:
        print(voca[i], end=' ')
    print()


# Hungarian algorithm: match every community-wide topic with exactly one
# topic of the top-user model (and, separately, of the low-user model) such
# that the summed cosine distance of the matched pairs is minimal.  The
# distance matrices are K x K, so the returned column indices are aligned
# with the community topics 0..K-1.
top_distances = pairwise_distances(H_total, H_top, metric='cosine')
_, top_indices = linear_sum_assignment(top_distances)

low_distances = pairwise_distances(H_total, H_low, metric='cosine')
_, low_indices = linear_sum_assignment(low_distances)

top_similarity_average = 0
low_similarity_average = 0
for k in range(K):
    # Cosine similarity between community topic k and its matched topic.
    top_similarity = cosine_similarity(H_top[top_indices[k]].reshape(1, -1), H_total[k].reshape(1,-1))[0, 0]
    low_similarity = cosine_similarity(H_low[low_indices[k]].reshape(1, -1), H_total[k].reshape(1,-1))[0, 0]
    top_similarity_average += top_similarity
    low_similarity_average += low_similarity

    _print_topic(f"total users: {k}th topic", H_total[k, :])
    _print_topic(
        f"top users: {top_indices[k]}th topic, similarity - {top_similarity}",
        H_top[top_indices[k]])
    # This line used to be labelled "top users:" although it reports the
    # topic matched from the low-user model.
    _print_topic(
        f"low users: {low_indices[k]}th topic, similarity - {low_similarity}",
        H_low[low_indices[k]])

    print()

# Mean similarity of the matched topic pairs for each user group.
top_similarity_average /= K
low_similarity_average /= K

print(top_similarity_average, low_similarity_average)

total users: 0th topic
bitcoin money mining day now transaction news every get price satoshi ethereum make trump gethashed value via block future dollar 
top users: 2th topic, similarity - 0.9765135049819946
bitcoin new future trading bakkt report libra hit atm now via exchange mining launch bank wealth blockchain cryptocurrency wallet place 
top users: 0th topic, similarity - 0.993593692779541
bitcoin trump news libra future high altcoins cash bakkt gold new via network low money cointelegraph volume bank coindesk mining 

total users: 1th topic
market trading analysis report btcusd technical utc idea timeframe crypto btc cryptocurrency bitcoin bearish bullish cap ccn cryptotalksworld service place 
top users: 1th topic, similarity - 0.16515502333641052
block btc unknown transaction time value miner size supply utc byte total issued number price bitclub network called software cryptography 
top users: 4th topic, similarity - 0.28523531556129456
btcusd coinbase bitstamp bitfinex short 