In [1]:
from glob import glob
import pandas as pd
import networkx as nx
import pickle
import os
from dateutil import parser
from collections import defaultdict, Counter
import re
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt
from util.util import load, cache
from util.streamer import line_gen
from datetime import datetime
import numpy as np

# Don't condense large numbers to scientific notation. The fully qualified
# option name is used because pandas resolves abbreviated keys by pattern
# matching, which can break if another option with a similar name exists.
pd.set_option('display.float_format', '{:f}'.format)

# JSON-lines input files. Sorted because glob() returns paths in an
# arbitrary, filesystem-dependent order.
filenames = sorted(glob("data/BTC/json/*.jsonl"))

# Shared NLTK WordNet lemmatizer instance, created once for the notebook.
# NOTE(review): not referenced by any of the cells visible here.
lemmatizer = WordNetLemmatizer()


In [2]:
# Running totals; initialised here and not updated in the cells shown.
total_posts = total_reple = 0

# Container for everything the preprocessing loop produces. Per-user fields
# are filled in by the preprocessing loop, keyed by the poster's id.
preprocessed_data = {
    # directed reply graph
    'user_network': nx.DiGraph(),
    # Retweets live in their own graph so they can be displayed separately;
    # the final graph can be created by composing the two graphs.
    'retweet_user_network': nx.DiGraph(),

    # posts grouped by timestamp, by user and timestamp, and by user
    'time_posts': defaultdict(list),
    'user_time_posts': defaultdict(dict),
    'user_posts': defaultdict(list),
    # dict of dicts: indexed by post user, then by neighbour id;
    # composed of comments/retweets
    'user_comments': defaultdict(dict),
    'user_posts_num': defaultdict(int),
    'get_comment_num': defaultdict(int),
    'write_comment_num': defaultdict(int),

    # various Twitter metadata
    'verified': defaultdict(bool),
    'retweet_count': defaultdict(list),
    'followers_count': defaultdict(int),
    'favorite_count': defaultdict(int),

    # flat list of every kept post
    'posts': [],
}

# vocabulary and corpus-wide token counts
voca, word_freq = set(), Counter()


In [3]:

# 400,000 tweets is the max my machine can load into RAM (total dataset is
# about 16 M), so the frame comes from the project's `load` helper instead of
# being rebuilt from the raw files:
# df = pd.concat([pd.read_json(filename, nrows=n_tweets/len(filenames), lines=True) for filename in filenames])

# Load and order by the replied-to user id in one expression, so no second
# reference to the unsorted frame is kept alive.
df = load('btc_data').sort_values(by='in_reply_to_user_id')

# Parse the `created_at` strings into timezone-aware datetime objects.
created_at_format = '%a %b %d %H:%M:%S %z %Y'
df['created_at'] = pd.to_datetime(df['created_at'], format=created_at_format)

In [25]:
# Show, untruncated, every row whose `id` equals one hard-coded value.
rows_with_id = df[df['id'] == 964257866914857000]
print(rows_with_id.to_string())

                                                                                                                                                                                                                                                                      full_text                created_at        in_reply_to_user_id  retweet_count  favorite_count                  id  verified  followers_count  retweet_id
2917528                                                                                                                                               [jason, facebook, kill, myspace, myspace, killed, myspace, believer, last, mover, first, mover, bitcoin, network, effect] 2019-05-28 00:02:48+00:00                3840.000000              0               1  964257866914857000     False             4051         NaN
2870983                                                                                                                                                                   

In [15]:
# NOTE(review): leftover scratch cell. As written, str.join() is called with no
# argument and raises a TypeError when executed. The recorded traceback talks
# about list items instead, so it appears to come from an earlier version of
# this cell. Consider deleting the cell so the notebook runs top to bottom.
' '.join()

TypeError: sequence item 0: expected str instance, list found

In [4]:
# A row is a top-level post when it has no `in_reply_to_user_id`; every other
# row is counted as a comment.
is_top_level = df['in_reply_to_user_id'].isna()
print('posts: ', len(df[is_top_level]))
print('comments: ', len(df[~is_top_level]))

posts:  2841888
comments:  536494


In [5]:
# date_counts = {}

# for row in df.itertuples():
#     if not row.created_at.year in date_counts:
#         date_counts[row.created_at.year] = 1
#     else:
#         date_counts[row.created_at.year] += 1

# print(date_counts)
# plt.plot(sorted(date_counts.keys()), sorted(date_counts.values()))

In [6]:
# Display the column labels of the loaded DataFrame.
df.columns

Index(['full_text', 'created_at', 'in_reply_to_user_id', 'retweet_count',
       'favorite_count', 'id', 'verified', 'followers_count', 'retweet_id'],
      dtype='object')

In [7]:
# word_freq_file_name = 'pkl/word_freq_pkl' 

# # load cached result. Important: Make sure the number of tweets used is the same in the df!
# if os.path.exists(word_freq_file_name):
#     with open(word_freq_file_name, 'rb') as f:
#         word_freq = pickle.load(f)
# else:

#     with open(word_freq_file_name, 'wb') as f:
#         pickle.dump(word_freq, f)

# Rebuild the corpus-wide token counts from scratch. `word_freq` is first
# created in an earlier cell, so without this reset every re-run of this cell
# would add the same counts again and distort the frequency threshold that the
# preprocessing loop applies later.
word_freq = Counter()

# number of rows, computed once for the progress indicator
n_rows = len(df)

for i, text in enumerate(df["full_text"]):

    # lightweight progress indicator, rewritten in place via '\r'
    if i % 10000 == 0:
        print(f"\rposts: {i/n_rows * 100:.2f}% done", end='')

    # Counter.update counts the elements of `text`; the rows printed earlier
    # in the notebook hold `full_text` as a list of tokens.
    word_freq.update(text)

print(f"\rposts: {100:.2f}% done", end='')
print()
print("Number of unique words:", len(word_freq))

posts: 100.00% done
Number of unique words: 541359


In [8]:
# Build the per-user post/comment structures and the two interaction graphs,
# then pickle the result.
#
# For every top-level post (a row without `in_reply_to_user_id`) this cell:
#   * records the poster's metadata,
#   * keeps the post only if enough tokens survive the frequency filter,
#   * links replies as edges comment_user -> post_user in 'user_network',
#   * links retweets as edges retweet_user -> post_user in
#     'retweet_user_network'.
#
# NOTE(review): `post.id` is used as the user key and is compared against
# `in_reply_to_user_id` and `retweet_id`; confirm that this column really
# identifies the author rather than the individual tweet.

# Keys (stringified ids) for which replies / retweets were already linked.
comments_processed_dict = {}
retweets_processed_dict = {}

counter = 0

voca = set()

# a token is kept only if it occurs at least this often in the whole corpus
MIN_WORD_FREQ = 10
# a post/comment is kept only if at least this many tokens survive the filter
MIN_TOKENS = 5

# Select the top-level posts once instead of re-filtering the full frame for
# every progress update.
posts_df = df[df['in_reply_to_user_id'].isna()]
n_posts = len(posts_df)

# Only loop through posts. Comments are picked up by looking for rows that
# reply to the current poster's id.
# (`i` is deliberately kept as the index name: a later cell displays it.)
for i, post in enumerate(posts_df.itertuples()):

    post_user = post.id

    # add metadata (recorded even if the post itself is filtered out below)
    preprocessed_data['verified'][post_user] = post.verified
    preprocessed_data['followers_count'][post_user] = post.followers_count
    preprocessed_data['favorite_count'][post_user] = post.favorite_count

    # progress indicator; the second print overwrites the first on the same line
    if i % 5000 == 0:
        print(f"\r{i/n_posts * 100:.2f}% done", end='')
        print(f"\r{preprocessed_data['user_network']}", end='')

    # drop rare tokens, then drop posts that became too short
    post_body = [w for w in post.full_text if word_freq[w] >= MIN_WORD_FREQ]

    if len(post_body) < MIN_TOKENS:
        continue

    voca.update(post_body)

    posted_time = post.created_at

    # Retweet data: 'retweet_count' is a defaultdict(list), so appending is
    # enough. The previous membership test compared a timestamp against the
    # stored counts and therefore replaced the list on every post.
    preprocessed_data['retweet_count'][post_user].append(post.retweet_count)

    preprocessed_data['user_posts'][post_user].append(post_body)
    preprocessed_data['user_posts_num'][post_user] += 1
    preprocessed_data['time_posts'][posted_time].append(post_body)
    if posted_time in preprocessed_data['user_time_posts'][post_user]:
        preprocessed_data['user_time_posts'][post_user][posted_time].append(post_body)
    else:
        preprocessed_data['user_time_posts'][post_user][posted_time] = [post_body]

    preprocessed_data['posts'].append(post_body)

    # Replies (and, because of this `continue`, retweets too) are linked only
    # the first time a given id is seen with at least one reply.
    if str(post_user) in comments_processed_dict:
        continue

    # Link comments: every row replying to this id.
    # NOTE(review): this scans the whole DataFrame once per post.
    post_comments = df[df['in_reply_to_user_id'] == post_user]

    for comment in post_comments.itertuples():

        comments_processed_dict[str(post_user)] = True

        comment_body = [w for w in comment.full_text if word_freq[w] >= MIN_WORD_FREQ]
        if len(comment_body) < MIN_TOKENS:
            continue

        comment_user = comment.id
        comment_time = comment.created_at

        # Skip self-loops
        if comment_user == post_user:
            continue

        # add comment indexed by id
        preprocessed_data['user_comments'][post_user][comment_user] = comment_body

        voca.update(comment_body)

        preprocessed_data['user_posts'][comment_user].append(comment_body)
        preprocessed_data['time_posts'][comment_time].append(comment_body)
        # File the comment under its own author and timestamp. The previous
        # code re-appended the parent post under the post's author here, which
        # duplicated the post once per comment and never recorded the comment.
        if comment_time in preprocessed_data['user_time_posts'][comment_user]:
            preprocessed_data['user_time_posts'][comment_user][comment_time].append(comment_body)
        else:
            preprocessed_data['user_time_posts'][comment_user][comment_time] = [comment_body]

        preprocessed_data['posts'].append(comment_body)
        preprocessed_data['user_network'].add_edge(comment_user, post_user)
        preprocessed_data['get_comment_num'][post_user] += 1
        preprocessed_data['write_comment_num'][comment_user] += 1

    # Link retweets.
    #
    # Currently only going to draw a connection on the graph without doing
    # anything extra like adding the text to the posts, as it would just be
    # redundant information.
    if str(post_user) in retweets_processed_dict:
        continue

    post_retweets = df[df['retweet_id'] == post_user]

    for retweet in post_retweets.itertuples():

        retweets_processed_dict[str(post_user)] = True

        retweet_user = retweet.id

        # Skip self-loops
        if retweet_user == post_user:
            continue

        # add comment indexed by id (content is the original post because a
        # retweet carries the same text)
        preprocessed_data['user_comments'][post_user][retweet_user] = post_body

        # add this to retweet network
        preprocessed_data['retweet_user_network'].add_edge(retweet_user, post_user)

# release the filtered copy of the frame; it is no longer needed
del posts_df

print(f"\r{100:.2f}% done", end='')
print("\npickling...")

voca = list(voca)
preprocessed_data['voca'] = voca
preprocessed_data['word_freq'] = word_freq

# make sure the output directory exists before writing the pickle
os.makedirs("pkl", exist_ok=True)
with open("pkl/preprocessed_bitcoin_noloops.pkl", 'wb') as f:
    pickle.dump(preprocessed_data, f)

print("Number of unique vocabulary words", len(voca))
print()
print("finished")



100.00% done 91587 nodes and 241040 edges
pickling...
Number of unique vocabulary words 61153

finished


In [9]:
# Display the loop index `i` left over from the most recently executed
# enumerate loop (this relies on the loop variable leaking into the notebook
# namespace).
i

2841887

In [10]:
# Number of rows without an `in_reply_to_user_id`, i.e. top-level posts.
no_reply_target = df['in_reply_to_user_id'].isna()
len(df[no_reply_target])

2841888

In [11]:
# Print the one-line summary (node and edge counts) of the reply graph.
print(preprocessed_data['user_network'])

DiGraph with 91601 nodes and 241100 edges


In [12]:
# nx.draw_shell(preprocessed_data["user_network"], with_labels=True)


In [13]:
# b = [len(a) for a in list(preprocessed_data['user_network'].adj.values())]
# print(max(b))
