In [1]:
from glob import glob
import pandas as pd
import networkx as nx
import numpy as np
import math
import pickle
'''
Though not explicitly stated, I believe this "stopwords" package originates
from this pip package: https://pypi.org/project/stop-words/

It was the first result I found that shares the same name and get_stop_words
function call. 
'''
import stop_words
from dateutil import parser
from collections import defaultdict, Counter
import re
from nltk.stem import WordNetLemmatizer

# don't condense large numbers to scientific notation
pd.set_option('float_format', '{:f}'.format)

filenames = glob("data/BTC/*.csv")
stopwords = set(stop_words.get_stop_words('en'))
stopwords.update(['quote', 'pmquote', 'amquote', 'just', 'don', 'one', 'thing', 'even', 'way', 'maybe', 'also', 'please', 'well', 'actually', 'something',
                                         'going', 'anything', 'le', 'ever', 'say', 'see', 'likely', 'per', 'another', 'someone', 'let', 'anyone', 'doesn', 'include', 'doe', 'exactly',
                                         'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', 'like',
                                         'said', 'guy', 'will', 'can', 'able', 'people', 'become', 'tell', 'hey', 'much', 'many', 'lol', 'lot', 'want', 'still', 'really', 'think', 'didn',
                                         'isn', 'post', 'edited', 'share', 'facebookshare', 'twitter'])
lemmatizer = WordNetLemmatizer()


In [2]:
'''
Current preprocessing protocol

Remove:
Hyperlinks
Characters not in the English alphabetical character set a-z or A-Z
stopwords
words less than 3 characters
'bitcoins' and replace with 'bitcoin'

'''
def parse_string(input_string):
    input_string = input_string.lower()
    # remove links
    input_string = re.sub(r'http\S+', ' ', input_string)
    input_string = re.sub(r'\S+.(com|org)', '', input_string)
    # remove all non-English alphabet characters including numbers,
    # foreign and special characters
    input_string = re.sub( "[^a-zA-Z]", " ", input_string).split()

    # lemmatize word
    words = [lemmatizer.lemmatize(w) for w in input_string]
    # get rid of stopwords and words less than 3 characters
    words = [w for w in words if w not in stopwords and len(w) > 2]
    # change bitcoins to bitcoin
    words = [w if w != 'bitcoins' else 'bitcoin' for w in words]
    return words


Reference for Original bitcoin data format

Post {
    post_user -> name of user
    title
    posted_time
    post_body
    comments -> array of dictionaries with these fields {
        // comment fields
        post_user
        title
        posted_time
        post_body
    }
}

Unfortunately, it appears that this dataset does not include the author_id
field which uniquely identifies each user, which is necessary to see the user
a comment is directed towards. Right now in_reply_to_screen_name will have to 
do for now, but that will introduce some error with people changing usernames

<b><h2>Relevant data</h2></b>
<p>
 created_at
 favorite_count ,
 id ,
 in_reply_to_screen_name ,
 in_reply_to_user_id ,
 retweet_count ,
 retweet_id , retweet_screen_name ,
 text ,
 user_name ,
 user_statuses_count ,
 user_urls , user_verified 
</p>

<br><b>Irrelevant data</b> <i><br>
Reference for new Twitter data format
 coordinates ,
 hashtags ,
 urls ,
 place ,
 possibly_sensitive ,
 source ,
 tweet_url ,
 user_created_at ,
 user_screen_name ,
 user_default_profile_image ,
 user_description ,
 user_favourites_count ,
 user_followers_count ,
 user_friends_count ,
 user_listed_count ,
 user_location ,
</i>


<p>
Note: original paper had data segmented into posts, with an array of comments to 
iterate through, so I will attempt to do the same segmentation
</p>

In [4]:
total_posts = 0
total_reple = 0
preprocessed_data = {}
preprocessed_data['user_network'] = nx.DiGraph()

# a dictionary of user's posts, their time, quantity
# accessed via preprocessed_data[field][user_name]
preprocessed_data['time_posts'] = defaultdict(list)
preprocessed_data['user_time_posts'] = defaultdict(dict)
preprocessed_data['user_posts'] = defaultdict(list)
preprocessed_data['user_posts_num'] = defaultdict(int)
preprocessed_data['get_comment_num'] = defaultdict(int)
preprocessed_data['write_comment_num'] = defaultdict(int)
preprocessed_data['posts'] = []
voca = set()
word_freq = Counter()


# list of datapoints we don't need
drop_columns = ["coordinates",
                "coordinates",
                "hashtags",
                "urls",
                "place",
                "possibly_sensitive",
                "source",
                # "tweet_url",
                "user_created_at",
                "user_screen_name",
                "user_default_profile_image",
                "user_description",
                "user_favourites_count",
                "user_followers_count",
                "user_friends_count",
                "user_listed_count",
                "user_location",
                "in_reply_to_status_id" ]


df = pd.concat([pd.read_csv(filename) for filename in filenames])


df["user_name"].dropna(inplace=True)
df["in_reply_to_status_id"].dropna(inplace=True)
df["in_reply_to_screen_name"].dropna(inplace=True)
df["created_at"].dropna(inplace=True)

df.sort_values("user_name", inplace=True)

KeyboardInterrupt: 

In [None]:
# deprecated (slow)
"""
# loop through the dataframe and link together all of the relevant posts 
# and comments
for index, row in df.iterrows():
    row = row.copy()

    # reply to id is nan, meaning it's a post
    # set the comments equal to the ids where someone is replying to this user

    if math.isnan(row["in_reply_to_user_id"]):
        df["comments"][index] = df.loc[df["in_reply_to_screen_name"] == row["user_name"]]["id"]
        
    # omit nan values from id
    if type(df["comments"][index]) == float:
            df["comments"][index] = pd.Series(dtype=object)
"""

In [5]:
'''
Construct a new dataframe from the existing data in order to construct our digraph. The old
code assumes that for each post, there is a number of comments directly linked to that post.
For this dataset, however, each comment is linked to a user, which is still enough info to 
construct the graph, but does not allow us to process it in the same manner. 

The code below, therefore will link together user_name and a set of comments. 
Importantly, if the user the comments are directed towards doesn't exist, their total 
comments will be zero, as we can derive nothing from them. 
'''

user_comments_dict = {}
nil_users = {}

# create dictionary which maps usernames to a dictionary of 
for index, user_name in enumerate(df["in_reply_to_screen_name"]): 

    if index % 50000 == 0:
        print(f"\r{index/len(df) * 100:.2f}% done", end='')
        # print(f"\r{index} done", end='')

    # skip nan values
    if type(user_name) == float:
        continue

    # cache names of users who are not in df to save time searching for their name
    if user_name in nil_users:
        continue
    elif user_name in user_comments_dict:
        user_comments_dict[user_name].add(index)

    # if they don't have a post in the df, skip and memoize them 
    # if len(df[df["user_name"] == user_name]) == 0:
    #     nil_users[user_name] = 0
    #     continue

    # bst search index
    search_index = df["user_name"].searchsorted(user_name)

    # col 26 is user_name, remove this if statement to retain users who have comments
    # directed towards them but no actual posts.
    if not(df.iat[search_index, 26] == user_name):
        nil_users[user_name] = 0
        continue

    #  user has comments, init and add it here
    elif not(user_name in user_comments_dict):
        user_comments_dict[user_name] = set([index])


print(f"\r{100:.2f}% done", end='')

100.00% done

In [7]:
# update word count
for i, text in enumerate(df["text"]):

    if i % 10000 == 0:
        print(f"\r{i/len(df) * 100:.2f}% done", end='')
        
    text_body = parse_string(text)
    word_freq.update(text_body)

print(f"\r{100:.2f}% done", end='')


62.41% done

KeyboardInterrupt: 

In [49]:
# for ascertaining the legitimacy of the values above. Top values should be influencers,
# news sites, celebrities, etc
ordered = sorted([(key, len(val)) for key, val in user_comments_dict.items()], key=lambda e: e[1], reverse=True)
ordered

3503356
4326569


In [12]:
word_freq

Counter()

In [11]:
# dictionary to keep track of what users have already had comments added
# to them. Used to prevent double-counting. 
has_comments_added_dict = {}

counter = 0

total_len = len(df[df["in_reply_to_screen_name"].isna()])
# only loop through posts. Comments will be counted if the user is found
# to be in the comments_user_dict 
# note: takes ~8 minutes on my machine
for index, row in df[df["in_reply_to_screen_name"].isna()].iterrows():

    if counter % 5000 == 0:
        print(f"\r{counter/total_len * 100:.2f}% done", end='')
        # print(f"\r{len(preprocessed_data['posts'])} done", end='')

    counter += 1


    post_body = parse_string(row['text'])
    post_body = [w for w in post_body if word_freq[w] >= 10]
    
    if len(post_body) < 5:
        continue

    print(1)
    voca.update(post_body)
    post_user = row['user_name']
    posted_time = parser.parse(row['created_at']).date()
    
    preprocessed_data['user_posts'][post_user].append(post_body)
    preprocessed_data['user_posts_num'][post_user] += 1
    preprocessed_data['time_posts'][posted_time].append(post_body)

    # also add the same post data to a dictionary accessible by
    # the user_name and time
    if posted_time in preprocessed_data['user_time_posts'][post_user]:
        preprocessed_data['user_time_posts'][post_user][posted_time].append(post_body)
    else:
        preprocessed_data['user_time_posts'][post_user][posted_time] = [post_body]
    
    preprocessed_data['posts'].append(post_body)

    # if they don't have any comments, we can skip this next step
    if not(post_user in user_comments_dict) or post_user in has_comments_added_dict:
        continue

    print(2)

    # loop through the comments for the post and construct the digraph
    for comment_index in user_comments_dict[post_user]:

        comment = df.iloc[comment_index]

        # mark them in dict so the comments are not double-counted
        has_comments_added_dict[post_user] = 0
        
        comment_body = parse_string(comment['text'].iat[0])
        comment_body = [w for w in comment_body if word_freq[w] >= 10]
        if len(comment_body) < 5:
            continue
        voca.update(comment_body)
        comment_user = comment['user_name'].iat[0]
        comment_time = parser.parse(comment['created_at'].iat[0]).date()
            
        preprocessed_data['user_posts'][comment_user].append(comment_body)
        preprocessed_data['time_posts'][comment_time].append(comment_body)
        if posted_time in preprocessed_data['user_time_posts'][post_user]:
            preprocessed_data['user_time_posts'][post_user][posted_time].append(post_body)
        else:
            preprocessed_data['user_time_posts'][post_user][posted_time] = [post_body]
        
        preprocessed_data['posts'].append(comment_body)
        preprocessed_data['user_network'].add_edge(comment_user, post_user)
        preprocessed_data['get_comment_num'][post_user] += 1
        preprocessed_data['write_comment_num'][comment_user] += 1

        print(3)
            

print(f"\r{100:.2f}% done", end='')
print("pickling...")

voca = list(voca)
preprocessed_data['voca'] = voca
preprocessed_data['word_freq'] = word_freq
with open("preprocessed_bitcoin.pkl", 'wb') as f:
    pickle.dump(preprocessed_data, f)





0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0


KeyboardInterrupt: 