In [1]:
from glob import glob
import pandas as pd
import networkx as nx
import pickle
import os
'''
Though not explicitly stated, I believe this "stopwords" package originates
from this pip package: https://pypi.org/project/stop-words/

It was the first result I found that shares the same name and get_stop_words
function call. 
'''
import stop_words
from dateutil import parser
from collections import defaultdict, Counter
import re
from nltk.stem import WordNetLemmatizer
import matplotlib.pyplot as plt



# don't condense large numbers to scientific notation
pd.set_option('float_format', '{:f}'.format)

filenames = glob("data/BTC/*.csv")
stopwords = set(stop_words.get_stop_words('en'))
stopwords.update(['quote', 'pmquote', 'amquote', 'just', 'don', 'one', 'thing', 'even', 'way', 'maybe', 'also', 'please', 'well', 'actually', 'something',
                                         'going', 'anything', 'le', 'ever', 'say', 'see', 'likely', 'per', 'another', 'someone', 'let', 'anyone', 'doesn', 'include', 'doe', 'exactly',
                                         'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', 'like',
                                         'said', 'guy', 'will', 'can', 'able', 'people', 'become', 'tell', 'hey', 'much', 'many', 'lol', 'lot', 'want', 'still', 'really', 'think', 'didn',
                                         'isn', 'post', 'edited', 'share', 'facebookshare', 'twitter'])
                                        # adding exclusions to 'bitcion' because presumably, the entire dataset contains
                                        # bitcoin-related tweets, which means mentions of bitcoin shouldn't add any value
lemmatizer = WordNetLemmatizer()


In [2]:
'''
Current preprocessing protocol

Remove:
Hyperlinks
Characters not in the English alphabetical character set a-z or A-Z
stopwords
words less than 3 characters
'bitcoins' and replace with 'bitcoin'

'''
def parse_string(input_string):
    input_string = input_string.lower()
    # remove links
    input_string = re.sub(r'http\S+', ' ', input_string)
    input_string = re.sub(r'\S+.(com|org)', '', input_string)
    # remove all non-English alphabet characters including numbers,
    # foreign and special characters
    input_string = re.sub( "[^a-zA-Z]", " ", input_string).split()

    # lemmatize word
    words = [lemmatizer.lemmatize(w) for w in input_string]
    # get rid of stopwords and words less than 3 characters
    words = [w for w in words if w not in stopwords and len(w) > 2]

    # fix common misspellings of bitcoin
    words = ["bitcoin" if w == "bitcoins" else w for w in words]
    words = ["bitcoin" if w == "itcoin" else w for w in words]

    return words


In [3]:
total_posts = 0
total_reple = 0
preprocessed_data = {}
preprocessed_data['user_network'] = nx.DiGraph()

# a dictionary of user's posts, their time, quantity
# accessed via preprocessed_data[field][user_name]
preprocessed_data['time_posts'] = defaultdict(list)
preprocessed_data['user_time_posts'] = defaultdict(dict)
preprocessed_data['user_posts'] = defaultdict(list)
preprocessed_data['user_posts_num'] = defaultdict(int)
preprocessed_data['get_comment_num'] = defaultdict(int)
preprocessed_data['write_comment_num'] = defaultdict(int)
preprocessed_data['posts'] = []
voca = set()
word_freq = Counter()


fields = ['user_name', 'created_at', 'text', 'id', 'in_reply_to_screen_name']

df = pd.concat([pd.read_csv(filename, nrows=None, usecols=fields) for filename in filenames])

# drop null usernames and text
df = df[df['user_name'].notna()]
df = df[df['created_at'].notna()]
df = df[df['text'].notna()]

df.reset_index(inplace=True)
df.sort_values("user_name", inplace=True)

# twitter usernames are not case sensitive so convert everything to lowercase
df['user_name'] = df['user_name'].str.lower()
df['in_reply_to_screen_name'] = df['in_reply_to_screen_name'].str.lower()
df['text'] = df['text'].str.lower()

if len(df[df.index.duplicated()]) > 0:
    print("duplicate indices found! Something went wrong.")

orig_n_edges = len(df[df['in_reply_to_screen_name'].notna()])
orig_n_nodes = len(df[df['in_reply_to_screen_name'].isna()]['user_name'].unique())
orig_n_posts = len(df[df['in_reply_to_screen_name'].isna()])
n_deduplicated = len(df) - (orig_n_edges + orig_n_nodes)

print(f"dataframe size {len(df)}")
print(f"Number of edges {orig_n_edges}")
print(f"Number of unique nodes {orig_n_nodes}")
# show the number of nodes that were not double counted as a result of using 'unique'
print(f"Nodes de-duplicated {n_deduplicated}")
# percentage of nodes removed. 0% would mean every user only has one post. 
# This number trends towards 100% as the ratio of posts to users increases
print(f"De-duplication percentage {100 * n_deduplicated / orig_n_posts:.2f}%")

dataframe size 4326298
Number of edges 823069
Number of unique nodes 296975
Nodes de-duplicated 3206254
De-duplication percentage 91.52%


In [4]:
# show counts for number of posts for each user, some range in the thousands
post_freq_dist = df[df['in_reply_to_screen_name'].isna()].groupby('user_name').count().sort_values('id')["index"]

# it appears that most nodes are identified by a single post
print(f"mean {post_freq_dist.mean()}")
print(f"median {post_freq_dist.median()}")
print(f"mode {post_freq_dist.mode()[0]}")
print()
print(f"min {post_freq_dist.min()}")
print(f"max {post_freq_dist.max()}")

mean 11.796376799393888
median 1.0
mode 1

min 1
max 43886


In [5]:
'''
Construct a new dataframe from the existing data in order to construct our digraph. The old
code assumes that for each post, there is a number of comments directly linked to that post.
For this dataset, however, each comment is linked to a user, which is still enough info to 
construct the graph, but does not allow us to process it in the same manner. 

The code below, therefore will link together user_name and a set of comments. 
Importantly, if the user the comments are directed towards doesn't exist, their total 
comments will be zero, as we can derive nothing from them. 
'''

user_comments_dict = {}

counter = 0

# create dictionary which maps usernames to a set of comment indices
# start by looping through comments
for index, user_name in df["in_reply_to_screen_name"].items(): 

    if counter % 10000 == 0:
        print(f"\r{counter/len(df) * 100:.2f}% done", end='')

    counter += 1

    # skip nan values
    if type(user_name) == float:
        continue

    user_loc = df['user_name'].searchsorted(user_name)

    # the user this comment is pointing to doesn't have a post
    if not(user_name in df.iloc[user_loc]['user_name']):
        continue

    if user_name in user_comments_dict:
        user_comments_dict[user_name].append(df.loc[index])
    #  user has comments, init and add it here
    else:
        user_comments_dict[user_name] = [df.loc[index]]



# drop all users without any comments pointing to them
df = df[df['user_name'].isin(user_comments_dict.keys())]

# remove comments, as they're already listed in the user comments dict
df = df[df['in_reply_to_screen_name'].isna()]

n_commented_nodes = len(user_comments_dict)
n_commented_posts = sum(df.groupby('user_name').count().sort_values('id')["index"])

print(f"\r{100:.2f}% done", end='')

print()
print()
print(f"Number of commented nodes {n_commented_nodes}")
print()
print(f"Number of commented posts {n_commented_posts}")
print(f"Number of edges {sum([len(vals) for vals in user_comments_dict.values()])}")
print()
print(f"Total node utilization {100 * n_commented_nodes / orig_n_nodes:.2f}%")
print(f"Total post utilization {100 * n_commented_posts / orig_n_posts:.2f}%")

100.00% done

Number of commented nodes 2771

Number of commented posts 118474
Number of edges 52963

Total node utilization 0.93%
Total post utilization 3.38%


In [6]:
# word_freq_file_name = 'pkl/word_freq_pkl' 

# # load cached result. Important: Make sure the number of tweets used is the same in the df!
# if os.path.exists(word_freq_file_name):
#     with open(word_freq_file_name, 'rb') as f:
#         word_freq = pickle.load(f)
# else:

#     with open(word_freq_file_name, 'wb') as f:
#         pickle.dump(word_freq, f)

# update word count
for i, text in enumerate(df["text"]):

    if i % 10000 == 0:
        print(f"\rposts: {i/len(df) * 100:.2f}% done", end='')
        
    text_body = parse_string(text)
    word_freq.update(text_body)

print(f"\rposts: {100:.2f}% done", end='')
print()

i = 0

for comments in user_comments_dict.values():
    if i % 3000 == 0:
        print(f"\rcomments: {i/len(user_comments_dict) * 100:.2f}% done", end='')
        
    for comment in comments:
        word_freq.update(parse_string(comment.text))
        i += 1

print(f"\rcomments: {100:.2f}% done", end='')
print()
print("Number of unique words:", len(word_freq))

posts: 100.00% done
comments: 100.00% done
Number of unique words: 50481


In [7]:
user_processed_dict = {}

counter = 0

voca = set()

min_post_len = 5
min_comment_len = 5
min_word_freq = 10

# counters for how many posts were removed from each source
n_removed_posts = 0
n_removed_comments = 0

# only loop through posts. Comments will be counted if the user is found
# to be in the comments_user_dict 
for row in df.itertuples():

    # absolute index, not implicit
    index = row.Index

    if counter % 5000 == 0:
        print(f"\r{counter/len(df) * 100:.2f}% done", end='')

    counter += 1

    post_body = parse_string(row.text)
    post_body = [w for w in post_body if word_freq[w] >= 10]
    
    # skip posts of four words or less
    if len(post_body) < min_post_len:
        n_removed_posts += 1
        continue

    voca.update(post_body)
    post_user = row.user_name
    posted_time = parser.parse(row.created_at).date()
    
    preprocessed_data['user_posts'][post_user].append(post_body)
    preprocessed_data['user_posts_num'][post_user] += 1
    preprocessed_data['time_posts'][posted_time].append(post_body)

    # also add the same post data to a dictionary accessible by
    # the user_name and time
    if posted_time in preprocessed_data['user_time_posts'][post_user]:
        preprocessed_data['user_time_posts'][post_user][posted_time].append(post_body)
    else:
        preprocessed_data['user_time_posts'][post_user][posted_time] = [post_body]
    
    preprocessed_data['posts'].append(post_body)

    # comments already processed for this user
    if post_user in user_processed_dict:
        continue

    # loop through the comments for the post and construct the digraph
    for comment in user_comments_dict[post_user]:

        user_processed_dict[post_user] = True

        # skip comments on own post
        if comment.user_name == post_user:
            continue

        comment_body = parse_string(comment.text)
        comment_body = [w for w in comment_body if word_freq[w] >= min_word_freq]
        
        if len(comment_body) < min_comment_len:
            n_removed_comments += 1
            continue

        voca.update(comment_body)
        comment_user = comment.user_name
        comment_time = parser.parse(comment.created_at).date()
            
        preprocessed_data['user_posts'][comment_user].append(comment_body)
        preprocessed_data['time_posts'][comment_time].append(comment_body)

        if posted_time in preprocessed_data['user_time_posts'][post_user]:
            preprocessed_data['user_time_posts'][post_user][posted_time].append(post_body)
        else:
            preprocessed_data['user_time_posts'][post_user][posted_time] = [post_body]
        
        preprocessed_data['posts'].append(comment_body)
        preprocessed_data['user_network'].add_edge(comment_user, post_user)
        preprocessed_data['get_comment_num'][post_user] = len(user_comments_dict[post_user])
        preprocessed_data['write_comment_num'][comment_user] += 1
            

print(f"\r{100:.2f}% done", end='')
print("\npickling...")

voca = list(voca)
preprocessed_data['voca'] = voca
preprocessed_data['word_freq'] = word_freq
with open("pkl/preprocessed_bitcoin.pkl", 'wb') as f:
    pickle.dump(preprocessed_data, f)

print("Number of unique vocabulary words", len(voca))
print(f"Removed {n_removed_posts} posts")
print(f"Removed {n_removed_comments} comments")
print()
print("finished")



100.00% done
pickling...
Number of unique vocabulary words 8984
Removed 8341 posts
Removed 7217 comments

finished


In [8]:
# nx.draw_shell(preprocessed_data["user_network"], with_labels=True)


In [9]:
b = [len(a) for a in list(preprocessed_data['user_network'].adj.values())]
print(max(b))


27
