# Jupyterlab version

# Extract and process data into nodes and edges

In [4]:
import pickle
import json
import gzip
from pprint import pprint
import os.path

# Import data

Assumes that the data files are in a subfolder called 'Data'

In [5]:
# Written by Alex
# unzips the file and goes through it
# 1 line is 1 post

data_path = 'Data/'  #change dir to your project folder
filename = os.path.join(data_path, 'processed.1850k.jsonl.gz')

def iterate_posts():
    with gzip.open(filename, 'rb') as fd:
      for line in fd:
          yield json.loads(line)
            
gen = iterate_posts()
#next(gen)

In [6]:
# Collect a set of all usernames found in posts
# Using posts.pickle.gz allows 1 line = 1 name
# This file breaks replies off from the parent post
# so that we don't have to worry about parsing multiple
# names in one line.
# Once we haved a list of each username from each post,
# remove duplicates to get a full list of unique usernames
from collections import Counter

filename2 = os.path.join(data_path, 'posts.pickle.gz')

single_posts = pickle.load(gzip.open(filename2, 'rb'))
nodes = []
for post in single_posts.values():
    nodes.append(post['author'].replace(" ", "_")) # Need to remove spaces in a few names
    
user_post_count = Counter(nodes)
nodes = list(set(nodes))

## Process the data
### Everything after here is platform agnostic

In [7]:
# Get a dictionary of user post frequencies by username sorted by post frequencies, descending
# The sorting isn't really necessary for the next part,
# it just makes it easier to spot check manually

sorted_user_post_count = sorted(dict(user_post_count).items(), key=lambda item:item[1], reverse=True)
sorted_user_post_count[:10]

[('@Private_User', 37703),
 ('@ThomasFox', 537),
 ('@IamfromQ', 448),
 ('@CounterGlobalist', 408),
 ('@TommyRobinson', 383),
 ('@mitchellvii', 339),
 ('@LibertyElaine', 310),
 ('@WeLoveTrump', 305),
 ('@handsomebuster', 303),
 ('@MIZ37YandMaYh3m', 300)]

In [8]:
# Get a list of users who only posted a certain number of times
# The number of posts may be adjusted to meet filtering needs

# CHANGE THIS VARIABLE TO ADJUST USER FILTERING
# If minimum posts = 2, there is no need to filter them. 
# Users with 1 post will be removed when collecting edges
minimum_posts = 10

low_posters = [user for user, freq in sorted_user_post_count if freq<minimum_posts]
len(low_posters)

173073

In [9]:
def get_list_of_users(post):
    """ 
    Extract all usernames from each post

    Note:
        There can be None, one, or more usernames in a post.
     
     Args:
         post (dict): Yielded by iterate_posts()

    Attributes:
        users (dict): Used to collect usernames 
        
    Returns:
        users: Tuple of usernames from each post. Can contain any number of usernames.
    """    

    users = []
    try:
        for post_item in post['posts']:                
            users.append(post_item['author_username'].replace(" ", "_"))
    except KeyError:
        return tuple(users) #list of usernames from ONE(1) post

In [10]:
# Collect a list of tuples of of length 2 or more usernames from posts
# Filters out lists of None and lists with only 1 user. This is why 'minimum_posts' of 2 is unneeded.
# The exception is triggered by reaching the end of the JSONL file.
def get_edges():
    user_tuple_list=[]
    edges_temp=[]

    try:
        while True:
            user_tuple_list.append(get_list_of_users(next(gen))) 
    except:  
        user_tuple_list = list(set(user_tuple_list))
        for user_list in user_tuple_list:
            if user_list != None and len(user_list)>1:
                edges_temp.append(user_list)
            else:
                continue
        print("Complete")
        return edges_temp

In [11]:
def split_edges(all_edges):
    """  
    Split tuples into tuples of length 2 and remove duplicates.

    Args:
        all_edges (list of tuples): These tuples vary in length.

    Attributes: 
        user_list (list): Keeps tuples generated/passed by for loop.
        
    Returns:
        user_list (list of tuples): Each tuple is of length 2 and duplicates are removed.
        
    """

    user_list = []
    
    for user_tuple in all_edges:
        if len(user_tuple)==2:
            user_list.append(user_tuple)
        else:
            for username in user_tuple:
                if username!=user_tuple[0]:
                    user_list.append((user_tuple[0],username))
                else:
                    continue
            continue
    return list(set(user_list))

In [12]:
'''# Remove from edges those users who only posted a few times
edges = [tup for tup in edges if not any(i in tup for i in low_posters)] '''

'# Remove from edges those users who only posted a few times\nedges = [tup for tup in edges if not any(i in tup for i in low_posters)] '

In [13]:
def remove_low_posters(edges):
    temp_list = []
    for username_tuple in edges:
        name_list = []
        for username in username_tuple:
            if username in low_posters:
                pass
            else: 
                name_list.append(username)
        temp_list.append(name_list)
    return temp_list

In [14]:
all_edges = get_edges()
print(f'all_edges is a {type(all_edges)} of {type(all_edges[0])} containing {type(all_edges[0][0])}')
len(all_edges)

Complete
all_edges is a <class 'list'> of <class 'tuple'> containing <class 'str'>


141016

In [15]:
# Make all elements of edges to be tuples of length 2 with duplicates removed.
edges = split_edges(all_edges)
len(edges)

229235

In [16]:
high_posters = remove_low_posters(edges)

In [20]:
len(high_posters)

229235

In [21]:
len(edges)

229235

In [17]:
print(f'There are {len(nodes)} individual users.')
print(f'There are {len(low_posters)} users who made less than {minimum_posts} posts who can be removed.')
print(f'That will leave us with {len(edges)} users to graph.')

There are 185207 individual users.
There are 173073 users who made less than 10 posts who can be removed.
That will leave us with 229235 users to graph.


In [19]:
# save the nodes and edges to a file
# TODO: Write code to read in the files to the appropriate variables so that we don't have to run all of the above code again
graph_path = 'Graph/'

open(os.path.join(graph_path,'edges without low posters removed.txt'), 'w').write('\n'.join('%s %s' % x for x in edges))
open(os.path.join(graph_path,'edges with low posters removed.txt'), 'w').write('\n'.join('%s %s' % x for x in high_posters))
open(os.path.join(graph_path,'nodes.txt'), 'w').write('\n'.join('%s' % x for x in nodes))

TypeError: not enough arguments for format string

In [None]:
print(f'There are {len(nodes)} nodes in the graph.')
print(f'There are {len(all_edges)} total edges, but this includes duplicates and multiple replies to one post.')
print(f'There are {len(edges)} edges once we split the posts and remove duplicates.')      