# Jupyterlab version

# Extract and process data into nodes and edges

In [16]:
import pickle
import json
import gzip
from pprint import pprint
import os.path

# Import data

Assumes that the data files are in a subfolder called 'Data'

In [17]:
# Written by Alex
# unzips the file and goes through it
# 1 line is 1 post

data_path = 'Data/'  #change dir to your project folder
filename = os.path.join(data_path, 'processed.1850k.jsonl.gz')

def iterate_posts():
    with gzip.open(filename, 'rb') as fd:
      for line in fd:
          yield json.loads(line)
            
gen = iterate_posts()
#next(gen)

In [18]:
# Collect a set of all usernames found in posts
# Using posts.pickle.gz allows 1 line = 1 name
# This file breaks replies off from the parent post
# so that we don't have to worry about parsing multiple
# names in one line.
# Once we haved a list of each username from each post,
# remove duplicates to get a full list of unique usernames
from collections import Counter

filename2 = os.path.join(data_path, 'posts.pickle.gz')

single_posts = pickle.load(gzip.open(filename2, 'rb'))
nodes = []
for post in single_posts.values():
    nodes.append(post['author'].replace(" ", "_")) # Need to remove spaces in a few names
    
user_post_count = Counter(nodes)


In [19]:
top_10k = user_post_count.most_common(10001)
top_10k = top_10k[1:10001]
len(top_10k)

10000

In [20]:
nodes = [item[0] for item in top_10k]

In [39]:
top_10k

[('@ThomasFox', 537),
 ('@IamfromQ', 448),
 ('@CounterGlobalist', 408),
 ('@TommyRobinson', 383),
 ('@mitchellvii', 339),
 ('@LibertyElaine', 310),
 ('@WeLoveTrump', 305),
 ('@handsomebuster', 303),
 ('@MIZ37YandMaYh3m', 300),
 ('@epochtimes', 297),
 ('@Cherokeeowl', 256),
 ('@Auldiedog', 248),
 ('@1DRACARYS', 245),
 ('@RebelAngel78', 241),
 ('@americanshomer', 240),
 ('@ihvurbk2', 234),
 ('@Juanita7', 225),
 ('@Murphycat2012', 224),
 ('@BoroncarbideBonerMan', 221),
 ('@ShadowOfDeath', 218),
 ('@HelloFranklin', 217),
 ('@Washingtonsangeloflight', 216),
 ('@DCROKIT', 210),
 ('@Viper8', 208),
 ('@BuckeyeSIGI', 207),
 ('@Hrothenb', 206),
 ('@Tireshoptutoring', 200),
 ('@Vasilievich', 198),
 ('@GenZConservative1', 197),
 ('@IMPatriotRU', 188),
 ('@Terriehellman1', 187),
 ('@cjtruth', 186),
 ('@Lindas123', 186),
 ('@ViolentVixen29', 185),
 ('@Jennie33', 185),
 ('@Cjames9702', 185),
 ('@EllieBOfficial', 184),
 ('@RedLiner', 177),
 ('@Glorph', 175),
 ('@LCandy', 175),
 ('@candycarboni', 175),

SyntaxError: EOL while scanning string literal (<ipython-input-38-b763f69c4f4d>, line 1)

In [36]:
nodes[:100]

['@ThomasFox',
 '@IamfromQ',
 '@CounterGlobalist',
 '@TommyRobinson',
 '@mitchellvii',
 '@LibertyElaine',
 '@WeLoveTrump',
 '@handsomebuster',
 '@MIZ37YandMaYh3m',
 '@epochtimes',
 '@Cherokeeowl',
 '@Auldiedog',
 '@1DRACARYS',
 '@RebelAngel78',
 '@americanshomer',
 '@ihvurbk2',
 '@Juanita7',
 '@Murphycat2012',
 '@BoroncarbideBonerMan',
 '@ShadowOfDeath',
 '@HelloFranklin',
 '@Washingtonsangeloflight',
 '@DCROKIT',
 '@Viper8',
 '@BuckeyeSIGI',
 '@Hrothenb',
 '@Tireshoptutoring',
 '@Vasilievich',
 '@GenZConservative1',
 '@IMPatriotRU',
 '@Terriehellman1',
 '@cjtruth',
 '@Lindas123',
 '@ViolentVixen29',
 '@Jennie33',
 '@Cjames9702',
 '@EllieBOfficial',
 '@RedLiner',
 '@Glorph',
 '@LCandy',
 '@candycarboni',
 '@Sal72H',
 '@Cobrarick98',
 '@chucknellis',
 '@Ediekiss',
 '@UnsilentMajority2021',
 '@JorjalexSTOPTHESTEAL',
 '@Aliceitstime0817',
 '@ConservativeLibrarian',
 '@Dbongino',
 '@gnewsorg',
 '@BNelms',
 '@NeverForget162021',
 '@AmericanConservativesNews',
 '@Onebadolepuddycat',
 '@TRUMP

In [23]:
def get_list_of_users(post):
    """ 
    Extract all usernames from each post

    Note:
        There can be None, one, or more usernames in a post.
     
     Args:
         post (dict): Yielded by iterate_posts()

    Attributes:
        users (dict): Used to collect usernames 
        
    Returns:
        users: Tuple of usernames from each post. Can contain any number of usernames.
    """    

    users = []
    try:
        for post_item in post['posts']:                
            users.append(post_item['author_username'].replace(" ", "_"))
    except KeyError:
        if len(users)>1:
            return tuple(users) #list of usernames from ONE(1) post
        else:
            return None

In [24]:
# Collect a list of tuples of of length 2 or more usernames from posts
# Filters out lists of None and lists with only 1 user. This is why 'minimum_posts' of 2 is unneeded.
# The exception is triggered by reaching the end of the JSONL file.
def get_edges():
    user_tuple_list=[]
    edges_temp=[]

    try:
        while True:
            user_tuple_list.append(get_list_of_users(next(gen))) 
    except:  
        user_tuple_list = list(set(user_tuple_list))
        for user_list in user_tuple_list:
#            if user_list != None and len(user_list)>1:
            if user_list != None:
                edges_temp.append(user_list)
            else:
                continue
        print("Complete")
        return edges_temp

In [25]:
def split_edges(all_edges):
    """  
    Split tuples into tuples of length 2 and remove duplicates.

    Args:
        all_edges (list of tuples): These tuples vary in length.

    Attributes: 
        user_list (list): Keeps tuples generated/passed by for loop.
        
    Returns:
        user_list (list of tuples): Each tuple is of length 2 and duplicates are removed.
        
    """

    user_list = []
    
    for user_tuple in all_edges:
        if len(user_tuple)==2:
            user_list.append(user_tuple)
        else:
            for username in user_tuple:
                if username!=user_tuple[0]:
                    user_list.append((user_tuple[0],username))
                else:
                    continue
            continue
    return list(set(user_list))

In [26]:
def remove_low_posters(edges):
    temp_list = []
    for username_tuple in edges:
        name_list = []
        for username in username_tuple:
            if username in nodes:
                name_list.append(username)
            else:
                pass
        if len(name_list)>1:
            temp_list.append(tuple(name_list))
        else:
            pass
    return temp_list

In [27]:
all_edges = get_edges()
print(f'all_edges is a {type(all_edges)} of {type(all_edges[0])} containing {type(all_edges[0][0])}')
len(all_edges)

Complete
all_edges is a <class 'list'> of <class 'tuple'> containing <class 'str'>


141016

In [28]:
# Make all elements of edges to be tuples of length 2 with duplicates removed.
edges = split_edges(all_edges)
len(edges)

229235

In [29]:
high_posters = remove_low_posters(edges)
len(high_posters)

In [34]:
len(high_posters)

61738

In [30]:
high_posters[:10]

[('@IamfromQ', '@StayAmerica'),
 ('@MrMayor', '@Rtchavez'),
 ('@BenMyers1262', '@Dalesloc'),
 ('@realwayneroot', '@Jhanco1'),
 ('@CitizenFreePress', '@Mdpope1'),
 ('@realwayneroot', '@FistfulofTrumpism'),
 ('@KPatriot21', '@RepEricSwalwell'),
 ('@Shariwhittyphi', '@adirondackersforamerica'),
 ('@TheNewAmericanConservative', '@Heathermillertime'),
 ('@Linkinbio', '@Pjcali')]

In [35]:
# save the nodes and edges to a file
# TODO: Write code to read in the files to the appropriate variables so that we don't have to run all of the above code again
graph_path = 'Graph/'

open(os.path.join(graph_path,'edges.txt'), 'w').write('\n'.join('%s %s' % x for x in high_posters))
#open(os.path.join(graph_path,'edges with low posters removed.txt'), 'w').write('\n'.join('%s %s' % x for x in high_posters))
open(os.path.join(graph_path,'nodes.txt'), 'w').write('\n'.join('%s' % x for x in nodes))

132794