# Google Colab version
### Note: Google Colab has major issues processing this amount of data

# Extract and process data into nodes and edges

In [1]:
import pickle
import json
import gzip
from pprint import pprint
import os.path

# Import data

In [None]:
from google.colab import drive

# Mount Google Drive and build the project folder path from the same mount
# point. A relative 'gdrive/...' path only resolves while the working
# directory is /content; an absolute path keeps working after a directory change.
mount_point = '/content/gdrive'
drive.mount(mount_point)
root_path = os.path.join(mount_point, 'My Drive', 'Parler Data')  # change dir to your project folder

In [9]:
# Written by Alex
# unzips the file and goes through it
# 1 line is 1 post
filename = os.path.join(root_path, 'processed.1850k.jsonl.gz')

def iterate_posts(path=None):
    """Yield one parsed post per line of a gzipped JSON Lines file.

    Args:
        path: Path of the ``.jsonl.gz`` file to read. Defaults to the
            module-level ``filename``.

    Yields:
        The object decoded from each non-blank line (one line is one post).
    """
    if path is None:
        path = filename
    with gzip.open(path, 'rb') as fd:
        for line in fd:
            # A blank line (e.g. a stray trailing newline) is not a JSON
            # document and would make json.loads raise.
            if line.strip():
                yield json.loads(line)
            
gen = iterate_posts()
#next(gen)

In [None]:
# Collect a set of all usernames found in posts
# Using posts.pickle.gz allows 1 line = 1 name
# This file breaks replies off from the parent post
# so that we don't have to worry about parsing multiple
# names in one line.
# Once we have a list of each username from each post,
# remove duplicates to get a full list of unique usernames
from collections import Counter

filename2 = os.path.join(root_path, 'posts.pickle.gz')
# NOTE: unpickling can execute arbitrary code; only load a posts.pickle.gz
# that comes from a trusted source.
with gzip.open(filename2, 'rb') as pickle_file:  # closes the handle once loaded
    single_posts = pickle.load(pickle_file)

# Count posts per author in a single pass, without first building a list
# that holds one entry per post.
user_post_count = Counter(post['author'] for post in single_posts.values())
# The counter's keys are exactly the unique usernames.
nodes = set(user_post_count)

## Process the data
### Everything after here is platform agnostic

In [4]:
# Rank users by how many posts they made, most prolific first.
# The result is a list of (username, post_count) tuples. The ordering is not
# required by the following cells; it only makes manual spot checks easier.

sorted_user_post_count = user_post_count.most_common()
sorted_user_post_count[:10]

[('@Private User', 37703),
 ('@ThomasFox', 537),
 ('@IamfromQ', 448),
 ('@CounterGlobalist', 408),
 ('@TommyRobinson', 383),
 ('@mitchellvii', 339),
 ('@LibertyElaine', 310),
 ('@WeLoveTrump', 305),
 ('@handsomebuster', 303),
 ('@MIZ37YandMaYh3m', 300)]

In [5]:
# Build the list of users whose post count falls below a threshold.
# Raise or lower the threshold to filter more or fewer users.

# CHANGE THIS VARIABLE TO ADJUST USER FILTERING
# A threshold of 2 needs no filtering here:
# users with 1 post will be removed when collecting edges
minimum_posts = 30

low_posters = [name for name, post_total in sorted_user_post_count
               if not post_total >= minimum_posts]
len(low_posters)

182811

In [9]:
def get_list_of_users(post):
    """Collect the author usernames listed in one post record.

    Args:
        post (dict): A record yielded by iterate_posts(). Its 'posts' entry,
            when present, is iterated and each item's 'author_username' is
            collected.

    Returns:
        tuple: The usernames in the order they appear. Empty when the record
        has no 'posts' key. If an item lacks 'author_username', collection
        stops there and the usernames gathered so far are returned.
    """
    users = []
    try:
        for post_item in post['posts']:
            users.append(post_item['author_username'])
    except KeyError:
        # Incomplete record: keep whatever was collected before the gap.
        pass
    # Return on every path, so a complete record yields its usernames
    # rather than falling off the end of the function with None.
    return tuple(users)

In [None]:
'''# Collect a list of tuples of of length 2 or more usernames from posts
# The exception is triggered by reaching the end of the JSONL file.

edges_temp=[]
all_edges=[]

try:
    while True:
        print("while true")
        temp_list = get_list_of_users(next(gen))
        
        for username in temp_list:
            print("for user in list")
            if username == None:
                print('None')
                pass
            elif username in low_posters:
                print('if user in low')
                temp_list.remove(username)
                low_posters.remove(username)
            else:
                print('else')
                edges_temp.append(username)

except:  
    print("excepting")
    for user_tuple in edges_temp:
        if user_tuple != None and len(user_tuple)>1:
            all_edges.append(user_tuple)
        else:
            pass #was continue
    del edges_temp #free up memory
    print("Complete")'''

In [13]:
# Collect a list of username tuples (length 2 or more) from posts
# Filters out lists of None and lists with only 1 user. This is why 'minimum_posts' of 2 is unneeded.
# The exception is triggered by reaching the end of the JSONL file.
def get_edges():
    """Collect the distinct multi-user username tuples from the post stream.

    Consumes the module-level generator ``gen`` until it is exhausted, so a
    second call only sees posts that were not read yet (re-create ``gen`` to
    start over).

    Returns:
        list: Unique results of get_list_of_users() that hold more than one
        username, in arbitrary order.
    """
    # A for loop ends cleanly at the end of the file and lets real errors
    # (corrupt JSON, read failures, Ctrl-C) surface instead of being
    # swallowed by a bare except. Building the set directly also avoids
    # keeping every duplicate tuple in memory first.
    unique_user_tuples = {get_list_of_users(post) for post in gen}

    # Missing results and groups with fewer than two users cannot form an edge.
    edges_temp = [
        user_tuple
        for user_tuple in unique_user_tuples
        if user_tuple is not None and len(user_tuple) > 1
    ]
    print("Complete")
    return edges_temp

In [52]:
def remove_low_posters(all_edges, excluded_users=None):
    """Drop low-frequency posters from every username group.

    Args:
        all_edges (iterable of iterables): Groups of usernames, e.g. the
            tuples returned by get_edges().
        excluded_users (iterable, optional): Usernames to drop. Defaults to
            the module-level ``low_posters``.

    Returns:
        list of lists: One list per input group, in the same order, holding
        the usernames that were kept. A group may come back with fewer than
        two names, or empty.
    """
    if excluded_users is None:
        excluded_users = low_posters
    # Build the lookup set once: membership tests become O(1) instead of a
    # scan of the whole list per username. The source collection is
    # deliberately not mutated, so every occurrence of a low poster is
    # dropped (not only the first one) and other cells still see the full list.
    excluded = set(excluded_users)
    return [
        [username for username in username_tuple if username not in excluded]
        for username_tuple in all_edges
    ]

In [20]:
'''def remove_singles(all_edges):
    for user_tuple in all_edges:
        if len(user_tuple)<2:
            all_edges.remove(user_tuple)
        else:
            continue'''

In [9]:
def split_edges(all_edges):
    """
    Break username groups into unique pairs.

    A group of exactly two names is kept unchanged. For any other group,
    the first name is paired with each later entry that differs from it.

    Args:
        all_edges (list of tuples): Username groups of varying length.

    Returns:
        list of tuples: The resulting pairs with duplicates removed, in
        arbitrary order.
    """
    pairs = []
    for group in all_edges:
        if len(group) == 2:
            # Already a pair: keep as-is.
            pairs.append(group)
            continue
        # Fan out from the first user to every entry with a different name.
        pairs.extend(
            (group[0], member) for member in group if member != group[0]
        )
    return list(set(pairs))

In [None]:
all_edges = get_edges()
# Describe the nesting using the first edge. Guard the lookup so an empty
# result is reported instead of raising IndexError.
if all_edges and all_edges[0]:
    print(f'all_edges is a {type(all_edges)} of {type(all_edges[0])} containing {type(all_edges[0][0])}')
else:
    print(f'all_edges is an empty {type(all_edges)}')
len(all_edges)

In [10]:
# Split every element of all_edges into tuples of length 2, with duplicates removed.
edges = split_edges(all_edges)

In [None]:
'''# Remove from edges those users who only posted a few times
edges = [tup for tup in edges if not any(i in tup for i in low_posters)] '''

In [None]:
# Summarise how many users would remain if the low-frequency posters were removed.
# Assumes every name in low_posters also appears in nodes (both are derived
# from the per-author post counts) — verify if either is built differently.
remaining_users = len(nodes) - len(low_posters)
print(f'There are {len(nodes)} individual users.')
print(f'There are {len(low_posters)} users who made less than {minimum_posts} posts who can be removed.')
# Report a user count here: len(edges) counts edges, not users.
print(f'That will leave us with {remaining_users} users to graph.')

In [None]:
# save the nodes and edges to a file
# TODO: Write code to read in the files to the appropriate variables so that we don't have to run all of the above code again
# Context managers flush and close both files as soon as they are written.
# The encoding is pinned so non-ASCII usernames are written the same way on
# every platform.
# NOTE(review): edges are space-separated, but usernames can contain spaces
# (e.g. '@Private User'), so such lines are ambiguous to parse. Confirm what
# the reader of edges.txt expects before changing the delimiter.
with open(os.path.join(root_path, 'edges.txt'), 'w', encoding='utf-8') as edges_file:
    edges_file.write('\n'.join('%s %s' % x for x in edges))
with open(os.path.join(root_path, 'nodes.txt'), 'w', encoding='utf-8') as nodes_file:
    nodes_file.write('\n'.join('%s' % x for x in nodes))

In [None]:
# Report node and edge counts, one line per figure.
print(
    f'There are {len(nodes)} nodes in the graph.',
    f'There are {len(all_edges)} total edges, but this includes duplicates and multiple replies to one post.',
    f'There are {len(edges)} edges once we split the posts and remove duplicates.',
    sep='\n',
)

## Create and process the graph

In [None]:
import matplotlib.pylab as plt
import networkx as nx

# Build an undirected graph: every username becomes a node and every
# pair in edges becomes an edge between two users.
G = nx.Graph()
G.add_nodes_from(nodes)  # added first so users without any edge still appear as isolated nodes
G.add_edges_from(edges)

In [None]:
G.number_of_nodes()

In [None]:
G.number_of_edges()

In [None]:
S = [G.subgraph(c).copy() for c in nx.connected_components(G)]

In [None]:
S=sorted(S, key=len, reverse=True)

In [None]:
# Pick the second entry of S (S is sorted by size, largest first) as the
# subgraph to analyse; this raises IndexError if S has fewer than two entries.
sub=S[1]
# S[1] should have 21 nodes
# S[0] is the big one that takes forever to plot
len(sub)

In [None]:
# Report the size of the selected subgraph, one line per figure.
print(
    f'There are {sub.number_of_nodes()} nodes, and',
    f'There are {sub.number_of_edges()} edges in the subgraph',
    sep='\n',
)

In [None]:
nx.draw(sub, node_size=100, with_labels=True)

[Degree centrality](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.degree_centrality.html#networkx.algorithms.centrality.degree_centrality)

In [None]:
# TODO: sort this dict by value for readability
dc = nx.degree_centrality(sub)

In [None]:
dc

[Kernighan–Lin bipartition algorithm](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.kernighan_lin.kernighan_lin_bisection.html#networkx.algorithms.community.kernighan_lin.kernighan_lin_bisection)

In [None]:
from networkx.algorithms.community import kernighan_lin_bisection
klb = kernighan_lin_bisection(sub)

In [None]:
# TODO: adjust the printing of this for readability
klb

[Greedy Modularity Community](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.modularity_max.greedy_modularity_communities.html#networkx.algorithms.community.modularity_max.greedy_modularity_communities)

In [None]:
from networkx.algorithms.community import greedy_modularity_communities
gmc = greedy_modularity_communities(sub)

In [None]:
# TODO: adjust the printing of this for readability
gmc

In [None]:
# Save the graph to a GXF file for later use
# Can be imported into Gephi
nx.write_gexf(sub, "subgraph.gexf")