In [1]:
import pickle
import json
import gzip
from pprint import pprint
import os.path

# Import data

## If using Google Colab, use the next three cells to import data

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
root_path = 'gdrive/My Drive/Parler Data/'  #change dir to your project folder

Mounted at /content/gdrive


In [28]:
# Written by Alex
# unzips the file and goes through it
# 1 line is 1 post
filename = os.path.join(root_path, 'processed.1850k.jsonl.gz')

def iterate_posts():
    with gzip.open(filename, 'rb') as fd:
      for line in fd:
          yield json.loads(line)
            
gen = iterate_posts()
#next(gen)

NameError: name 'root_path' is not defined

In [None]:
# Collect a set of all usernames found in posts
# Using posts.pickle.gz allows 1 line = 1 name
# This file breaks replies off from the parent post
# so that we don't have to worry about parsing multiple
# names in one line.
# Once we haved a list of each username from each post,
# remove duplicates to get a full list of unique usernames
from collections import Counter

filename2 = os.path.join(root_path, 'posts.pickle.gz')
single_posts = pickle.load(gzip.open(filename2, 'rb'))
nodes = []
for post in single_posts.values():
    nodes.append(post['author'])
    
user_post_count = Counter(nodes)
nodes = set(nodes)

## If using Jupyter Lab, use the next two cells to import data
Assumes that the data files are in the same folder as the .ipynb file

In [2]:
# Written by Alex
# unzips the file and goes through it
# 1 line is 1 post
filename = 'processed.1850k.jsonl.gz'
def iterate_posts():
    with gzip.open(filename, 'rb') as fd:
      for line in fd:
          yield json.loads(line)
            
gen = iterate_posts()
#next(gen)

In [3]:
# Collect a set of all usernames found in posts
# Using posts.pickle.gz allows 1 line = 1 name
# This file breaks replies off from the parent post
# so that we don't have to worry about parsing multiple
# names in one line.
# Once we haved a list of each username from each post,
# remove duplicates to get a full list of unique usernames
from collections import Counter

filename2 = 'posts.pickle.gz'
single_posts = pickle.load(gzip.open(filename2, 'rb'))
nodes = []
for post in single_posts.values():
    nodes.append(post['author'])
    
user_post_count = Counter(nodes)
nodes = list(set(nodes))

## Process the data
### Everything after here is platform agnostic

In [4]:
# Get a dictionary of user post frequencies by username sorted by post frequencies, descending
# The sorting isn't really necessary for the next part,
# it just makes it easier to spot check manually

sorted_user_post_count=sorted(dict(user_post_count).items(), key=lambda item:item[1], reverse=True)

In [5]:
sorted_user_post_count[0]

('@Private User', 37703)

In [6]:
# Get users who only posted a certain number of times
# The number of posts may be adjusted to meet filtering needs

# CHANGE THIS VARIABLE TO ADJUST USER FILTERING
minimum_posts = 4 

low_posters = [user for user in sorted_user_post_count if user[1]<minimum_posts]

In [7]:
def get_edges(post):
    """ 
    Extract all usernames from each post

    Note:
        There can be None, one, or more usernames in a post.
     
     Args:
         post (dict): Yielded by iterate_posts()

    Attributes:
        users (dict): Used to collect usernames 
        
    Returns:
        users: Tuple of usernames from each post. Can contain any number of usernames.
    """    

    users = []
    try:
        for post_item in post['posts']:                
            users.append(post_item['author_username'])
    except KeyError:
        return tuple(users)

In [8]:
# Collect a list of tuples of of length 2 or more usernames from posts
# The exception is triggered by reaching the end of the JSONL file.

edges_temp=[]
all_edges=[]

try:
    while True:
        edges_temp.append(get_edges(next(gen))) 
except:  
    edges_temp=list(set(edges_temp))
    for user_tuple in edges_temp:
        if user_tuple != None and len(user_tuple)>1:
            all_edges.append(user_tuple)
        else:
            continue
    print("Complete")

Complete


In [9]:
def split_edges(all_edges):
    """  
    Split tuples into tuples of length 2 and remove duplicates.

    Args:
        all_edges (list of tuples): These tuples vary in length.

    Attributes: 
        user_list (list): Keeps tuples generated/passed by for loop.
        
    Returns:
        user_list (list of tuples): Each tuple is of length 2 and duplicates are removed.
        
    """

    user_list = []
    
    for user_tuple in all_edges:
        if len(user_tuple)==2:
            user_list.append(user_tuple)
        else:
            for username in user_tuple:
                if username!=user_tuple[0]:
                    user_list.append((user_tuple[0],username))
                else:
                    continue
            continue
    return list(set(user_list))

In [10]:
# Make all elements of edges to be tuples of length 2 with duplicates removed.
edges = split_edges(all_edges)
split = split_edges(all_edges)

In [13]:
temp = []
for user_tuple in edges:
    if user_tuple[0] not in low_posters:
        if user_tuple[1] not in low_posters:
            temp.append(user_tuple)
        else:
            continue
    else:
        continue




In [14]:
print(f'Before removal of low posters: {len(split)}')
print(f'After removal of low posters: {len(temp)}')

Before removal of low posters: 229235
After removal of low posters: 229235


In [None]:
print(f'There are {len(nodes)} individual users.')
print(f'After splitting posts into sets of 2 users, there are {len(split)} sets to check')
print(f'There are {len(low_posters)} users who made less than {minimum_posts} posts who can be removed.')
print(f'That will leave us with {len(edges2)} sets to graph.')

In [None]:
# save the nodes and edges to a file
# TODO: Write code to read in the files to the appropriate variables so that we don't have to run all of the above code again
open('edges_with_low_posters_2_removed.txt', 'w').write('\n'.join('%s %s' % x for x in edges))
open('nodes.txt', 'w').write('\n'.join('%s' % x for x in nodes))

In [None]:
print(f'There are {len(nodes)} nodes in the graph.')
print(f'There are {len(all_edges)} total edges, but this includes duplicates and multiple replies to one post.')
print(f'There are {len(edges)} edges once we split the posts and remove duplicates.')      

## Create and process the graph

In [15]:
import matplotlib.pylab as plt
import networkx as nx

G = nx.Graph()
G.add_nodes_from(nodes)
G.add_edges_from(edges)

In [16]:
G.number_of_nodes()

185207

In [17]:
G.number_of_edges()

228780

In [18]:
S = [G.subgraph(c).copy() for c in nx.connected_components(G)]

In [19]:
S=sorted(S, key=len, reverse=True)

In [21]:
sub=S[0]
# S[1] should have 21 nodes
# S[0] is the big one that takes forever to plot
len(sub)

21

In [None]:
print(f'There are {sub.number_of_nodes()} nodes, and')
print(f'There are {sub.number_of_edges()} edges in the subgraph')

In [None]:
nx.draw(sub, node_size=100, with_labels=True)

[Degree centrality](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.centrality.degree_centrality.html#networkx.algorithms.centrality.degree_centrality)

In [None]:
# TODO: sort this dict by value for readability
dc = nx.degree_centrality(sub)

In [None]:
dc['@zvi20']

[Kernighan–Lin bipartition algorithm](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.kernighan_lin.kernighan_lin_bisection.html#networkx.algorithms.community.kernighan_lin.kernighan_lin_bisection)

In [None]:
from networkx.algorithms.community import kernighan_lin_bisection
klb = kernighan_lin_bisection(sub)

In [None]:
# TODO: adjust the printing of this for readability
klb

[Greedy Modularity Community](https://networkx.org/documentation/stable/reference/algorithms/generated/networkx.algorithms.community.modularity_max.greedy_modularity_communities.html#networkx.algorithms.community.modularity_max.greedy_modularity_communities)

In [None]:
from networkx.algorithms.community import greedy_modularity_communities
gmc = greedy_modularity_communities(sub)

In [None]:
# TODO: adjust the printing of this for readability
gmc

In [None]:
# Save the graph to a GXF file for later use
# Can be imported into Gephi
nx.write_gexf(sub, "subgraph.gexf")