In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from collections import Counter
from pyvis.network import Network

In [2]:
def data_processing(sub, field = "submission", info=False):
    """Load one subreddit dump from ``data/<field>_<sub>.csv``.

    Parameters
    ----------
    sub : str
        Subreddit name used in the file name.
    field : str, default "submission"
        File-name prefix; this notebook uses "submission" and "comment".
    info : bool, default False
        When true, print the subreddit name, the ``describe()`` table and
        the ``info()`` summary of the loaded frame.

    Returns
    -------
    pandas.DataFrame
        The CSV contents, with the file's first column used as the index.
    """
    file_name = f"data/{field}_{sub}.csv"
    df = pd.read_csv(file_name, index_col=0)
    if info:
        print("subreddit : " + sub)
        print(df.describe())
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() would append a stray "None" line.
        df.info()
    return df

In [3]:
def have_bidirectional_relationship(G, node1, node2):
    """Tell whether ``G`` holds both the node1->node2 and node2->node1 edges.

    The reverse direction is only queried when the forward edge exists.
    """
    forward = G.has_edge(node1, node2)
    return G.has_edge(node2, node1) if forward else forward

In [4]:
def author_extraction(df_submission, df_comment):
    """Map every comment id and submission id to its author.

    Rows without an author are ignored. Neither input frame is modified:
    the filtering and index reset are done on derived frames, so the
    function gives the same result however often it is called.

    Parameters
    ----------
    df_submission : pandas.DataFrame
        Submissions with an ``author`` column. ``id`` must be available
        after ``reset_index()`` (presumably it is the frame's index, since
        the frames are loaded with ``index_col=0`` -- verify against the data).
    df_comment : pandas.DataFrame
        Comments with ``id`` and ``author`` columns.

    Returns
    -------
    dict
        ``{id: author}``; when an id occurs in both frames the submission's
        author wins because submissions are written last.
    """
    comments = df_comment.dropna(subset=['author'])
    # reset_index() exposes the submissions' index as a regular column.
    submissions = df_submission.dropna(subset=['author']).reset_index()

    author = dict(zip(comments['id'], comments['author']))
    author.update(zip(submissions['id'], submissions['author']))
    return author

In [5]:
def _reply_pairs(df, author, prefix):
    """Return (comment author, parent author) pairs for parents tagged ``prefix``."""
    # na=False keeps rows with a missing parent_id out of the mask instead of
    # making the boolean indexing fail.
    is_match = df['parent_id'].str.startswith(prefix, na=False)
    # dropna() without a subset discards rows with a missing value in any column.
    rows = df[is_match].dropna()

    pairs = []
    for replier, parent_id in zip(rows['author'], rows['parent_id']):
        parent_key = parent_id[len(prefix):]
        # Parents missing from the lookup are skipped rather than failing.
        if parent_key in author:
            pairs.append((replier, author[parent_key]))
    return pairs


def relation_extraction(df, author):
    """List who replied to whom as ``(comment author, parent author)`` tuples.

    Parameters
    ----------
    df : pandas.DataFrame
        Comments with ``author`` and ``parent_id`` columns. ``parent_id`` is
        expected to carry a three-character ``t3_`` or ``t1_`` prefix followed
        by the parent's id.
    author : dict
        Lookup from id (without prefix) to author name.

    Returns
    -------
    list of tuple
        Pairs for ``t3_`` parents first, then pairs for ``t1_`` parents, each
        group in frame order. Comments whose parent id is not in ``author``
        are left out.

    Raises
    ------
    KeyError
        If ``df`` lacks the ``author`` or ``parent_id`` column.
    """
    return _reply_pairs(df, author, "t3_") + _reply_pairs(df, author, "t1_")

In [6]:
def network_extraction(sub):
    """Load a subreddit's comment and submission files and return its reply pairs.

    The comments are read first, then the submissions; the id-to-author
    lookup built from both is used to resolve each comment's parent.
    """
    comments = data_processing(sub, field = "comment")
    submissions = data_processing(sub)
    return relation_extraction(comments, author_extraction(submissions, comments))

In [7]:
def network_to_df(net, threshold = 6):
    """Turn a list of relation pairs into a weighted edge list.

    Identical pairs are counted; pairs seen fewer than ``threshold`` times
    and pairs whose two members are equal are discarded.

    Parameters
    ----------
    net : iterable of 2-tuples
        Relation pairs. The first element becomes ``target`` and the second
        becomes ``source``.
    threshold : int, default 6
        Minimum number of occurrences for a pair to be kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``source``, ``target``, ``weight`` (the occurrence count).
        The frame is empty, with the same columns, when nothing qualifies.
    """
    pair_counts = Counter(net)
    rows = [
        (second, first, count)
        for (first, second), count in pair_counts.items()
        if count >= threshold
    ]
    # Building from explicit rows keeps the three columns present even when
    # no pair reaches the threshold.
    relation_df = pd.DataFrame(rows, columns=['source', 'target', 'weight'])
    return relation_df[relation_df['source'] != relation_df['target']]

In [55]:
def network_builder(relation_df, threshold = 5, net_type = "simple", sub = None):
    """Build an undirected graph from an edge list and drop small components.

    Parameters
    ----------
    relation_df : pandas.DataFrame
        Edge list with ``source``, ``target`` and ``weight`` columns.
    threshold : int, default 5
        Connected components with fewer nodes than this are removed.
    net_type : str, default "simple"
        Currently unused; kept so existing calls continue to work.
    sub : str or None, default None
        Currently unused; kept so existing calls continue to work.

    Returns
    -------
    networkx.Graph
        The graph restricted to components of at least ``threshold`` nodes.
    """
    # No output file name is built here: nothing in this function writes a
    # file, and building one failed whenever ``sub`` was left at None.
    G = nx.from_pandas_edgelist(relation_df, 'source', 'target', 'weight')

    # Collect the nodes first so the graph is not modified while its
    # components are still being enumerated.
    small_component_nodes = [
        node
        for component in nx.connected_components(G)
        if len(component) < threshold
        for node in component
    ]
    G.remove_nodes_from(small_component_nodes)
    return G

In [9]:
def mutual_network_builder(relation_df, threshold = 10, sub = None):
    """Render the network of mutually connected users to an HTML file.

    Only pairs linked in both directions are kept. Weakly connected
    components with fewer than ``threshold`` nodes are dropped, and the rest
    is drawn with pyvis into ``output/r_<sub>_mutual.html``.

    Parameters
    ----------
    relation_df : pandas.DataFrame
        Edge list with ``source``, ``target`` and ``weight`` columns.
    threshold : int, default 10
        Minimum size of a weakly connected component to be drawn.
    sub : str
        Subreddit name used in the output file name. Required in practice.

    Returns
    -------
    None

    Raises
    ------
    TypeError
        If ``sub`` is None.
    """
    if sub is None:
        # Same exception type the string concatenation below would raise,
        # but with a message that says what is missing.
        raise TypeError("sub must be the subreddit name used in the output file name")
    filename = 'output/r_' + sub + '_mutual.html'

    G = nx.from_pandas_edgelist(relation_df, 'source', 'target', 'weight',
                                create_using = nx.DiGraph())

    biconnections = []
    for u, v, w in G.edges.data("weight"):
        # Normalise the pair so both directions map to the same (u, v) key.
        if u > v:
            v, u = u, v
        if u != v and have_bidirectional_relationship(G, u, v):
            # NOTE(review): a mutual pair is visited once per direction, so the
            # same (u, v) is appended twice with each direction's weight; the
            # one added last presumably ends up on the edge -- confirm whether
            # a combined weight was intended.
            biconnections.append((u, v, w))

    G_02 = nx.DiGraph()
    G_02.add_nodes_from(relation_df["source"])
    G_02.add_weighted_edges_from(biconnections)
    G_02.remove_nodes_from(list(nx.isolates(G_02)))

    # Gather the nodes of undersized components before removing any of them.
    small_component_nodes = [
        node
        for component in nx.weakly_connected_components(G_02)
        if len(component) < threshold
        for node in component
    ]
    G_02.remove_nodes_from(small_component_nodes)

    mut_net = Network(height="750px", width="100%", bgcolor="#222222", font_color="white")

    # set the physics layout of the network
    mut_net.barnes_hut()

    for src, dst, w in list(G_02.edges.data("weight")):
        mut_net.add_node(src, src, title=src, color = "orange")
        mut_net.add_node(dst, dst, title=dst, color = "orange")
        mut_net.add_edge(src, dst, value=w, color = "orange")

    neighbor_map = mut_net.get_adj_list()

    # add neighbor data to node hover data
    for node in mut_net.nodes:
        neighbors = neighbor_map[node["id"]]
        node["title"] += " Neighbors:<br>" + "<br>".join(neighbors)
        node["value"] = len(neighbors)

    try:
        mut_net.show(filename)
    except Exception as exc:
        # Rendering stays best-effort, but the failure is reported instead of
        # being silently discarded.
        print(f"Could not render {filename}: {exc!r}")


In [80]:
def network_userlist(sub, th_01=2, th_02=3):
    """Return the users that remain in a subreddit's reply graph.

    ``th_01`` is the threshold handed to ``network_to_df`` and ``th_02`` the
    one handed to ``network_builder``. Two numbers are printed on the way:
    the count of distinct ``source`` values in the edge list, then the
    number of nodes in the resulting graph.
    """
    edges = network_to_df(network_extraction(sub), threshold = th_01)
    graph = network_builder(edges, threshold = th_02, sub = sub)

    print(edges.source.nunique())
    print(graph.number_of_nodes())

    return [*graph.nodes()]

In [93]:
# Subreddit "conservative": collect the user list twice, once with the
# default thresholds and once with th_01 = 1. Each network_userlist call
# prints two numbers: distinct edge sources, then nodes left in the graph.
sub_01 = "conservative"
sublist_01 = network_userlist(sub_01)
sublist_01_opt = network_userlist(sub_01, th_01 = 1)

89
57
933
2773


In [94]:
# Subreddit "conspiracy": default thresholds first, then th_01 = 1.
# Each call prints the distinct edge-source count and the graph's node count.
sub_02 = "conspiracy"
sublist_02 = network_userlist(sub_02)
sublist_02_opt = network_userlist(sub_02, th_01 = 1)

136
122
901
3180


In [96]:
# Subreddit "worldnews": th_01 = 2 first, then th_01 = 1 (th_02 left at its default).
# Each call prints the distinct edge-source count and the graph's node count.
sub_03 = "worldnews"
sublist_03 = network_userlist(sub_03, th_01 = 2)
sublist_03_opt = network_userlist(sub_03, th_01 = 1)

13
30
905
4332


In [98]:
# Subreddit "NeutralPolitics": th_01 = 2 first, then th_01 = 1 with th_02 raised
# to 10 (the other subreddits keep th_02 at its default for this second run).
# Each call prints the distinct edge-source count and the graph's node count.
sub_04 = "NeutralPolitics"
sublist_04 = network_userlist(sub_04, th_01 = 2)
sublist_04_opt = network_userlist(sub_04, th_01 = 1, th_02 = 10)

346
326
1146
1596


In [105]:
# All eight user lists, two per subreddit in the order the subreddits were
# processed: the first run's list followed by the "_opt" (th_01 = 1) list.
user_list = [sublist_01,sublist_01_opt,
            sublist_02,sublist_02_opt,
            sublist_03,sublist_03_opt,
            sublist_04,sublist_04_opt]

In [110]:
# Wrap the eight lists in a Series (one element per list; despite the name,
# `df` is a Series, not a DataFrame) and write it to user_list.csv without
# the index column.
# NOTE(review): each element is a Python list, so every CSV cell presumably
# holds the list's string form -- confirm the reader of this file parses it.
df = pd.Series(user_list)
df.to_csv('user_list.csv', index=False)

In [93]:
# NOTE(review): `relation` is not assigned at notebook level in any visible
# cell (it only exists as a local inside functions), so this cell presumably
# relies on a variable left over from an earlier session and would fail on a
# fresh kernel -- TODO confirm and either define `relation` first or drop the cell.
relation_df = network_to_df(relation, threshold = 2)
# mutual_network_builder(relation_df, sub = "politics", threshold = 20)