In [None]:
import networkx as nx
import pandas as pd
import ast
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the tweet export into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
data = pd.read_csv('../../../data/tweets.csv')

In [None]:
# Keep only the three columns the graph-building functions read, then show
# how many rows are available.
data = data.loc[:, ['user_key', 'mentions', 'created_at']]
data.shape[0]

In [None]:
def createGraph(data, directed=False, only_trolls=False):
    """Build a mention graph from a table of tweets.

    Each row contributes its author (``user_key``) as a node, plus one edge
    from the author to every account listed in ``mentions``. Every node
    carries ``start_date`` and ``end_date`` attributes holding the smallest
    and largest ``created_at`` among the rows it appears in (as author or as
    a mentioned account). Every edge carries a ``date`` attribute holding the
    ``created_at`` of the last row that produced it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``user_key``, ``mentions`` (a string holding
        a Python list literal) and ``created_at`` (a value ``float()``
        accepts).
    directed : bool, optional
        Build an ``nx.DiGraph`` (author -> mentioned) instead of an
        ``nx.Graph``.
    only_trolls : bool, optional
        When true, mentioned accounts that never occur in ``user_key`` are
        skipped: they get neither a node nor an edge.

    Returns
    -------
    networkx.Graph or networkx.DiGraph
    """
    G = nx.DiGraph() if directed else nx.Graph()
    trolls = set(data['user_key'])
    # Local mirror of each node's date span, kept in sync with the graph
    # attributes so later rows are compared against the current extremes
    # rather than against the first date ever seen for the node.
    nodes = {}

    def _record(name, stamp):
        """Add ``name`` to the graph, or widen its date span to cover ``stamp``."""
        span = nodes.get(name)
        if span is None:
            nodes[name] = {'start_date': stamp, 'end_date': stamp}
            G.add_node(name, start_date=stamp, end_date=stamp)
            return
        when = float(stamp)
        if when < float(span['start_date']):
            span['start_date'] = stamp
            nx.set_node_attributes(G, {name: {'start_date': stamp}})
        if when > float(span['end_date']):
            span['end_date'] = stamp
            nx.set_node_attributes(G, {name: {'end_date': stamp}})

    for index, row in data.iterrows():
        # Progress indicator for long inputs.
        if index % 10000 == 0:
            print(index)
        stamp = row['created_at']
        init_node = row['user_key']
        _record(init_node, stamp)
        for user in ast.literal_eval(row['mentions']):
            if only_trolls and user not in trolls:
                continue
            _record(user, stamp)
            G.add_edge(init_node, user, date=stamp)
    return G

In [None]:
def getLargestWCC(G):
    """Return the largest connected component of ``G`` as a new graph.

    Side effect: isolated (degree-0) nodes are removed from ``G`` in place
    before the components are computed. The fractions of the original nodes
    and edges that survive in the returned component are printed.

    Parameters
    ----------
    G : networkx.Graph
        Graph to analyse; it loses its isolated nodes.

    Returns
    -------
    networkx.Graph
        An independent copy of the component with the most nodes.

    Raises
    ------
    ValueError
        If no node is left once the isolates are removed.
    """
    nodes_count = G.number_of_nodes()
    edges_count = G.number_of_edges()
    G.remove_nodes_from(list(nx.isolates(G)))
    if G.number_of_nodes() == 0:
        raise ValueError('graph has no connected nodes, so there is no component to extract')
    # nx.connected_component_subgraphs was removed in networkx 2.4; building
    # the subgraph from connected_components is the documented replacement.
    biggest = max(nx.connected_components(G), key=len)
    largest_cc = G.subgraph(biggest).copy()
    print('Preserved nodes ', largest_cc.number_of_nodes() / nodes_count)
    print('Preserved edges ', largest_cc.number_of_edges() / edges_count)
    return largest_cc

In [None]:
def getSubsetofData(data, largest_cc):
    """Keep the rows of ``data`` that touch a node of ``largest_cc``.

    A row is kept when its author (``user_key``) is a node of ``largest_cc``
    or when at least one account in its ``mentions`` list is. Rows are
    selected by position, so a non-unique index cannot cause rows that should
    be kept to be dropped along with a rejected row sharing their label.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``user_key`` and ``mentions`` (a string holding a Python
        list literal).
    largest_cc : graph-like
        Object whose ``nodes`` attribute iterates over node names.

    Returns
    -------
    pandas.DataFrame
        The kept rows, in their original order and with their original index
        labels.
    """
    members = set(largest_cc.nodes)
    keep_positions = []
    for position, (author, mentions) in enumerate(zip(data['user_key'], data['mentions'])):
        # The mentions literal is parsed only when the author alone does not
        # already qualify the row.
        if author in members or any(user in members for user in ast.literal_eval(mentions)):
            keep_positions.append(position)
    return data.iloc[keep_positions]

In [None]:
def plotGraph(G, trolls_list, real_list, title='graph', no_labels=True):
    """Draw ``G`` with a spring layout and save it as ``<title>.svg`` and ``<title>.png``.

    Parameters
    ----------
    G : networkx graph
        Graph to draw.
    trolls_list : list
        Nodes drawn in red with size 100.
    real_list : list
        Nodes drawn in green with size 50.
    title : str, optional
        Base name (without extension) of the two saved image files.
    no_labels : bool, optional
        When false, node labels are drawn as well.
    """
    figure = plt.figure(figsize=(80, 80))
    layout = nx.spring_layout(G)
    # Real accounts are drawn first, trolls second.
    for members, colour, size in ((real_list, 'g', 50), (trolls_list, 'r', 100)):
        nx.draw_networkx_nodes(G, layout, nodelist=members, node_color=colour, node_size=size)
    nx.draw_networkx_edges(G, layout, width=0.5, alpha=0.5)
    show_labels = not no_labels
    if show_labels:
        nx.draw_networkx_labels(G, layout, font_size=8, font_weight='bold', font_color="b")
    plt.axis('equal')
    plt.show()
    for extension in ('.svg', '.png'):
        figure.savefig(title + extension)

# Undirected Graph (Largest WCC)

In [None]:
# Build the undirected mention graph from every tweet.
G = createGraph(data)
# A cell displays only its final expression, so both sizes are shown as one
# (edges, nodes) tuple instead of computing the edge count and discarding it.
G.number_of_edges(), G.number_of_nodes()

In [None]:
# Spot-check the first-activity date recorded for one account.
G.nodes['giselleevns']['start_date']

In [None]:
# Largest connected component of the undirected mention graph.
G = getLargestWCC(createGraph(data))
# Accounts that authored at least one row of `data` are the trolls; every
# other node was only ever mentioned.
all_trolls = set(data['user_key'])
trolls_list = [name for name in G.nodes if name in all_trolls]
real_list = [name for name in G.nodes if name not in all_trolls]
plotGraph(G, trolls_list, real_list, title='undirectedGraph')

## Only trolls

In [None]:
# Largest connected component of the troll-to-troll undirected graph.
G = getLargestWCC(createGraph(data, only_trolls=True))
print(G.number_of_nodes(), G.number_of_edges())
all_trolls = set(data['user_key'])
trolls_list = [name for name in G.nodes if name in all_trolls]
real_list = [name for name in G.nodes if name not in all_trolls]
plotGraph(G, trolls_list, real_list, title='only_trolls_undirected_labels_wcc', no_labels=False)

# Directed Graph (Largest WCC)

In [None]:
# Restrict the tweets to rows touching the largest undirected component, then
# rebuild that subset as a directed graph.
wcc_data = getSubsetofData(data, getLargestWCC(createGraph(data)))
G = createGraph(wcc_data, directed=True)
all_trolls = set(wcc_data['user_key'])
trolls_list = [name for name in G.nodes if name in all_trolls]
real_list = [name for name in G.nodes if name not in all_trolls]
plotGraph(G, trolls_list, real_list, title='directedGraph')

## Only trolls

In [None]:
# Restrict the tweets to rows touching the largest troll-only component, then
# rebuild that subset as a directed troll-to-troll graph.
wcc_data = getSubsetofData(data, getLargestWCC(createGraph(data, only_trolls=True)))
G = createGraph(wcc_data, directed=True, only_trolls=True)
print(G.number_of_nodes(), G.number_of_edges())
all_trolls = set(wcc_data['user_key'])
trolls_list = [name for name in G.nodes if name in all_trolls]
real_list = [name for name in G.nodes if name not in all_trolls]
plotGraph(G, trolls_list, real_list, title='only_trolls_directed_labels_wcc', no_labels=False)

# Things to do
* Look at the most connected node
* Degrees of separation
* What about the other WCCs?
* Any fake users vs real users disparities?
* How much does the largest WCC account for in terms of nodes and edges?

In [None]:
def subtractGraph(full, sub):
    """Remove from ``full`` every node that also belongs to ``sub``.

    ``full`` is modified in place and returned. Its node count is printed
    before and after the removal.
    """
    print(len(full.nodes))
    # Collect first, remove afterwards, so the node view is not mutated while
    # it is being iterated.
    shared = [name for name in full.nodes if name in sub]
    full.remove_nodes_from(shared)
    print(len(full.nodes))
    return full

In [None]:
# Network of trolls not connected to the trolls' largest WCC
G_WCC = getLargestWCC(createGraph(data, only_trolls=True))
# getLargestWCC strips isolated nodes from the graph it is given, so the full
# troll graph is built again before the component is subtracted from it.
G_full = createGraph(data, only_trolls=True)
G_other = subtractGraph(G_full, G_WCC)
other_data = getSubsetofData(data, G_other)
G_other = createGraph(other_data, directed=True)
all_trolls = set(other_data['user_key'])
trolls_list = [name for name in G_other.nodes if name in all_trolls]
real_list = [name for name in G_other.nodes if name not in all_trolls]
plotGraph(G_other, trolls_list, real_list, title='newThing_test', no_labels=False)

In [None]:
# Directed graph of the tweets touching the largest overall component.
wcc_data = getSubsetofData(data, getLargestWCC(createGraph(data)))
G_WCC = createGraph(wcc_data, directed=True)
# Directed troll-only graph of the tweets touching the largest troll component.
wcc_data = getSubsetofData(data, getLargestWCC(createGraph(data, only_trolls=True)))
G_WCC_trolls = createGraph(wcc_data, only_trolls=True, directed=True)
# Unfiltered undirected graphs: everyone, and trolls only.
G = createGraph(data)
G_trolls = createGraph(data, only_trolls=True)
all_trolls = set(data['user_key'])
trolls_list = [name for name in G.nodes if name in all_trolls]
real_list = [name for name in G.nodes if name not in all_trolls]
# plotGraph(G, trolls_list, real_list, title='only_trolls_undirected_labels_wcc', no_labels=False)

In [None]:
import json

# Export G as {"nodes": [...], "links": [...]} to graph_data.json.
# The payload has its own name so the `data` DataFrame is not overwritten;
# later cells still pass `data` to createGraph.
graph_json = {"nodes": [], "links": []}
start_dates = nx.get_node_attributes(G, 'start_date')
end_dates = nx.get_node_attributes(G, 'end_date')
edge_dates = nx.get_edge_attributes(G, 'date')  # collected but not exported
for node in list(G.nodes):
    group_1 = 'troll' if node in all_trolls else 'real'
    # NOTE(review): this cell previously held an unfinished
    # `if G.degree(node) > 1:` guard with no body and a dangling `''` dict
    # entry, so it did not parse. Every node is exported here so that each
    # link endpoint exists in "nodes"; 'count' carries the degree for
    # consumers that want to filter. Confirm whether a filter was intended.
    graph_json['nodes'].append({'id': node, 'type': group_1, 'count': G.degree(node),
                                'start_date': start_dates[node], 'end_date': end_dates[node]})
for edge in list(G.edges):
    source, target = edge[0], edge[1]
    # group_1: 'troll' only when both endpoints are trolls.
    group_1 = 'troll' if source in all_trolls and target in all_trolls else 'real'
    # group_2: 'major' only when both endpoints have degree above 1.
    group_2 = 'major' if G.degree(source) > 1 and G.degree(target) > 1 else 'minor'
    graph_json['links'].append({"source": source, "target": target, "value": 1,
                                "group_1": group_1, "group_2": group_2})
with open('graph_data.json', 'w') as outfile:
    json.dump(graph_json, outfile)

In [None]:
# Directed graph of the tweets touching the largest undirected component.
wcc_data = getSubsetofData(data, getLargestWCC(createGraph(data)))
G = createGraph(wcc_data, directed=True)

In [None]:
# Number of nodes in the current graph.
G.number_of_nodes()

In [None]:
# Count the nodes that are not trolls, and how many of those have a degree
# below 2.
non_troll_nodes = [name for name in G.nodes if name not in all_trolls]
counter = len(non_troll_nodes)
little_mention = sum(1 for name in non_troll_nodes if G.degree(name) < 2)
print(counter)
print(little_mention)

Troll vs. real user; trolls in the largest WCC; non-trolls with 2+ mentions; overview of all (not important).