# Imports and Mounting

In [1]:
#Imports
%matplotlib widget
import pandas as pd
import numpy as np
import csv
import matplotlib
import matplotlib.pyplot as plt
import scipy
import datetime
import networkx as nx

# Misc Methods

In [2]:
def head(a: dict, i: int) -> dict:
  """Return a new dict holding the leading ``i`` items of ``a``.

  Items are taken in the dict's iteration order using the slice ``[0:i]``,
  so ``i`` follows normal Python slice semantics.
  """
  leading_items = list(a.items())[:i]
  return {key: value for key, value in leading_items}

In [3]:
def drawColored(G: nx.Graph, coloring: dict):
  """Draw ``G`` with node labels, coloring every node by its entry in ``coloring``.

  Args:
    G: graph to draw.
    coloring: mapping from node to color value; a node missing from the
      mapping gets ``None`` (the result of ``dict.get``).
  """
  #One color per node, listed in the same order as G.nodes iterates
  node_colors = [coloring.get(vertex) for vertex in G.nodes]

  #Open a new figure, render the labeled graph with those colors, and show it
  plt.figure()
  nx.draw(G, with_labels=True, node_color=node_colors)
  plt.show()

In [4]:
def numberOfColors(coloring: dict) -> int:
  """Return how many distinct colors are used in ``coloring``.

  Args:
    coloring: mapping from node to its assigned color. The colors must be
      hashable because they are collected into a set.

  Returns:
    The number of distinct values in ``coloring`` (0 for an empty mapping).
  """
  #A set keeps each color once, so its size is the number of colors in use
  return len(set(coloring.values()))

In [5]:
#Read reddit text file data
def readRedditData():
    """Read the Reddit edge and vertex lists from the ``data/`` text files.

    ``data/reddit_edges.txt`` is parsed as one edge per line; the first two
    whitespace-separated tokens of a line are the edge's endpoints.
    ``data/reddit_vertices.txt`` is parsed as one vertex per line, with the
    surrounding whitespace (including the trailing newline) removed so the
    names are comparable to the edge endpoints. Blank lines are skipped in
    both files.

    Returns:
        tuple: ``(edges, vertices)`` where ``edges`` is a list of
        ``(source, target)`` string pairs and ``vertices`` is a list of strings.

    Raises:
        OSError: if either file cannot be opened.
        IndexError: if a non-blank edge line has fewer than two tokens.
    """
    edges = []
    # `with` guarantees the file is closed even if parsing raises
    with open('data/reddit_edges.txt', 'r') as edge_file:
        for line in edge_file:
            tokens = line.split()
            if not tokens:
                continue  # blank line, nothing to parse
            edges.append((tokens[0], tokens[1]))

    vertices = []
    with open('data/reddit_vertices.txt', 'r') as vertex_file:
        for line in vertex_file:
            name = line.strip()
            if name:
                vertices.append(name)

    # Hand the parsed data back to the caller instead of discarding it
    return edges, vertices

# Data acquisition

In [6]:
# Load the tab-separated Reddit hyperlink dataset into a DataFrame
# The file can be downloaded here: https://snap.stanford.edu/data/#temporal
df = pd.read_csv('bigdata/soc-redditHyperlinks-body.tsv', sep="\t")

In [7]:
#Drop the columns that are not needed here, then collapse repeated rows
dfBasic = df.drop(columns=['POST_ID', 'TIMESTAMP', 'LINK_SENTIMENT', 'PROPERTIES'])
dfEdges = dfBasic.drop_duplicates()

#Pull the two endpoint columns out of pandas as plain Python lists
sources = dfEdges.loc[:, 'SOURCE_SUBREDDIT'].tolist()
targets = dfEdges.loc[:, 'TARGET_SUBREDDIT'].tolist()

In [8]:
#Every subreddit that appears as a source or as a target is a vertex
vertices = list(set(sources) | set(targets))
len(vertices)

35776

In [9]:
#Print the vertex count, then display the first 20 vertices as a sample
print(len(vertices))
vertices[:20]

35776


['glitch_art',
 'tuscaloosa',
 'manwhorepodcast',
 'simplebanking',
 'ryan',
 'stunfisk',
 'trillek',
 'nsfw_korea',
 'dubaiclassifieds',
 'basingstoke',
 'gtd',
 'ideasfortifu',
 'nmsportals',
 'trump',
 'destionationtest',
 'littlepersonals',
 'wisconsingo',
 'redditpersonality',
 'vapewild',
 'letsencrypt']

In [10]:
#Get all edges: pair each source subreddit with the target at the same position.
#zip replaces the index loop, which rebound the built-in name `tuple` and so
#broke any later call to tuple(...) in this notebook.
edges = list(zip(sources, targets))

In [11]:
#Drop edges that repeat an earlier one in the opposite direction.
#The graph is undirected, so networkx would ignore these reversed tuples anyway.
#Keying on frozenset makes (a, b) and (b, a) compare as the same edge.
from more_itertools import unique_everseen
edges = [edge for edge in unique_everseen(edges, key=frozenset)]

In [12]:
#Print the edge count, then display the first 20 edges as a sample
print(len(edges))
edges[:20]

124330


[('leagueoflegends', 'teamredditteams'),
 ('theredlion', 'soccer'),
 ('inlandempire', 'bikela'),
 ('nfl', 'cfb'),
 ('playmygame', 'gamedev'),
 ('dogemarket', 'dogecoin'),
 ('locationbot', 'legaladvice'),
 ('indiefied', 'aww'),
 ('posthardcore', 'bestof2013'),
 ('posthardcore', 'corejerk'),
 ('gfycat', 'india'),
 ('metalcore', 'bestof2013'),
 ('metalcore', 'corejerk'),
 ('suicidewatch', 'offmychest'),
 ('dogecoin', 'novacoin'),
 ('gaming4gamers', 'fallout'),
 ('kpop', 'dota2'),
 ('airsoft', 'airsoftmarket'),
 ('circlebroke', 'childfree'),
 ('tribes', 'games')]

NameError: name 'test' is not defined

# NetworkX greedy coloring and drawing

In [None]:
#Create an empty undirected graph; its edges are added in the next cell
G = nx.Graph()

In [None]:
#Populate the graph from the first 40 edges only.
#Adding an edge also adds any endpoint that is not yet in the graph,
#so the explicit node insertion below is left disabled.

#G.add_nodes_from(vertices)
G.add_edges_from(edges[:40])

#Report how many nodes and edges the graph now holds
print(G.number_of_nodes())
print(G.number_of_edges())

In [None]:
#Greedy coloring of G using networkx's largest-first strategy,
#followed by a preview of the first 20 node -> color assignments
coloring = nx.greedy_color(G, strategy="largest_first")
head(coloring, 20)

In [None]:
#Display how many distinct colors the greedy coloring used
numberOfColors(coloring)

In [None]:
#Draw the graph with each node colored according to the computed coloring
drawColored(G, coloring)