# Subreddit links directed Graph

Source:
https://snap.stanford.edu/data/soc-RedditHyperlinks.html

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import scipy
from networkx.drawing.nx_agraph import graphviz_layout

#Zinoviev, Dmitry. Complex Network Analysis in Python (p. 70). Pragmatic Bookshelf. Kindle Edition. 

Note: There's a bit of dependency hell going on here.  You need scipy 1.8.0 or greater.  I made this using scipy 1.8.1 and networkx version 2.7.1

In [2]:
print(scipy.__version__)
print(nx.__version__)

1.8.1
2.7.1


In [3]:
filename = "soc-redditHyperlinks-body.tsv" #https://snap.stanford.edu/data/soc-RedditHyperlinks.html
df1 = pd.read_csv(filename, sep='\t')

In [4]:
df1.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES
0,leagueoflegends,teamredditteams,1u4nrps,2013-12-31 16:39:58,1,"345.0,298.0,0.75652173913,0.0173913043478,0.08..."
1,theredlion,soccer,1u4qkd,2013-12-31 18:18:37,-1,"101.0,98.0,0.742574257426,0.019801980198,0.049..."
2,inlandempire,bikela,1u4qlzs,2014-01-01 14:54:35,1,"85.0,85.0,0.752941176471,0.0235294117647,0.082..."
3,nfl,cfb,1u4sjvs,2013-12-31 17:37:55,1,"1124.0,949.0,0.772241992883,0.0017793594306,0...."
4,playmygame,gamedev,1u4w5ss,2014-01-01 02:51:13,1,"715.0,622.0,0.777622377622,0.00699300699301,0...."


In [5]:
df1.shape

(286561, 6)

In [6]:
def MakeListForProperties(strParam, sepchar=","):
    returnList= []
    position = 0
    count = 0
    for char in strParam:
        #print(count, char, position)
        if (str(char) == sepchar):
            returnList.append(float(strParam[position:count]))
            position = count+1
        count +=1
    returnList.append(strParam[position:])
    return returnList
#print(MakeListForProperties(df1.loc[0,"PROPERTIES"]))

In [7]:
templist = []
for i in (list(df1["PROPERTIES"])):
    templist.append(MakeListForProperties(i))


In [8]:
len(templist)

286561

In [9]:
templist[0:2]

[[345.0,
  298.0,
  0.75652173913,
  0.0173913043478,
  0.0869565217391,
  0.150724637681,
  0.0753623188406,
  57.0,
  53.0,
  10.0,
  4.78947368421,
  15.0,
  0.315789473684,
  1.0,
  1.0,
  345.0,
  57.0,
  35.5778947368,
  0.073,
  0.08,
  0.1748,
  0.3448275862068966,
  0.05172413793103448,
  0.034482758620689655,
  0.0,
  0.034482758620689655,
  0.0,
  0.0,
  0.0,
  0.017241379310344827,
  0.05172413793103448,
  0.10344827586206896,
  0.05172413793103448,
  0.0,
  0.10344827586206896,
  0.0,
  0.034482758620689655,
  0.034482758620689655,
  0.06896551724137931,
  0.017241379310344827,
  0.034482758620689655,
  0.0,
  0.0,
  0.10344827586206896,
  0.0,
  0.0,
  0.0,
  0.05172413793103448,
  0.017241379310344827,
  0.034482758620689655,
  0.0,
  0.0,
  0.017241379310344827,
  0.1896551724137931,
  0.034482758620689655,
  0.0,
  0.034482758620689655,
  0.034482758620689655,
  0.0,
  0.0,
  0.06896551724137931,
  0.05172413793103448,
  0.034482758620689655,
  0.034482758620689655,
  

In [10]:
df1["Properties_Listed"] = templist
df1.head()

Unnamed: 0,SOURCE_SUBREDDIT,TARGET_SUBREDDIT,POST_ID,TIMESTAMP,LINK_SENTIMENT,PROPERTIES,Properties_Listed
0,leagueoflegends,teamredditteams,1u4nrps,2013-12-31 16:39:58,1,"345.0,298.0,0.75652173913,0.0173913043478,0.08...","[345.0, 298.0, 0.75652173913, 0.0173913043478,..."
1,theredlion,soccer,1u4qkd,2013-12-31 18:18:37,-1,"101.0,98.0,0.742574257426,0.019801980198,0.049...","[101.0, 98.0, 0.742574257426, 0.019801980198, ..."
2,inlandempire,bikela,1u4qlzs,2014-01-01 14:54:35,1,"85.0,85.0,0.752941176471,0.0235294117647,0.082...","[85.0, 85.0, 0.752941176471, 0.0235294117647, ..."
3,nfl,cfb,1u4sjvs,2013-12-31 17:37:55,1,"1124.0,949.0,0.772241992883,0.0017793594306,0....","[1124.0, 949.0, 0.772241992883, 0.001779359430..."
4,playmygame,gamedev,1u4w5ss,2014-01-01 02:51:13,1,"715.0,622.0,0.777622377622,0.00699300699301,0....","[715.0, 622.0, 0.777622377622, 0.0069930069930..."


In [11]:
G = nx.DiGraph()

In [12]:
G.add_nodes_from(list(df1["SOURCE_SUBREDDIT"]))

In [13]:
G.add_nodes_from(list(df1["TARGET_SUBREDDIT"]))

In [14]:
edges1=[]
for i in range(0,df1.shape[0]):
    edges1.append((df1.iloc[i,0], df1.iloc[i,1]))
    #if i >312:
        #break
edges1

[('leagueoflegends', 'teamredditteams'),
 ('theredlion', 'soccer'),
 ('inlandempire', 'bikela'),
 ('nfl', 'cfb'),
 ('playmygame', 'gamedev'),
 ('dogemarket', 'dogecoin'),
 ('locationbot', 'legaladvice'),
 ('indiefied', 'aww'),
 ('posthardcore', 'bestof2013'),
 ('posthardcore', 'corejerk'),
 ('gfycat', 'india'),
 ('metalcore', 'bestof2013'),
 ('metalcore', 'corejerk'),
 ('suicidewatch', 'offmychest'),
 ('dogecoin', 'novacoin'),
 ('gaming4gamers', 'fallout'),
 ('kpop', 'dota2'),
 ('airsoft', 'airsoftmarket'),
 ('circlebroke', 'childfree'),
 ('tribes', 'games'),
 ('oldschoolcoolnsfw', 'pics'),
 ('fl_vapers', 'vaperequests'),
 ('jailbreak', 'flextweak'),
 ('corejerk', 'bestof2013'),
 ('iama', 'todayilearned'),
 ('bandnames', 'books'),
 ('thedoctorstravels', 'hungergamesrp'),
 ('politicaldiscussion', 'todayilearned'),
 ('uncomfortableqs', 'debatereligion'),
 ('connecticut', 'ctbeer'),
 ('metafitnesscirclejerk', 'fitnesscirclejerk'),
 ('srssucks', 'funny'),
 ('thehiddenbar', 'writingprompts'

In [15]:
len(edges1)

286561

In [16]:
len(set(edges1))

137821

In [17]:
G.add_edges_from(set(edges1))

In [18]:
len(set(G.nodes()))

35776

In [19]:
#pos = graphviz_layout(G)
#Zinoviev, Dmitry. Complex Network Analysis in Python (p. 70). Pragmatic Bookshelf. Kindle Edition. 

In [20]:
#nx.draw_networkx(G)

##This takes hours to run and it's trash!!
#this prints with labels and looks like a mess, useless

In [22]:
#nx.draw_networkx(G, with_labels=False)
##This takes hours to run and it's trash!!

## Write File for Gephi

In [23]:
with open("Subreddits2.graphml", "wb") as ofile:
    nx.write_graphml(G, ofile)