cleaning up

commit b9074b5dd063ce27e8448076a9651953feef4441 (1 parent eeec391)
mat kelcey authored
3  .gitignore
@@ -1,3 +1,4 @@
+*png
current_jobflow_id
distanceToPhilosophy/.classpath
distanceToPhilosophy/.project
@@ -6,4 +7,4 @@ distanceToPhilosophy/bin/
wikimedia/egs_actual
freebase/egs_actual
DistanceToPhilosophy.stderr
-DistanceToPhilosophy.stdout
+DistanceToPhilosophy.stdout
45 README
@@ -13,10 +13,9 @@ wait
# move xml for articles and redirects into hdfs
hadoop fs -mkdir /full/articles.xml
-hadoop fs -copyFromLocal enwiki-20110722-pages-articles.xml /full/articles.xml &
+hadoop fs -copyFromLocal enwiki-20110722-pages-articles.xml /full/articles.xml
hadoop fs -mkdir /full/redirects.xml
-hadoop fs -copyFromLocal enwiki-20110722-pages-redirects.xml /full/redirects.xml &
-wait
+hadoop fs -copyFromLocal enwiki-20110722-pages-redirects.xml /full/redirects.xml
# parse redirects
cd
@@ -41,28 +40,34 @@ hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
pig -p INPUT=/full/edges -p OUTPUT=/full/edges.dereferenced -f dereference_redirects.pig
# as a sanity check, should be same
-# pig -p INPUT=/full/edges.dereferenced -p OUTPUT=/full/edges.dereferenced2 -f dereference_redirects.pig
+# pig -p INPUT=/full/edges.dereferenced -p OUTPUT=/full/edges.dereferenced.sanity -f dereference_redirects.pig
# get to local filesystem
-hadoop fs -cat /full/edges.dereferenced/* > data/edges
+hadoop fs -cat /full/edges.dereferenced/* > edges
+# hadoop fs -cat /full/edges.dereferenced.sanity/* > edges.sanity
-# inject special cases
-cat special_edges_cases >> data/edges
+# add special cases, the parser will, alas, never be perfect...
+cat manually_derived_edges >> edges
# calculate distance from Philosophy
-java -Xmx8g -cp . DistanceToPhilosophy Philosophy data/edges >DistanceToPhilosophy.stdout 2>DistanceToPhilosophy.stderr
-
-# visited vs non visited
-grep ^FINAL DistanceToPhilosophy.stdout | wc -l
-grep ^didnt\ visit DistanceToPhilosophy.stdout | wc -l
-
-# work out which nodes we didn't visit
-grep ^didnt DistanceToPhilosophy.stdout | sed -es/didnt\ visit\ // > didnt_visit
-# summarise why we didn't visit them
-./walk_till_end.py < didnt_visit > walk_till_end.stdout
-grep end\ of\ line$ walk_till_end.stdout | cut -f2 | sort | uniq -c | sort -nr | head
+java -Xmx8g -cp distanceToPhilosophy/bin/ DistanceToPhilosophy \
+ Philosophy edges \
+ >DistanceToPhilosophy.stdout 2>DistanceToPhilosophy.stderr
+
+# order nodes by their number of descendants
+grep ^FINAL DistanceToPhilosophy.stdout | sed -es/FINAL\ // > distances
+cut -f1 distances > articles
+hfs -mkdir /articles_that_led_to_philosophy
+hfs -copyFromLocal articles /articles_that_led_to_philosophy
+hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
+ -input /articles_that_led_to_philosophy -output /num_descendants \
+ -mapper 'count_descendants.py edges' -file count_descendants.py -file edges \
+ -reducer aggregate
+hfs -cat /num_descendants/* | sort -k2 -t" " -nr > descendants.sorted
+# draw graph of the top 200
+head -n 200 descendants.sorted > descendants.top200
+./filter_nodes.py descendants.top200 < edges > filtered.edges.top200
+./to_dot.py filtered.edges.top200 descendants.top200 | dot -Tpng > top200.png
-San Jose InternationalSeattle-Tacoma International COMcast__11
-tV_s6ALJ
2  articleParser.py
@@ -93,6 +93,8 @@ def valid_link(link, title):
if link:
print (title+"\t"+link.strip()).encode('utf-8')
+ else:
+ sys.stderr.write("reporter:counter:parse,no_link,1\n")
except:
sys.stderr.write("reporter:counter:parse,exception_parsing_article,1\n")
31 count_descendants.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+import fileinput, re, sys
+
+edges = {}
+for line in open(sys.argv[1],'r').readlines():
+ from_node, to_node = line.strip().split("\t")
+ edges[from_node] = to_node
+
+for node in sys.stdin:
+ node = node.strip()
+ last_node = node
+ visited = set()
+
+ while (node!=None):
+ sys.stdout.write("LongValueSum:" + node + "\t1\n")
+ if node == 'Philosophy':
+ break
+
+ node = edges.get(node)
+
+ if node == None:
+ sys.stderr.write("reporter:counter:path,unexpected_end_of_line,1\n")
+ break
+ elif node in visited:
+ sys.stderr.write("reporter:counter:path,unexpected_cycle,1\n")
+ break
+ else:
+ visited.add(node)
+ last_node = node
+
+
23 explorer.py
@@ -7,29 +7,34 @@
from_node, to_node = line.strip().split("\t")
edges[from_node] = to_node
-dot_file = open('graph.dot','w')
-dot_file.write("digraph {\n")
+#dot_file = open('graph.dot','w')
+#dot_file.write("digraph {\n")
while True:
node = raw_input("node? ")
- last_node = node
-
if node=='':
break
+ last_node = node
visited = set()
+
while not(node == 'Philosophy' or node == None):
print node,"->",
node = edges.get(node)
- if node:
- dot_file.write('"'+last_node+'"->"'+node+'\"\n')
- last_node = node
+ if node in visited:
+ print node,"CYCLE"
+ node = None
+ else:
+ visited.add(node)
+# if node:
+# dot_file.write('"'+last_node+'"->"'+node+'\"\n')
+ last_node = node
if (node=='Philosophy'):
print node
else:
print "DONE"
-dot_file.write("}\n")
-dot_file.close()
+#dot_file.write("}\n")
+#dot_file.close()
17 filter_nodes.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+# filter nodes from edges
+import fileinput, re, sys
+
+allowed_nodes = set()
+for line in open(sys.argv[1],'r').readlines():
+ node, freq = line.strip().split("\t")
+ allowed_nodes.add(node)
+
+for edge in sys.stdin:
+ edge = edge.strip()
+ from_node, to_node = edge.split("\t")
+ if from_node in allowed_nodes and to_node in allowed_nodes:
+ print edge
+
+
+
153 journal
@@ -1041,3 +1041,156 @@ elastic-mapreduce -c ~/security/credentials.json --create --alive \
--bootstrap-action s3://elasticmapreduce/bootstrap-actions/configurations/latest/memory-intensive \
--bootstrap-action s3://mkelcey/wikipediaPhilosophy/install_beautiful_soup_and_mwlib.sh \
--pig-interactive --name mkelcey_1313088555
+
+articleParser
+Map input records 6,293,014
+exception_parsing_article 166
+no_link 34,440
+ignore_meta_title 2,569,682
+neg_parantheses_depth 1,914
+Map output records 3,688,729
+
+visited 3,540,120
+didnt visit 185,252
+
+hadoop@ip-10-17-182-55:~/wikipediaPhilosophy$ grep end\ of\ line$ walk_till_end.stdout | cut -f2 | sort | uniq -c | sort -nr | head
+ 35022 India
+ 3393 Local development ministry (no wiki page)
+ 1620 Iraq
+ 1529 Azana (gnat)
+ 739 Professional sports
+ 561 Uzbekistan
+ 479 Bhutan
+ 416 Tajikistan
+ 349 Borsod-Abaúj-Zemplén County
+ 313 Mental state
+
+looks like there are always going to be special cases...
+India -> South Asia
+Iraq -> Western Asia
+Professional sports -> Amateur sports
+Uzbekistan -> Landlocked country
+Bhutan -> Landlocked country
+Tajikistan -> Landlocked country
+Mental state -> Psychology
+
+brings it down a bit
+visited 3579277
+didnt visit 146095
+
+grep end\ of\ line$ walk_till_end.stdout | cut -f2 | sort | uniq -c | sort -nr | head
+ 3393 Local development ministry
+ 1529 Azana (gnat)
+ 349 Borsod-Abaúj-Zemplén County
+ 312 Cell biology
+ 274 Russian Soviet Federative Socialist Republic
+ 268 Discing
+ 251 Assyrian people
+ 247 German Navy
+ 236 Indoor American football
+ 225 March 21 (Eastern Orthodox liturgics)
+
+getting diminishing returns...
+
+cut -f4 walk_till_end.stdout|sort|uniq -c
+ 51362 cycle
+ 58097 end of line
+ 36636 no node
+
+grep no\ node$ walk_till_end.stdout | cut -f2 | sort | uniq -c
+ 36636 NA
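
(walk_till_end.py isn't part of this commit; judging from the greps above its output is tab separated with the terminal node in field 2 and the outcome — 'cycle', 'end of line' or 'no node' — in field 4. a rough sketch under those assumptions, with field 3 as a guessed step count:)

#!/usr/bin/env python
# rough sketch of walk_till_end.py (not in this commit); reads the didnt_visit
# list on stdin and reports where each article's chain of first links gives up
import sys

edges = {}
for line in open("edges"):
    from_node, to_node = line.rstrip("\n").split("\t")
    edges[from_node] = to_node

for start in sys.stdin:
    start = node = start.strip()
    visited = set([node])
    steps = 0
    if node not in edges:
        print(start + "\tNA\t0\tno node")
        continue
    while True:
        next_node = edges.get(node)
        if next_node is None:
            outcome = "end of line"       # last article has no parseable link
            break
        if next_node in visited:
            outcome = "cycle"             # looped back without reaching Philosophy
            break
        visited.add(next_node)
        node = next_node
        steps += 1
    print(start + "\t" + node + "\t" + str(steps) + "\t" + outcome)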
+
+-- looking at frequencies
+
+cut -f2 distances | uniq -c | perl -plne's/^\s+//' > distance.freqs
+
+dists = read.delim('distance.freqs', header=FALSE, sep=" ")
+names(dists) <- c("freq","distance")
+library(ggplot2)
+ggplot(head(dists, 70), aes(x=distance, y=freq)) +
+ geom_line() + geom_point() +
+ xlab('number of clicks away from philosophy') + ylab('number of articles') +
+ opts(title='distance to philosophy')
+
+--------------------------------
+-- blog notes
+
+general notes to add at start
+- wikipedia is under heavy edit churn. i've been doing this project in 15-20 minute chunks for a few weeks and it's amazing
+ how often i'd compare the parsing to live wikipedia and find out the page had already subtly changed.
+- i wrote all the code for this in python (trying to move away from ruby to get better data-related library support) _except_ for
+the breadth first search which i did in java. a 3e6 node dict was _insanely_ slow to access, i must be doing something wrong...
+
+
+to calculate the distance from philosophy for all terms it's a straightforward breadth first search,
+and because this search doesn't <a href="http://en.wikipedia.org/wiki/Graph_cycle">cycle</a> back to Philosophy again it ends
+up building a <a href="http://en.wikipedia.org/wiki/Tree_(graph_theory)">tree</a>
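
(a minimal python sketch of that breadth first search, not part of this commit — it assumes the tab separated from/to pairs in the edges file and prints the same 'FINAL' prefixed lines the later greps expect; the real run used the java DistanceToPhilosophy class since a few-million-entry python dict walk was painfully slow:)

#!/usr/bin/env python
# hypothetical bfs_distances.py; usage: bfs_distances.py edges > DistanceToPhilosophy.stdout
import sys
from collections import defaultdict, deque

# edges file: one "from<TAB>to" pair per line (each article's first link)
children = defaultdict(list)              # reverse index: to -> [from, ...]
for line in open(sys.argv[1]):
    from_node, to_node = line.rstrip("\n").split("\t")
    children[to_node].append(from_node)

# breadth first search outwards from Philosophy over the reversed edges;
# a node is never revisited, so the result is a tree of shortest distances
distance = {"Philosophy": 0}
queue = deque(["Philosophy"])
while queue:
    node = queue.popleft()
    for child in children[node]:
        if child not in distance:
            distance[child] = distance[node] + 1
            queue.append(child)

for node, d in distance.items():
    print("FINAL " + node + "\t" + str(d))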
+
+we can start answering some of our original questions now..
+
+1) there are 3,500,000 articles that lead to philosophy but there are 100,000 articles that don't. these articles fall into a number of subcategories
+
+- cycles; 50,000 articles end up stuck in cycles, which is remarkably low given 3,500,000 make it to philosophy.
+
+the vast majority of the cycles are two nodes; eg Waste management -> Waste collection -> Waste management
+
+my favorite that i stumbled across is 'Sand fence -> Snow fence -> Sand fence'
+the first sentence of Snow fence being "A snow fence is a structure, similar to a sand fence ..."
+the first sentence of Sand fence being "A sand fence is a structure similar to a snow fence ..."
+
+- another 50k are dead ends; all sorts of examples for this, mainly around pages that were never written or have been deleted;
+eg Windsurfing -> Surface water sports -> Discing (which was deleted)
+
+so Philosophy itself sits in a cycle:
+
+Philosophy -> Reason -> Natural science -> Science -> Knowledge -> Fact -> Information -> Sequence -> Mathematics -> Quantity -> Property (philosophy) -> Modern philosophy -> Philosophy
+
+2) of the articles that do lead to philosophy here's a graph showing the distribution of their distances
+the bulk are definitely between 10 and 30 clicks away.
+
+<img src="num_articles__num_clicks__philosophy.png" />
+
+i've trimmed this graph at 70 clicks away since there's a long tail of one single path that is 1001 clicks away
+
+List of state leaders in 1977 -> List of state leaders in 1976 -> List of state leaders in 1975 ->
+.... -> List of state leaders in 1001 -> List of state leaders in 1000 -> Fatimid Caliphate -> Arab people
+-> Panethnicity -> Ethnic group -> Social group
+-> Social sciences -> List of academic disciplines -> Academia -> Community -> Living -> Life
+-> Physical body -> Physics -> Natural science -> Science -> Knowledge -> Fact -> Information
+-> Sequence -> Mathematics -> Quantity -> Property (philosophy) -> Modern philosophy -> Philosophy
+
+The longest one I found that didn't include a chain of lists is Violet & Daisy which is 37 articles long.
+
+Violet & Daisy -> Saoirse Ronan -> BAFTA Award for Best Actress in a Supporting Role -> British Academy Film Awards -> British Academy of Film and Television Arts -> David Lean -> Order of the British Empire -> Chivalric order -> Knight -> Warrior -> Combat -> Violence -> Psychological manipulation -> Social influence -> Conformity -> Unconscious mind -> Germans -> Germanic peoples -> Proto-Indo-Europeans -> Proto-Indo-European language -> Linguistic reconstruction -> Internal reconstruction -> Language -> Human -> Extant taxon -> Biology -> Natural science -> Science -> Knowledge -> Fact -> Information -> Sequence -> Mathematics -> Quantity -> Property (philosophy) -> Modern philosophy -> Philosophy
+considering the first few depths of the tree
+
+distance  freq  comment
+0         1     Philosophy itself
+1         1084  Articles that are one click away from Philosophy
+2         1535  Articles that are two clicks away
+
+-- so the idea is the tree isn't even; there are well-trodden paths
+
+# calculate number of descendants for each node in tree from philosophy
+cut -f1 distances > articles
+hfs -mkdir /articles_that_led_to_philosophy
+hfs -copyFromLocal articles /articles_that_led_to_philosophy
+hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
+ -input /articles_that_led_to_philosophy -output /num_descendants \
+ -mapper 'count_descendants.py edges' -file count_descendants.py -file edges \
+ -reducer aggregate
+hfs -cat /num_descendants/* | sort -k2 -t" " -nr > descendants.sorted
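
(roughly what's happening in that job: the count_descendants.py mapper walks each input article towards Philosophy and emits a LongValueSum counter line for every node it passes through, and the built-in aggregate reducer just sums those per node — so a node's count is the number of articles whose path runs through it, itself included. a toy sketch of the same arithmetic, with hypothetical articles A and B:)

#!/usr/bin/env python
# toy version of mapper + 'aggregate' reducer; A and B are made-up articles
from collections import Counter

edges = {"A": "B", "B": "Philosophy"}     # article -> its first link
counts = Counter()
for article in ["A", "B"]:                # mapper input: one article per line
    node = article
    while node is not None:
        counts[node] += 1                 # mapper emits "LongValueSum:<node>\t1"
        if node == "Philosophy":
            break
        node = edges.get(node)

# A's path touches A, B, Philosophy; B's touches B, Philosophy
# -> {'A': 1, 'B': 2, 'Philosophy': 2}
print(dict(counts))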
+
+# draw graph of the top 10
+head -n 10 descendants.sorted > descendants.top10
+./filter_nodes.py descendants.top10 < edges > filtered.edges.top10
+./to_dot.py filtered.edges.top10 descendants.top10 | dot -Tpng > blah.png
+
+
+top200 zoom it
+<script src="http://zoom.it/adTw.js?width=auto&height=400px"></script>
+
+top1000 zoom it
+<script src="http://zoom.it/QyGA.js?width=auto&height=400px"></script>
+
+Philosophy -> Reason -> Natural science -> Science -> Knowledge -> Fact -> Information -> Sequence -> Mathematics -> Quantity -> Property (philosophy) -> Modern philosophy -> Philosophy
7 manually_derived_edges
@@ -0,0 +1,7 @@
+India South Asia
+Iraq Western Asia
+Professional sports Amateur sports
+Uzbekistan Landlocked country
+Bhutan Landlocked country
+Tajikistan Landlocked country
+Mental state Psychology
12 process_redirects.sh
@@ -0,0 +1,12 @@
+set -ex
+
+hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
+ -input /full/redirects.xml -output /full/redirects \
+ -mapper redirectParser.py -file redirectParser.py
+
+pig -p INPUT=/full/redirects -p OUTPUT=/full/redirects.dereferenced1 -f dereference_redirects.pig
+pig -p INPUT=/full/redirects.dereferenced1 -p OUTPUT=/full/redirects.dereferenced2 -f dereference_redirects.pig
+pig -p INPUT=/full/redirects.dereferenced2 -p OUTPUT=/full/redirects.dereferenced3 -f dereference_redirects.pig
+pig -p INPUT=/full/redirects.dereferenced3 -p OUTPUT=/full/redirects.dereferenced4 -f dereference_redirects.pig
+hfs -mv /full/redirects /full/redirects.original
+hfs -mv /full/redirects.dereferenced4 /full/redirects
42 runArticleParsing.sh
@@ -1,18 +1,42 @@
#!/usr/bin/env bash
-set -x
+set -ex
+
+cd ~/data
+
+wget http://download.wikimedia.org/enwiki/20110722/enwiki-20110722-pages-articles.xml.bz2
+
+bzcat enwiki-20110722-pages-articles.xml.bz2 | ~/wikipediaPhilosophy/flattenToOnePagePerLine.py > enwiki-20110722-pages-articles.pageperline.xml
+
+cat enwiki-20110722-pages-articles.pageperline.xml | grep \<redirect\ \/\> > enwiki-20110722-pages-redirects.xml &
+cat enwiki-20110722-pages-articles.pageperline.xml | grep -v \<redirect\ \/\> > enwiki-20110722-pages-articles.xml &
+
+hadoop fs -mkdir /full/articles.xml
+hadoop fs -mkdir /full/redirects.xml
+hadoop fs -copyFromLocal enwiki-20110722-pages-articles.xml /full/articles.xml &
+hadoop fs -copyFromLocal enwiki-20110722-pages-redirects.xml /full/redirects.xml &
+wait
+
+cd ~/wikipediaPhilosophy
+
+hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
+ -input /full/redirects.xml -output /full/redirects \
+ -mapper redirectParser.py -file redirectParser.py
+
+pig -p INPUT=/full/redirects -p OUTPUT=/full/redirects.dereferenced1 -f dereference_redirects.pig
+pig -p INPUT=/full/redirects.dereferenced1 -p OUTPUT=/full/redirects.dereferenced2 -f dereference_redirects.pig
+pig -p INPUT=/full/redirects.dereferenced2 -p OUTPUT=/full/redirects.dereferenced3 -f dereference_redirects.pig
+pig -p INPUT=/full/redirects.dereferenced3 -p OUTPUT=/full/redirects.dereferenced4 -f dereference_redirects.pig
+hfs -mv /full/redirects /full/redirects.original
+hfs -mv /full/redirects.dereferenced4 /full/redirects
-hadoop fs -rmr /full/edges
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
- -input /full/articles.xml -output /full/edges \
- -mapper articleParser.py -file articleParser.py
+ -input /full/articles.xml -output /full/edges \
+ -mapper articleParser.py -file articleParser.py
-hadoop fs -rmr /full/edges.dereferenced
pig -p INPUT=/full/edges -p OUTPUT=/full/edges.dereferenced -f dereference_redirects.pig
+pig -p INPUT=/full/edges.dereferenced -p OUTPUT=/full/edges.dereferenced2 -f dereference_redirects.pig # sanity
hadoop fs -cat /full/edges.dereferenced/* > data/edges
-time java -Xmx8g -cp . DistanceToPhilosophy Philosophy data/edges >DistanceToPhilosophy.stdout 2>DistanceToPhilosophy.stderr
+java -Xmx8g -cp . DistanceToPhilosophy Philosophy data/edges >DistanceToPhilosophy.stdout 2>DistanceToPhilosophy.stderr
-grep ^didnt DistanceToPhilosophy.stdout | sed -es/didnt\ visit\ // > didnt_visit
-./walk_till_end.py < didnt_visit >walk_till_end.stdout 2>walk_till_end.stderr
-grep end\ of\ line$ walk_till_end.stdout | cut -f2 | sort | uniq -c | sort -nr | head
27 simple_parse.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+from mwlib.uparser import parseString, simpleparse
+from BeautifulSoup import BeautifulStoneSoup
+from xml.sax.saxutils import unescape
+from mwlib.parser.nodes import *
+import fileinput
+import re
+
+def replace_nested(regex, text):
+ while True:
+ original = text
+ text = re.sub(regex, ' ', text)
+# print "text2 ", text[0:1000].encode('utf-8')
+ if original == text:
+ return text
+
+for line in fileinput.input():
+ xml = BeautifulStoneSoup(line)
+ text = xml.find('text').string
+ text = unescape(text, {"&apos;": "'", "&quot;": '"'})
+ text = replace_nested('{[^{]*?}', text)
+ simpleparse(text)
+
+
+
+
+
30 to_dot.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+import sys
+
+edges_file = sys.argv[1]
+descendants_file = sys.argv[2]
+
+weights = {}
+for node_weight in open(descendants_file,'r').readlines():
+ node, weight = node_weight.strip().split("\t")
+ weights[node] = int(weight)
+
+minw = min(weights.values())
+delta = max(weights.values()) - minw
+penwidths = {}
+#print "weights", weights
+for node in weights.keys():
+ normalised = float(weights[node] - minw) / delta
+ weights[node] = (normalised * 10000) + 1
+ penwidths[node] = (normalised * 30) + 1
+#print "weights", weights
+
+print "digraph {"
+print 'rankdir="LR"'
+print 'graph [ truecolor bgcolor="#00000000" ];'
+print 'node [ style=filled ];'
+for edge in open(edges_file,'r').readlines():
+ from_node, to_node = edge.strip().split("\t")
+# print '"' + from_node + '" -> "' + to_node + '" [ penwidth= ' + str(penwidths[from_node])+ '];'
+ print '"' + from_node + '" -> "' + to_node + '" [ weight = ' + str(weights[from_node]) + ', penwidth= ' + str(penwidths[from_node])+ '];'
+print "}"