do all first links on wikipedia _really_ lead to philosophy?
article.egs
distanceToPhilosophy/src
freebase
.gitignore
README
article_parser.py
count_descendants.py
dereference_redirects.pig
egs_actual
egs_expected
explorer.py
filter_nodes.py
flatten_to_one_page_per_line.py
install_beautiful_soup_and_mwlib.sh
linktastic.py
manually_derived_edges
process_redirects.sh
redirect_parser.py
run_article_parsing.sh
simple_parse.py
test_article_parser.sh
to_dot.py
walk_till_end.py


see http://matpalm.com/blog/2011/08/13/wikipedia-philosophy

-----

# get data
cd data
wget http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2   # 9gb

# flatten to single line
bzcat enwiki-latest-pages-articles.xml.bz2 | ~/flatten_to_one_page_per_line.py > enwiki-latest-pages-articles.pageperline.xml # 30gb
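
# (the real logic lives in flatten_to_one_page_per_line.py; purely as an illustration,
#  a minimal sketch of the idea is below: buffer each <page>...</page> element and
#  emit it as a single line so later steps can treat one page per record)

    import sys

    buf, in_page = [], False
    for line in sys.stdin:
        line = line.strip()
        if '<page>' in line:
            in_page, buf = True, []
        if in_page:
            buf.append(line)
        if '</page>' in line:
            print(' '.join(buf))   # one page per output line
            in_page = False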

# split into redirects and articles
cat enwiki-latest-pages-articles.pageperline.xml | grep '<redirect />' > enwiki-latest-pages-redirects.xml &
cat enwiki-latest-pages-articles.pageperline.xml | grep -v '<redirect />' > enwiki-latest-pages-articles.xml &
wait

# move xml for articles and redirects into hdfs 
hadoop fs -mkdir /full/articles.xml
hadoop fs -copyFromLocal enwiki-latest-pages-articles.xml /full/articles.xml
hadoop fs -mkdir /full/redirects.xml
hadoop fs -copyFromLocal enwiki-latest-pages-redirects.xml /full/redirects.xml

# parse redirects
cd
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
 -input /full/redirects.xml -output /full/redirects \
 -mapper redirect_parser.py -file redirect_parser.py
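
# (redirect_parser.py is the real mapper; as an illustration only, a sketch of the kind
#  of thing it needs to do is below: for each flattened redirect page emit a tab
#  separated "from to" pair. the regexes, the #REDIRECT handling and the
#  space->underscore normalisation here are assumptions, not the repo's code)

    import sys, re

    title_re  = re.compile(r'<title>(.*?)</title>')
    target_re = re.compile(r'#REDIRECT\s*\[\[([^\]|#]+)', re.IGNORECASE)

    for page in sys.stdin:
        title, target = title_re.search(page), target_re.search(page)
        if title and target:
            src = title.group(1).strip().replace(' ', '_')
            dst = target.group(1).strip().replace(' ', '_')
            print("%s\t%s" % (src, dst))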

# dereference all redirects to their final targets
pig -p INPUT=/full/redirects -p OUTPUT=/full/redirects.dereferenced1 -f dereference_redirects.pig
pig -p INPUT=/full/redirects.dereferenced1 -p OUTPUT=/full/redirects.dereferenced2 -f dereference_redirects.pig
pig -p INPUT=/full/redirects.dereferenced2 -p OUTPUT=/full/redirects.dereferenced3 -f dereference_redirects.pig
pig -p INPUT=/full/redirects.dereferenced3 -p OUTPUT=/full/redirects.dereferenced4 -f dereference_redirects.pig
hadoop fs -mv /full/redirects /full/redirects.original
hadoop fs -mv /full/redirects.dereferenced4 /full/redirects
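
# (repeating the pig join a few times collapses chained redirects, eg A->B, B->C
#  becomes A->C; the actual join lives in dereference_redirects.pig. a non-hadoop
#  sketch of the same end result, assuming from\tto pairs on stdin, illustration only)

    import sys

    redirect = {}
    for line in sys.stdin:
        frm, to = line.rstrip('\n').split('\t')
        redirect[frm] = to

    def resolve(node, limit=10):
        # follow the chain until it leaves the map (limit guards against cycles)
        hops = 0
        while node in redirect and hops < limit:
            node, hops = redirect[node], hops + 1
        return node

    for frm in redirect:
        print("%s\t%s" % (frm, resolve(frm)))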

# run extraction
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
 -input /full/articles.xml -output /full/edges \
 -mapper article_parser.py -file article_parser.py
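
# (article_parser.py does the real first-link extraction; the sketch below only
#  illustrates the flavour of it: take the first [[link]] in the article text that is
#  not inside parentheses. the real rules also have to cope with templates, italics,
#  images, comments etc, none of which are handled here)

    import sys, re

    title_re = re.compile(r'<title>(.*?)</title>')
    text_re  = re.compile(r'<text[^>]*>(.*?)</text>')
    link_re  = re.compile(r'\[\[([^\]|#]+)')

    def strip_parens(s):
        # drop (...) spans; the first-link convention ignores parenthesised links
        out, depth = [], 0
        for ch in s:
            if ch == '(':
                depth += 1
            elif ch == ')':
                depth = max(0, depth - 1)
            elif depth == 0:
                out.append(ch)
        return ''.join(out)

    for page in sys.stdin:
        title, text = title_re.search(page), text_re.search(page)
        if not (title and text):
            continue
        m = link_re.search(strip_parens(text.group(1)))
        if m:
            print("%s\t%s" % (title.group(1).replace(' ', '_'),
                              m.group(1).strip().replace(' ', '_')))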

# run redirects against edges
pig -p INPUT=/full/edges -p OUTPUT=/full/edges.dereferenced -f dereference_redirects.pig

# as a sanity check, dereferencing once more should produce identical output (all redirects already resolved)
# pig -p INPUT=/full/edges.dereferenced -p OUTPUT=/full/edges.dereferenced.sanity -f dereference_redirects.pig 

# get to local filesystem
hadoop fs -cat /full/edges.dereferenced/* > edges
# hadoop fs -cat /full/edges.dereferenced.sanity/* > edges.sanity

# add special cases, the parser will, alas, never be perfect...
cat manually_derived_edges >> edges

# calculate distance from Philosophy
java -Xmx8g -cp distanceToPhilosophy/bin/ DistanceToPhilosophy \
 Philosophy edges \
 >DistanceToPhilosophy.stdout 2>DistanceToPhilosophy.stderr
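
# (the java source is under distanceToPhilosophy/src and needs compiling into
#  distanceToPhilosophy/bin first, eg with javac. the idea is a breadth first search
#  from Philosophy over the reversed edges, so an article's distance is its number of
#  first-link hops to Philosophy. a python sketch of that idea, assuming output lines
#  of the form "FINAL <article>\t<distance>" as the grep/sed/cut below expect,
#  illustration only)

    import sys
    from collections import defaultdict, deque

    incoming = defaultdict(list)                # incoming[b] = articles whose first link is b
    for line in open(sys.argv[2]):              # the edges file: from\tto
        frm, to = line.rstrip('\n').split('\t')
        incoming[to].append(frm)

    start = sys.argv[1]                         # eg Philosophy
    dist, queue = {start: 0}, deque([start])
    while queue:
        node = queue.popleft()
        for prev in incoming[node]:
            if prev not in dist:
                dist[prev] = dist[node] + 1
                queue.append(prev)

    for node, d in dist.items():
        print("FINAL %s\t%d" % (node, d))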

# order nodes by their number of descendants
grep ^FINAL DistanceToPhilosophy.stdout | sed -e 's/FINAL //' > distances
cut -f1 distances > articles
hadoop fs -mkdir /articles_that_led_to_philosophy
hadoop fs -copyFromLocal articles /articles_that_led_to_philosophy
hadoop jar ~/contrib/streaming/hadoop-streaming.jar \
 -input /articles_that_led_to_philosophy -output /num_descendants \
 -mapper 'count_descendants.py edges' -file count_descendants.py -file edges \
 -reducer aggregate
hadoop fs -cat /num_descendants/* | sort -t$'\t' -k2 -nr > descendants.sorted
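
# (count_descendants.py is the real mapper; a guess at its shape is sketched below:
#  for each article that reached Philosophy, walk its first-link chain through the
#  shipped edges file and credit every node passed through with one descendant. the
#  stock streaming "aggregate" reducer then sums the LongValueSum: keys. illustration
#  only, the chain-walking details are assumptions)

    import sys

    edges = {}
    for line in open(sys.argv[1]):              # the edges file shipped with -file edges
        frm, to = line.rstrip('\n').split('\t')
        edges[frm] = to

    for line in sys.stdin:
        node = line.strip()
        seen = set([node])
        while node in edges and edges[node] not in seen:
            node = edges[node]
            seen.add(node)
            print("LongValueSum:%s\t1" % node)  # +1 descendant for each node on the chain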

# draw graph of the top 200
head -n 200 descendants.sorted > descendants.top200
./filter_nodes.py descendants.top200 < edges > filtered.edges.top200
./to_dot.py filtered.edges.top200 descendants.top200 | dot -Tpng > top200.png
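
# (to_dot.py builds the graphviz input; a bare-bones sketch of the same idea is below,
#  illustration only: labelling nodes with the descendant counts file is an assumption)

    import sys

    edges_file, counts_file = sys.argv[1], sys.argv[2]

    print("digraph wiki {")
    for line in open(counts_file):              # descendants.top200: node\tcount
        node, count = line.rstrip('\n').split('\t')
        print('  "%s" [label="%s\\n%s"];' % (node, node, count))
    for line in open(edges_file):               # filtered.edges.top200: from\tto
        frm, to = line.rstrip('\n').split('\t')
        print('  "%s" -> "%s";' % (frm, to))
    print("}")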