
Commit

Transferred URLs to .name. Slight CSS tweaks for comment counts. New posts
Mark Reid committed Jan 20, 2009
1 parent c18f97c commit c1c7d01
Showing 8 changed files with 225 additions and 568 deletions.
49 changes: 31 additions & 18 deletions _scripts/import_comments.rb
@@ -25,6 +25,8 @@
 DB_USER = 'root'
 DB_NAME = 'inductio'
 
+TARGET_URL = 'http://mark.reid.dev/iem/'
+
 # Gets the first forum key associated with USER_KEY
 def forum_key
   forum_list = get('get_forum_list', :user_api_key => USER_KEY)
@@ -53,37 +55,38 @@ def convert(row)
   }
 end
 
+# Remove extraneous paragraph separators. Disqus interprets double
+# newline as paragraphs
 def clean(comment)
-  comment.gsub!(/<\/?p>/,'')
-  comment
+  comment.gsub(/<\/?p>/,'')
 end
 
+# Compute the URL for the given comment based on the DB entry
 def url(row)
   date_path = row[:comment_date_gmt].strftime("%Y/%m/%d")
-  "http://mark.reid.dev/iem/#{row[:post_name]}.html"
+  "#{TARGET_URL}#{row[:post_name]}.html"
 end
 
+# Get the Disqus thread ID for the comment in the DB row
 def thread(row)
-  ident_str = "test-#{row[:post_name]}"
+  ident_str = "#{row[:post_name]}"
   data = {
     :forum_api_key => FORUM_KEY,
     :title => row[:post_title],
     :identifier => ident_str
   }
 
-
-  puts "Getting thread #{ident_str}..."
   response = JSON.parse( DISQUS['thread_by_identifier'].post(data) )
-  unless response['succeeded']
-    raise "Bad response to get thread ID for #{ident_str}"
-  end
+  puts "Set thread [#{ident_str}] title to '#{response['message']['thread']['title']}'"
 
-  # puts response.to_yaml
-  # puts "--- (end thread response)"
+  raise "Bad response to get thread ID for #{ident_str}" unless response['succeeded']
+
+  puts "Thread [#{ident_str}] has title '#{response['message']['thread']['title']}'"
 
   response['message']['thread']['id']
 end
 
+# Set the URL of the Disqus thread to the given value
 def update(thread_id, url)
   data = {
     :forum_api_key => FORUM_KEY,
@@ -95,9 +98,11 @@ def update(thread_id, url)
   response = JSON.parse( DISQUS['update_thread'].post(data) )
 end
 
-# Converts and sends a comment from the DB to Disqus with the given thread ID
 @unconverted = []
 @threads = {}
+# Converts and sends a comment from the DB to Disqus with the given thread ID
+# Failed conversions are stored in @unconverted and the thread_id to URL mapping
+# in @threads is updated
 def post(row, thread_id)
   data = convert(row)
   data[:forum_api_key] = FORUM_KEY
@@ -111,28 +116,36 @@ def post(row, thread_id)
     @threads[thread_id] = url(row)
   else
     puts "\tWARNING: Could not post comment by #{data[:author_name]} on #{data[:created_at]}"
-    puts data.to_yaml + "\n---"
     @unconverted << data
   end
 end
 
 
 # Processing begins here...
 DB_PASS = ENV['DB_PASS']
-DB = Sequel.mysql(DB_NAME, :user=>DB_USER, :password=>DB_PASS, :host=>'localhost')
+DB = Sequel.mysql(DB_NAME,
+  :user=>DB_USER, :password=>DB_PASS, :host=>'localhost', :encoding => 'utf8'
+)
 
 USER_KEY = ENV['DISQUS_KEY']
 FORUM_KEY = forum_key
 
-LIMIT = "limit 10"
-QUERY = "select * from wp_comments, wp_posts where wp_comments.comment_post_ID = wp_posts.ID and comment_type != 'pingback' #{LIMIT}"
+QUERY = "select * from wp_comments, wp_posts where wp_comments.comment_post_ID = wp_posts.ID and comment_type != 'pingback'"
 DB[QUERY].each do |row|
   puts "Processing #{row[:comment_type]} comment #{row[:comment_ID]}..."
   thread_id = thread(row)
   post(row, thread_id)
 end
 
-puts "Number of failures: #{@unconverted.length}"
+# Update all of the threads with the correct URL
 @threads.each do |tid,url|
   update(tid,url)
 end
+
+# Print unconverted data to STDOUT as YAML
+puts "Number of failures: #{@unconverted.length}"
+puts "\n\n***UNCONVERTED POSTS***"
+@unconverted.each do |data|
+  puts data.to_yaml
+  puts "***"
+end
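The switch from `gsub!` to `gsub` in `clean` is worth a note: `gsub!` mutates the receiver and returns `nil` when nothing matched, which is why the old version needed a trailing `comment` line, while `gsub` always returns the (possibly unchanged) string. A standalone sketch, with made-up sample comment text:

```ruby
# Strip <p> and </p> tags the way the updated clean() does.
# gsub returns the modified copy (or the original string when no tag is
# present), so no separate return expression is needed.
def clean(comment)
  comment.gsub(/<\/?p>/, '')
end

clean("<p>Hi</p> there")  # => "Hi there"
puts clean("<p>First paragraph</p>\n\n<p>Second</p>")
```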
78 changes: 78 additions & 0 deletions _scripts/transfer_urls.rb
@@ -0,0 +1,78 @@
require 'rubygems'
require 'rest_client'
require 'json'

DISQUS_BASE = 'http://disqus.com/api/'
DISQUS = RestClient::Resource.new DISQUS_BASE

SOURCE_URL = 'http://mark.reid.dev/iem/'
TARGET_URL = 'http://mark.reid.name/iem/'

THREADS = {
10211725 => 'http://mark.reid.name/iem/behold-jensens-inequality.html',
10211748 => 'http://mark.reid.name/iem/feed-bag-a-simple-rss-archiver.html',
10211737 => 'http://mark.reid.name/iem/visualising-reading.html',
10211738 => 'http://mark.reid.name/iem/snuck-flied-and-wedded.html',
10211739 => 'http://mark.reid.name/iem/super-crunchers.html',
10211728 => 'http://mark.reid.name/iem/colt-2008-highlights.html',
10211784 => 'http://mark.reid.name/iem/staying-organised-with-citeulike-and-bibdesk.html',
10211740 => 'http://mark.reid.name/iem/constructive-and-classical-mathematics.html',
10211730 => 'http://mark.reid.name/iem/the-earth-is-round.html',
10211753 => 'http://mark.reid.name/iem/information-divergence-and-risk.html',
10211742 => 'http://mark.reid.name/iem/ml-and-stats-people-on-twitter.html',
10211720 => 'http://mark.reid.name/iem/a-meta-index-of-data-sets.html',
10211710 => 'http://mark.reid.name/iem/introducing-inductio-ex-machina.html',
10211755 => 'http://mark.reid.name/iem/artificial-ai.html',
10211733 => 'http://mark.reid.name/iem/machine-learning-summer-school-2009.html',
10211711 => 'http://mark.reid.name/iem/clarity-and-mathematics.html',
10211713 => 'http://mark.reid.name/iem/a-cute-convexity-result.html',
}

# Gets the first forum key associated with USER_KEY
def forum_key
forum_list = get('get_forum_list', :user_api_key => USER_KEY)
forum_id = forum_list[0]['id']
get('get_forum_api_key', :user_api_key => USER_KEY, :forum_id => forum_id)
end

# Encapsulates request, JSON parsing and error checking a REST call to Disqus
def get(command, args)
path = command + '?' + args.map {|k,v| "#{k}=#{v}"}.join('&')
response = JSON.parse( DISQUS[path].get )
raise "Bad response to #{path}" unless response['succeeded']
response['message']
end

# Fetch the list of threads in the forum
def threads
  get('get_thread_list', :forum_api_key => FORUM_KEY)
end

# Set the URL of the Disqus thread to the given value
def update(thread_id, url)
data = {
:forum_api_key => FORUM_KEY,
:thread_id => thread_id,
:url => url
}

puts "Updating thread #{thread_id} with URL = #{url}"
response = JSON.parse( DISQUS['update_thread'].post(data) )
end

USER_KEY = ENV['DISQUS_KEY']
FORUM_KEY = forum_key

# Set the new URLs
# threads.each do |t|
# url = THREADS[t['id'].to_i]
# next if url.nil?
# update(t['id'], url)
# puts "Set thread #{t['id']} to #{url}"
# end

# Check everything worked
threads.each do |t|
url = THREADS[t['id'].to_i]
next if url.nil?
puts "Thread #{t['id']} has #{url}"
end
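The `get` helper above interpolates argument values straight into the query string. A minimal sketch of the path it builds, using made-up key and forum-id values (real values containing `&`, `=` or spaces would need URI encoding first):

```ruby
# Reconstruct the query path the way the get() helper does, without
# making an HTTP call. The API key and forum id are fake sample values.
args = { :user_api_key => 'abc123', :forum_id => 42 }
path = 'get_forum_list' + '?' + args.map { |k, v| "#{k}=#{v}" }.join('&')
puts path  # => get_forum_list?user_api_key=abc123&forum_id=42
```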
1 change: 1 addition & 0 deletions css/screen.css
@@ -96,6 +96,7 @@ hr {
 .left.inset { margin-left: 0 !important; }
 
 .list .title { font-weight: bold; }
+.comments { font-size: smaller; display: block; float: right; color: silver;}
 .excerpt { color: black; }
 
 blockquote {
37 changes: 37 additions & 0 deletions iem/_posts/2009-01-06-information-divergence-and-risk.markdown
@@ -0,0 +1,37 @@
---
layout: post

title: Information, Divergence and Risk for Binary Experiments
excerpt: A summary of a recent paper Bob and I posted to arXiv.
location: Canberra, Australia

wordpress_url: http://conflate.net/inductio/?p=175
wordpress_id: 175
---
[Bob Williamson][bob] and I have finished a [report][] outlining what we have been looking at for the last year or so and uploaded it to the arXiv. Weighing in at 89 pages, it covers a lot of ground in an attempt to unify a number of different classes of measures for problems that can be expressed as binary experiments, that is, where instances are drawn from two distributions. These include binary classification, class probability estimation, and hypothesis testing.

We show that many of the usual measures of difficulty for these problems — divergence, information and Bayes risk — are very closely related. We also look at ways in which members of each class of measure can be expressed in terms of "primitive" members of those classes. In particular, Fisher-consistent losses (also known as proper scoring rules) can be written as weighted sums of cost-sensitive losses, while all f-divergences can be written as weighted sums of something akin to cost-sensitive variational divergence. These "Choquet representations" make it easy to derive Pinsker-like bounds for arbitrary f-divergences (not just KL divergence) as well as results similar to those of Bartlett et al. in their "[Convexity, classification and Risk Bounds][bartlett]".
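Schematically, and in notation of my own choosing rather than the paper's, these weighted-sum representations have the shape:

```latex
% Proper (Fisher-consistent) losses as weighted cost-sensitive losses,
% and f-divergences as weighted variational-type divergences; the weight
% functions w and \gamma_f are determined by the loss and by f respectively.
L(\eta, \hat{\eta}) = \int_0^1 L_c(\eta, \hat{\eta})\, w(c)\, dc ,
\qquad
I_f(P, Q) = \int_0^1 V_\pi(P, Q)\, \gamma_f(\pi)\, d\pi
```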

It should be made clear that many of these results are not new. However, what I like about our approach is that almost all of the results in the paper stem from two observations about convex functions: they are invariant under the Legendre-Fenchel bidual, and they have a second-order integral Taylor expansion with non-negative weights.
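For reference, the two facts being leaned on are, for a suitably regular convex function f:

```latex
% Biduality: a closed convex function is its own Legendre-Fenchel biconjugate.
f^{**} = f
% Taylor expansion with integral remainder; convexity makes the
% weights non-negative, since f''(t) >= 0.
f(x) = f(a) + f'(a)(x - a) + \int_a^x (x - t)\, f''(t)\, dt
```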

If any of this sounds interesting, you should grab the full paper from the [arXiv][report]. Here's the abstract:

> We unify f-divergences, Bregman divergences, surrogate loss bounds (regret bounds),
> proper scoring rules, matching losses, cost curves, ROC-curves and information. We
> do this by systematically studying integral and variational representations of these
> objects and in so doing identify their primitives which all are related to cost-sensitive
> binary classification. As well as clarifying relationships between generative and
> discriminative views of learning, the new machinery leads to tight and more general
> surrogate loss bounds and generalised Pinsker inequalities relating f-divergences to
> variational divergence. The new viewpoint illuminates existing algorithms: it provides a
> new derivation of Support Vector Machines in terms of divergences and relates
> Maximum Mean Discrepancy to Fisher Linear Discriminants. It also suggests new
> techniques for estimating f-divergences.

Now that we have a good understanding of binary experiments the aim is to build on these results and extend this type of work to other forms of machine learning problems. High on the list are multi-category classification, ranking and regression problems.

Questions, criticism, suggestions and pointers to related work we may have missed are all welcome.

[bartlett]: http://www.citeulike.org/user/mdreid/article/510440
[report]: http://arxiv.org/abs/0901.0356
[bob]: http://axiom.anu.edu.au/~williams/
78 changes: 78 additions & 0 deletions iem/_posts/2009-01-16-ml-and-stats-people-on-twitter.markdown
@@ -0,0 +1,78 @@
---
layout: post

title: ML and Stats People on Twitter
excerpt: Wherein I compile a list of interesting people who use Twitter to discuss machine learning and statistics.
location: Canberra, Australia

wordpress_url: http://conflate.net/inductio/?p=171
wordpress_id: 171
---
I started using the social "micro-blogging" service [Twitter][] in February this year simply because I had been seeing so much commentary about it — both good and bad. Since then, I've posted [800+ updates][me], amassed over 100 [followers][], and [follow][] nearly that many myself.

[twitter]: http://twitter.com/
[me]: http://twitter.com/mdreid/
[follow]: http://twitter.com/mdreid/friends
[followers]: http://twitter.com/mdreid/followers

What has surprised me about Twitter is how many people I have found on there who are active, or at least interested, in machine learning and statistics. The day-to-day discussions, questions, advice and pointers I've got via Twitter have been illuminating and fun.

In an effort to get to know some of these people a bit better I followed the links they provided in their respective profiles to see what they had to say about themselves. The descriptions below are based only on those links as I don't find Google-stalking very friendly.

So, in no particular order, here they are:

Students
----------
* [Tim Danford](http://twitter.com/arthegall)
A computer science [Ph.D. student at MIT](http://people.csail.mit.edu/tdanford/)

* [Mark James Adams](http://twitter.com/mja)
"[I am a student of quantitative genetics and a temperamental psychologist](http://affinity.raysend.com/record/about/author)"

* [Dave Warde-Farley](http://twitter.com/dwf)
[Computer science Masters student at Toronto](http://www.cs.toronto.edu/~dwf/) working in machine learning

* [Amir massoud Farahmand](http://twitter.com/SoloGen)
Ph.D. student looking at manifold learning (amongst other things) at the [University of Alberta](http://www.cs.ualberta.ca/~amir/). Runs the blog [thesilog](http://thesilog.sologen.net/).

* [Markus Weimer](http://twitter.com/markusweimer)
Graduate student working on "[applications of machine learning to eLearning](http://weimo.de/about)". Also runs a [blog](http://weimo.de/)

* [Ryan Rosario](http://twitter.com/DataJunkie)
Statistics and computer science graduate student.

* [A.M. Santos](http://twitter.com/ansate)
Maths and statistics graduate student.

Non-students
---------------
* [Neal Richter](http://twitter.com/nealrichter)
Runs the blog [aicoder](http://aicoder.blogspot.com/)

* [Brendan O'Connor](http://twitter.com/brendan642)
[Research assistant](http://anyall.org/) in NLP at Stanford and consultant at [Dolores Labs](http://blog.doloreslabs.com/)

* [Daniel Tunkelang](http://twitter.com/dtunkelang)
Chief scientist at the information retrieval company Endeca and owner of the blog [The Noisy Channel](http://thenoisychannel.com/)

* [Jason Adams](http://twitter.com/ealdent)
Computational linguist working on sentiment analysis. Runs the blog [The Mendicant Bug](http://mendicantbug.com/).

* [Mikio Braun](http://twitter.com/mikiobraun)
Post-doc at Technische Universität Berlin and a machine learning blogger at [Marginally Interesting](http://mikiobraun.blogspot.com/).

* [Daniel Lemire](http://twitter.com/lemire)
Professor of computer science at the University of Quebec at Montreal and [blogger](http://www.daniel-lemire.com/blog/).

* [Jason H. Moore](http://twitter.com/moorejh)
Professor of Genetics, Director of Bioinformatics at Dartmouth Medical School. Works on the [Multi-factor Dimensionality Reduction](http://sourceforge.net/projects/mdr/) software MDR and blogs at [Epistasis](http://compgen.blogspot.com/).

* [Pete Skomoroch](http://twitter.com/peteskomoroch)
Director of analytics at Juice Analytics and [Data Wrangling](http://www.datawrangling.com/) blogger.

* [Alex Smola](http://twitter.com/smolix)
Principal Researcher at Yahoo! Research and ex-colleague of mine at [NICTA](http://nicta.com.au) and the [ANU](http://anu.edu.au), a.k.a. "Mr. Kernel".

If you are not on this list but think you should be, leave a comment below and I'll update this list. Conversely, if I've put you on this list and you don't wish to be associated with these sorts of people, leave a comment or send me an email and I'll remove you.

Of course, feel free to follow [me][] if you'd like to keep up with what I'm doing.

0 comments on commit c1c7d01
