In [1]:
from datetime import datetime
import lxml.etree as ET

def parse_post(row):
  """Parse one ``<row ... />`` line of a posts XML dump into a ``Post``.

  Example input (wrapped here for readability; the real input is one line):

    <row Id="1" PostTypeId="1" AcceptedAnswerId="15"
    CreationDate="2010-07-19T19:12:12.510" Score="19" ViewCount="1033"
    Body="&lt;p&gt;How should I elicit prior distributions from experts when
    fitting a Bayesian model?&lt;/p&gt;&#xA;" OwnerUserId="8"
    LastActivityDate="2010-09-15T21:08:26.077" Title="Eliciting priors from
    experts" Tags="&lt;bayesian&gt;&lt;prior&gt;&lt;elicitation&gt;"
    AnswerCount="5" CommentCount="1" FavoriteCount="11" />

  Args:
    row: One line of text. Surrounding whitespace is ignored.

  Returns:
    A ``Post`` built from the element's attributes, or ``None`` when the line
    is not a self-closing ``<row`` element or cannot be parsed as XML.
    Missing attributes are passed to ``Post`` as ``None`` (``""`` for ``Body``
    and ``Title``); ``tags`` is a list of tag names and is empty when the row
    has no ``Tags`` attribute.
  """
  try:
    # Tolerate any indentation and trailing whitespace (e.g. a stray "\r").
    line = row.strip()
    if not (line.startswith("<row ") and line.endswith("/>")):
      return None
    attrs = ET.fromstring(line).attrib
  except Exception:
    # Best effort: a malformed or non-string line is skipped rather than
    # failing the whole job. ``Exception`` (not a bare ``except``) is used so
    # KeyboardInterrupt and SystemExit still propagate.
    return None

  # Tags are stored as "<a><b><c>". Splitting an empty string would yield
  # [''], so rows without tags are mapped to an empty list explicitly.
  raw_tags = attrs.get('Tags', "").strip("><")
  tags = raw_tags.split("><") if raw_tags else []

  return Post(attrs.get('Id'),
              attrs.get('PostTypeId'),
              attrs.get('AcceptedAnswerId'),
              attrs.get('CreationDate'),
              attrs.get('Score'),
              attrs.get('ViewCount'),
              attrs.get('Body', ""),
              attrs.get('OwnerUserId'),
              attrs.get('LastActivityDate'),
              attrs.get('Title', ""),
              tags,
              attrs.get('AnswerCount'),
              attrs.get('CommentCount'),
              attrs.get('FavoriteCount'),
             )

class Post(object):
  """Typed record holding the attributes of a single post row.

  Constructor arguments are the raw attribute values (strings or ``None``).
  Identifier, score and count fields are converted with ``process_int``;
  the two date fields are converted with ``process_time``. ``body``,
  ``title`` and ``tags`` are stored exactly as given. A value that cannot be
  converted is stored as ``None`` instead of raising.
  """

  # Timestamp layout accepted by process_time, e.g. "2010-07-19T19:12:12.510".
  TIME_FORMAT = "%Y-%m-%dT%H:%M:%S.%f"

  def __init__(self, post_id, post_type_id, accepted_answer_id, creation_date,
               score, view_count, body, owner_user_id, last_activity_date,
               title, tags, answer_count, comment_count, favorite_count):
    self.post_id = self.process_int(post_id)
    self.post_type_id = self.process_int(post_type_id)
    self.accepted_answer_id = self.process_int(accepted_answer_id)
    self.creation_date = self.process_time(creation_date)
    self.score = self.process_int(score)
    self.view_count = self.process_int(view_count)
    self.body = body
    self.owner_user_id = self.process_int(owner_user_id)
    self.last_activity_date = self.process_time(last_activity_date)
    self.title = title
    self.tags = tags
    self.answer_count = self.process_int(answer_count)
    self.comment_count = self.process_int(comment_count)
    self.favorite_count = self.process_int(favorite_count)

  def process_int(self, field):
    """Return ``int(field)``, or ``None`` if ``field`` is missing or not an integer."""
    try:
      return int(field)
    except (TypeError, ValueError, OverflowError):
      # TypeError: field is None / wrong type; ValueError: non-numeric text.
      # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
      return None

  def process_time(self, field):
    """Return ``field`` parsed with ``TIME_FORMAT``, or ``None`` if it does not match."""
    try:
      return datetime.strptime(field, self.TIME_FORMAT)
    except (TypeError, ValueError):
      # TypeError: field is None / not a string; ValueError: layout mismatch.
      return None


In [2]:
from pyspark import SparkContext
# Local Spark context using all available cores; "temp" is the application name.
sc = SparkContext("local[*]", "temp")
# Parenthesised so the line is valid under both Python 2 and Python 3.
print(sc.version)  # should be >= 1.5.1 for distributed matrices

1.5.1


In [3]:
import os
def localpath(path):
    """Return a ``file://`` URL for ``path`` relative to the current working directory."""
    working_dir = str(os.path.abspath(os.curdir))
    return ''.join(['file://', working_dir, '/', path])

In [19]:
# Read the raw part files line by line and keep only the lines that
# parse_post turns into a Post (it returns None for everything else).
lines = sc.textFile(localpath('part-*'))
post = lines.map(parse_post).filter(lambda parsed: parsed is not None)

In [20]:
# Number of records in `post`, i.e. the lines that parsed successfully.
post.count()

108741

In [29]:
# Preview the titles of the first ten parsed posts; rows that carry no
# Title attribute show up as '' (the default used by parse_post).
post.map(lambda p: p.title).take(10)

['',
 '',
 '',
 'Are two empirically estimated Markov chains statistically different?',
 '',
 'Probability of winning a tournament',
 'Is there a way to use cross validation to do variable/feature selection in R?',
 '',
 '',
 '']