In [1]:
#!/usr/bin/env python3
# -*- python -*-

import csv
import html as parser
from html.parser import HTMLParser
from xml.etree import ElementTree

import numpy as np
import pandas as pd

try:
    from xml.etree import cElementTree
except ImportError:  # cElementTree was removed in Python 3.9
    cElementTree = ElementTree

class MLStripper(HTMLParser):
    """HTML parser that keeps the text of a document and drops the markup.

    Text found between tags is collected in ``self.fed``.  For ``<a>`` tags
    the value of every ``href`` attribute, followed by a single space, is
    collected as well, so the link target appears in front of the link text.
    Character references are decoded by the parser itself
    (``convert_charrefs=True``).
    """

    def __init__(self):
        # Initialise through the base class (which calls reset() itself)
        # rather than relying on reset() alone to create the parser state.
        super().__init__(convert_charrefs=True)
        # Kept so the attribute still exists on the instance.
        self.strict = False
        # Collected text fragments, in document order.
        self.fed = []

    def handle_data(self, d):
        """Collect a chunk of text found between tags."""
        self.fed.append(d)

    def handle_starttag(self, tag, attrs):
        """Collect the ``href`` values of ``<a>`` tags; ignore other tags."""
        if tag == 'a':
            # An attribute written without a value (``<a href>``) is reported
            # with value None; skip it instead of failing on ``None + ' '``.
            self.fed.extend(
                value + ' '
                for name, value in attrs
                if name == 'href' and value is not None
            )

    def get_data(self):
        """Return everything collected so far as one string."""
        return ''.join(self.fed)

def strip_tags(html):
    """
    Return the plain text of the HTML string HTML.

    Tags are removed, character references are decoded and the href of
    every link is kept in front of the link text (see MLStripper).
    """
    # The input is deliberately NOT unescaped before parsing: MLStripper
    # already decodes character references, and unescaping first would turn
    # escaped literals such as "&lt;b&gt;" into real tags that then get
    # stripped together with their meaning.
    s = MLStripper()
    s.feed(html)
    # close() flushes text the parser is still holding back; without it a
    # trailing fragment containing an unterminated "&" (e.g. "AT&T") is lost.
    s.close()
    return s.get_data()

# Copy/paste from the help section on SEDE
# http://data.stackexchange.com/stackoverflow/query/new
def get_schema():
    """
    Return the Posts schema as an ordered mapping column name -> dtype.

    The key order is the column order used for the CSV file.  Text and date
    columns use the builtin ``str`` and the signed score uses the builtin
    ``int``: the ``np.str`` / ``np.int`` aliases for these builtins were
    removed from NumPy in 1.24 and raise AttributeError there.
    """
    schema = {
        'Id':                    np.uint,
        'PostTypeId':            np.uint8,
        'AcceptedAnswerId':      np.uint,
        'ParentId':              np.uint,
        'CreationDate':          str,
        'DeletionDate':          str,
        'Score':                 int,
        'ViewCount':             np.uint,
        'Body':                  str,
        'OwnerUserId':           np.uint,
        'OwnerDisplayName':      str,
        'LastEditorUserId':      np.uint,
        'LastEditorDisplayName': str,
        'LastEditDate':          str,
        'LastActivityDate':      str,
        'Title':                 str,
        'Tags':                  str,
        'AnswerCount':           np.uint,
        'CommentCount':          np.uint,
        'FavoriteCount':         np.uint,
        'ClosedDate':            str,
        'CommunityOwnedDate':    str,
    }
    return schema


def dump(fh):
    """
    Read a stream from FH, yield next Post row

    FH is parsed incrementally as XML.  For every ``row`` element one list
    of strings is yielded, holding the element's attributes in the column
    order of get_schema().  Attributes missing from the element are filled
    with a placeholder: '0' for integer columns, the epoch timestamp for
    columns whose name contains "date", and 'EMP' for everything else.
    """
    schema = get_schema()
    # Builtin int is listed instead of the np.int alias (removed in
    # NumPy 1.24); on older NumPy the alias was the very same object.
    integer_dtypes = (int, np.uint, np.uint8)
    for _event, elem in ElementTree.iterparse(fh):
        if elem.tag != 'row':
            continue
        result = []
        for key, dtype in schema.items():
            value = elem.attrib.get(key)
            if value is None:
                if dtype in integer_dtypes:
                    value = '0'
                elif 'date' in key.lower():
                    value = '1970-01-01T00:00:00.000'
                else:
                    value = 'EMP'
            result.append(value)
        yield result
        # Release the element's content once the consumer is done with it,
        # so memory does not grow with the size of the dump.
        elem.clear()

def parse_to_csv(infile='data/Posts.xml', outfile='data/posts.csv'):
    """
    Convert the Posts XML dump INFILE into the CSV file OUTFILE.

    The first CSV row holds the column names from get_schema(); every
    following row is one post as produced by dump().  The number of posts
    written is printed when done.
    """
    cnt = 0
    # Both files are managed by the with-statement so the CSV is flushed and
    # closed on exit.  newline='' is what the csv module asks for: it keeps
    # the writer's own line endings and newlines embedded in fields intact.
    with open(infile, encoding='utf-8') as src, \
            open(outfile, 'w', encoding='utf-8', newline='') as dst:
        writer = csv.writer(dst)
        # write cols names first
        writer.writerow(get_schema().keys())
        for line in dump(src):
            writer.writerow(line)
            cnt += 1

    print('Processed {} lines'.format(cnt))
    
def read_data(infile='data/posts.csv'):
    """
    Load the posts CSV INFILE into a DataFrame typed according to get_schema().
    """
    # na_filter=False keeps every field exactly as written.  With the default
    # NA detection, empty strings and literal texts such as "NA" or "null"
    # are read as NaN and the cast to str below turns them into "nan".
    df = pd.read_csv(infile, encoding='utf-8', na_filter=False)
    df = df.astype(dtype=get_schema())
    return df

def parse_data(df):
    """
    Replace the HTML in the 'Body' column of DF with plain text.

    Every body is passed through strip_tags.  The column is overwritten on
    DF itself and that same frame is returned.
    """
    plain_bodies = df['Body'].apply(strip_tags)
    df['Body'] = plain_bodies
    return df

In [2]:
# Convert the XML dump to CSV with the default paths
# (data/Posts.xml -> data/posts.csv); prints how many rows were processed.
parse_to_csv()

Processed 32830 lines


In [3]:
# Load the CSV (default path data/posts.csv) with the schema dtypes applied.
raw_df = read_data()
# Quick check of which columns were loaded.
print(raw_df.columns)

Index(['Id', 'PostTypeId', 'AcceptedAnswerId', 'ParentId', 'CreationDate',
       'DeletionDate', 'Score', 'ViewCount', 'Body', 'OwnerUserId',
       'OwnerDisplayName', 'LastEditorUserId', 'LastEditorDisplayName',
       'LastEditDate', 'LastActivityDate', 'Title', 'Tags', 'AnswerCount',
       'CommentCount', 'FavoriteCount', 'ClosedDate', 'CommunityOwnedDate'],
      dtype='object')


In [4]:
# Work on a copy of raw_df: parse_data() overwrites 'Body' in place and the
# title is prepended below, so running this cell again on raw_df itself
# would prefix every title a second time.
df = parse_data(raw_df.copy())
# 'EMP' is the placeholder dump() writes for a missing text attribute, so
# only posts that actually carry a title get it put in front of the body.
has_title = df['Title'] != 'EMP'
df.loc[has_title, 'Body'] = df['Title'] + '. ' + df['Body']
df = df.replace('EMP', '')
df = df[['Id', 'PostTypeId', 'ParentId', 'AcceptedAnswerId', 'Score', 'Body']]
df.iloc[1:10,:]

Unnamed: 0,Id,PostTypeId,ParentId,AcceptedAnswerId,Score,Body
1,3,1,0,31,6,"If you kill someone who is committing suicide,..."
2,4,2,1,0,109,"""http://en.wikipedia.org/wiki/Intuitionism Int..."
3,6,1,0,13,20,Is atheism a requirement for a consistent exis...
4,8,1,0,0,6,"If we value free agency, how can punishment - ..."
5,9,2,6,0,7,An existentialist philosophy is nothing more t...
6,10,2,1,0,6,Mathematics is an abstraction. As such it is i...
7,11,2,6,0,7,"God is negotiable under existential tenets, as..."
8,12,1,0,0,16,What's the difference between Randian philosop...
9,13,2,6,0,16,Perhaps a better question to ask is the invers...


In [5]:
# Split the posts into questions (PostTypeId 1) and answers (PostTypeId 2)
# so that questions can be matched with answers in the next step.
is_question = df['PostTypeId'] == 1
is_answer = df['PostTypeId'] == 2

question_no_answer = df.loc[is_question & (df['AcceptedAnswerId'] == 0)]
question_with_answer = df.loc[is_question & (df['AcceptedAnswerId'] > 0)]

# Note: this holds every answer whose question has an accepted answer, not
# only the accepted ones.
answers_to_accepted = df['ParentId'].isin(question_with_answer['Id'])
accepted_answer = df.loc[is_answer & answers_to_accepted]

# All remaining answers, i.e. those to questions without an accepted answer.
answer = df.loc[is_answer & ~df.index.isin(accepted_answer.index)]

# Disabled alternative: keep only the top-scored answer per question.
# answer = answer.sort_values(['ParentId', 'Score'], ascending=[True,False]).groupby(['ParentId']).first()
# answer = answer.reset_index()

In [6]:
# Attach the accepted answer to each question that has one; columns coming
# from the question get the suffix _q, those from the answer _a.
question_with_answer = question_with_answer.merge(
    accepted_answer,
    how='left',
    left_on='AcceptedAnswerId',
    right_on='Id',
    suffixes=['_q', '_a'],
)
# Questions without an accepted answer are joined with every answer posted
# to them (left join, so questions without any answer are kept as well).
question_no_answer = question_no_answer.merge(
    answer,
    how='left',
    left_on='Id',
    right_on='ParentId',
    suffixes=['_q', '_a'],
)

In [7]:
# Stack both question/answer tables and keep only the two text columns.
all_pairs = pd.concat([question_with_answer, question_no_answer])
final_df = all_pairs[['Body_q', 'Body_a']]
final_df.head()

Unnamed: 0,Body_q,Body_a
0,Was mathematics invented or discovered?. What ...,"""http://en.wikipedia.org/wiki/Intuitionism Int..."
1,"If you kill someone who is committing suicide,...",An autonomy/consent perspective strongly disti...
2,Is atheism a requirement for a consistent exis...,Perhaps a better question to ask is the invers...
3,"What is the difference between ""necessary"" and...","The difference between ""necessary"" and ""suffic..."
4,What is Philosophy?. What is a comprehensive d...,Philosophy is the practice of discovering new ...


In [26]:
# just save everything in one big text file
with open('data/posts.txt', 'w', encoding='utf-8') as outfile:
    for bq, ba in zip(final_df['Body_q'], final_df['Body_a']):
        outfile.write(u'{}'.format(bq))
        # The left merges leave Body_a missing (NaN) for questions without
        # a matching answer; skip it instead of writing the text "nan".
        if pd.notna(ba):
            outfile.write(u'{}'.format(ba))