# StackOverflow answer classifier

In [1]:
import re
import os
from html.parser import HTMLParser
from time import time
from xml.etree import ElementTree as etree
from xml.etree.ElementTree import Element

import numpy as np
import pandas as pd

In [2]:
class HTML2String(HTMLParser):
    """HTML parser that keeps only the text found between tags.

    Every chunk passed to ``handle_data`` is collected in ``self.text``;
    ``get_data`` joins the chunks into a single string.
    """

    def __init__(self):
        super().__init__()
        # NOTE(review): ``strict`` looks like a leftover from older
        # HTMLParser versions; kept so instances expose the same attributes.
        self.strict = False
        self.convert_charrefs = True
        self.text = []

    def handle_data(self, d):
        """Store one chunk of text reported by the parser."""
        self.text.append(d)

    def get_data(self):
        """Return all collected text chunks concatenated in arrival order."""
        return ''.join(self.text)


def html_to_string(html):
    """Return the plain text of *html* with all tags removed.

    Args:
        html: Markup string to convert.

    Returns:
        The concatenated text content of *html*.
    """
    s = HTML2String()
    s.feed(html)
    # feed() alone may hold back trailing text (for example when it ends with
    # an unterminated '&'), waiting for more input. close() tells the parser
    # the document is complete so that buffered text is flushed.
    s.close()
    return s.get_data()

def count_code_lines(html):
    """Count the non-empty text lines inside the ``<code>`` spans of *html*.

    Each ``<code>...</code>`` span is converted to plain text and split on
    newlines; only lines with at least one character are counted.
    """
    total = 0
    for snippet in re.findall(r'<code>.*?</code>', html, re.DOTALL):
        snippet_text = html_to_string(snippet)
        total += sum(1 for line in snippet_text.split('\n') if len(line) > 0)
    return total

def remove_code_from_html(html):
    """Return *html* with every ``<code>...</code>`` span deleted.

    The spans are located in the original string, then each matched span is
    removed from the working copy wherever it occurs.
    """
    stripped = html
    for match in re.finditer(r'<code>.*?</code>', html, flags=re.DOTALL):
        stripped = stripped.replace(match.group(0), '')
    return stripped

In [3]:
# Site root from which every record URL is built.
url_base = 'http://stackoverflow.com/'
# Questions and answers share the same 'questions/' prefix.
url_base_q = url_base_a = url_base + 'questions/'
url_base_u = url_base + 'users/'

In [4]:
# Column layouts of the three tables filled while parsing the XML dumps.
qs_columns = [
    'id', 'author_id', 'date', 'title', 'text', 'score', 'view_count',
    'answer_count', 'comment_count', 'code_line_count', 'url',
]
as_columns = [
    'id', 'author_id', 'question_id', 'date', 'text', 'score', 'accepted',
    'comment_count', 'code_line_count', 'url',
]
us_columns = [
    'id', 'date', 'name', 'reputation', 'total_question_count',
    'answered_question_count', 'total_answer_count',
    'accepted_answer_count', 'url',
]

# Start with empty frames; rows are added one at a time by the add_* helpers.
questions_df, answers_df, users_df = (
    pd.DataFrame(columns=layout)
    for layout in (qs_columns, as_columns, us_columns)
)

In [5]:
def add_question(question):
    """Append one question to the global ``questions_df``.

    Args:
        question: Object exposing ``get(name)`` for the attributes ``Id``,
            ``OwnerUserId``, ``OwnerDisplayName``, ``CreationDate``,
            ``Title``, ``Body``, ``Score``, ``ViewCount``, ``AnswerCount``
            and ``CommentCount`` (the loops in this file pass XML elements).

    The stored text is the body with ``<code>`` spans removed and tags
    stripped; the code lines are counted separately.
    """
    global questions_df

    q_id = question.get('Id')
    author_id = question.get('OwnerUserId')
    if author_id is None:
        # Same fallback the ingestion loop uses when registering the author
        # in users_df, so this row can later be joined with its user.
        author_id = question.get('OwnerDisplayName')
    date = question.get('CreationDate')
    title = question.get('Title')
    body = question.get('Body')
    text = html_to_string(remove_code_from_html(body))
    score = int(question.get('Score'))
    view_count = int(question.get('ViewCount'))
    answer_count = int(question.get('AnswerCount'))
    comment_count = int(question.get('CommentCount'))
    code_line_count = count_code_lines(body)
    url = url_base_q + q_id

    info = [q_id, author_id, date, title, text, score, view_count,
            answer_count, comment_count, code_line_count, url]
    q_df = pd.DataFrame([info], columns=qs_columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add the row and works on older versions as well.
    questions_df = pd.concat([questions_df, q_df], ignore_index=True)

def add_answer(answer, accepted=False):
    """Append one answer to the global ``answers_df``.

    Args:
        answer: Object exposing ``get(name)`` for the attributes ``Id``,
            ``OwnerUserId``, ``OwnerDisplayName``, ``ParentId``,
            ``CreationDate``, ``Body``, ``Score`` and ``CommentCount``
            (the loops in this file pass XML elements).
        accepted: Whether this answer is the accepted one for its question.

    The stored text is the body with ``<code>`` spans removed and tags
    stripped; the code lines are counted separately.
    """
    global answers_df

    a_id = answer.get('Id')
    author_id = answer.get('OwnerUserId')
    if author_id is None:
        # Same fallback the ingestion loop uses when registering the author
        # in users_df, so this row can later be joined with its user.
        author_id = answer.get('OwnerDisplayName')
    question_id = answer.get('ParentId')
    date = answer.get('CreationDate')
    body = answer.get('Body')
    text = html_to_string(remove_code_from_html(body))
    score = int(answer.get('Score'))
    comment_count = int(answer.get('CommentCount'))
    code_line_count = count_code_lines(body)
    url = url_base_a + question_id + '/' + a_id

    info = [a_id, author_id, question_id, date, text, score, accepted,
            comment_count, code_line_count, url]
    a_df = pd.DataFrame([info], columns=as_columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add the row and works on older versions as well.
    answers_df = pd.concat([answers_df, a_df], ignore_index=True)

def add_user(user_id):
    """Append a placeholder row for *user_id* to the global ``users_df``.

    The row starts with empty ``date``/``name`` and all counters at zero;
    only the id and the profile URL are filled in here.

    Args:
        user_id: Identifier string of the user; it is concatenated onto the
            users URL prefix.
    """
    global users_df

    info = [
        user_id,               # id
        '',                    # date
        '',                    # name
        0,                     # reputation
        0,                     # total_question_count
        0,                     # answered_question_count
        0,                     # total_answer_count
        0,                     # accepted_answer_count
        url_base_u + user_id,  # url
    ]
    u_df = pd.DataFrame([info], columns=us_columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add the row and works on older versions as well.
    users_df = pd.concat([users_df, u_df], ignore_index=True)

def update_user_info(user):
    """Copy creation date, display name and reputation of *user* into ``users_df``.

    Rows are selected by comparing the ``id`` column with the ``Id``
    attribute of *user*; all matching rows are updated in place.
    """
    selected = users_df['id'] == user.get('Id')

    users_df.loc[selected, 'date'] = user.get('CreationDate')
    users_df.loc[selected, 'name'] = user.get('DisplayName')
    users_df.loc[selected, 'reputation'] = int(user.get('Reputation'))

def update_user_question_counts(user_id, answered_question=False):
    """Add one question to the counters of *user_id* in ``users_df``.

    ``total_question_count`` is always incremented;
    ``answered_question_count`` only when *answered_question* is truthy.
    """
    is_user = users_df['id'] == user_id
    users_df.loc[is_user, 'total_question_count'] += 1
    if not answered_question:
        return
    users_df.loc[is_user, 'answered_question_count'] += 1

def update_user_answer_counts(user_id, accepted_answer=False):
    """Add one answer to the counters of *user_id* in ``users_df``.

    ``total_answer_count`` is always incremented;
    ``accepted_answer_count`` only when *accepted_answer* is truthy.
    """
    is_user = users_df['id'] == user_id
    users_df.loc[is_user, 'total_answer_count'] += 1
    if not accepted_answer:
        return
    users_df.loc[is_user, 'accepted_answer_count'] += 1

def is_new_user(user_id):
    """Return True when *user_id* does not appear in the ``id`` column of ``users_df``."""
    known_ids = users_df['id'].values
    already_seen = user_id in known_ids
    return not already_seen

In [6]:
# Directory that the XML dump file names below are joined onto
# (absolute path; adjust to where the dumps live on the local machine).
data_path = '/media/antonio/92088d7f-1ed4-49dd-b55f-01462ab87ebb/so_data'

## Questions

In [7]:
# Questions dump to parse, located inside data_path.
xml_file = 'Questions-2014-01-0.xml'
xml_path = os.path.join(data_path, xml_file)

In [8]:
# Ids of the answers that questions reference as accepted. They are gathered
# in a plain list while parsing (np.append would copy the whole array on
# every call) and converted to an array once the loop has finished.
accepted_answer_ids = []

iterparser = etree.iterparse(xml_path, events=('start',))
# Consume the first 'start' event so the loop below begins with the element
# that follows it (presumably the first event is the document root).
next(iterparser)

elem_count = 0
t0 = time()
for event, elem in iterparser:
    add_question(elem)

    accepted_answer_id = elem.get('AcceptedAnswerId')
    is_accepted = accepted_answer_id is not None
    if is_accepted:
        accepted_answer_ids.append(accepted_answer_id)

    # Authors without a user id are tracked by their display name instead.
    author_id = elem.get('OwnerUserId')
    if author_id is None:
        author_id = elem.get('OwnerDisplayName')
    if is_new_user(author_id):
        add_user(author_id)
    update_user_question_counts(author_id, answered_question=is_accepted)

    # Drop the element's data once it has been copied into the frames.
    elem.clear()

    elem_count += 1
    if elem_count % 5000 == 0:
        print('%d elements processed' % elem_count)
t1 = time()

# Later cells receive an ndarray, as they did before.
accepted_answer_ids = np.array(accepted_answer_ids)

print('-' * 80)
print('Elements processed: %d' % elem_count)
print('Processing time: %d seconds' % (t1 - t0))
print('-' * 80)

KeyboardInterrupt: 

In [None]:
# Print whether the number of processed elements equals the row count of questions_df.
print('elem_count == len(questions_df) = %s' % (elem_count == len(questions_df)))

In [None]:
# Preview the first rows of the questions table.
questions_df.head()

In [None]:
# Write the questions table to CSV under the relative 'data' directory.
questions_df.to_csv('data/questions.csv')

## Answers

In [None]:
# Answers dump to parse, located inside data_path.
xml_file = 'Answers-2014-01.xml'
xml_path = os.path.join(data_path, xml_file)

In [None]:
question_ids = np.array(questions_df['id'])
# Hashed copies for the per-element membership tests below; testing against
# the arrays directly would scan them once for every answer in the dump.
known_question_ids = set(question_ids.tolist())
accepted_lookup = set(np.asarray(accepted_answer_ids).tolist())

iterparser = etree.iterparse(xml_path, events=('start',))
# Consume the first 'start' event so the loop below begins with the element
# that follows it (presumably the first event is the document root).
next(iterparser)

elem_count = 0
t0 = time()
for event, elem in iterparser:
    # Keep only answers whose question was loaded into questions_df.
    if elem.get('ParentId') not in known_question_ids:
        elem.clear()
        continue

    is_accepted = elem.get('Id') in accepted_lookup
    add_answer(elem, is_accepted)

    # Authors without a user id are tracked by their display name instead.
    author_id = elem.get('OwnerUserId')
    if author_id is None:
        author_id = elem.get('OwnerDisplayName')
    if is_new_user(author_id):
        add_user(author_id)
    update_user_answer_counts(author_id, accepted_answer=is_accepted)

    # Drop the element's data once it has been copied into the frames.
    elem.clear()

    elem_count += 1
    if elem_count % 5000 == 0:
        print('%d elements processed' % elem_count)
t1 = time()

print('-' * 80)
print('Elements processed: %d' % elem_count)
print('Processing time: %d seconds' % (t1 - t0))
print('-' * 80)

In [None]:
# Print whether the number of processed elements equals the row count of answers_df.
print('elem_count == len(answers_df) = %s' % (elem_count == len(answers_df)))

In [None]:
# Preview the first rows of the answers table.
answers_df.head()

In [None]:
# Write the answers table to CSV under the relative 'data' directory.
answers_df.to_csv('data/answers.csv')

## Users

In [None]:
# Users dump to parse, located inside data_path.
xml_file = 'Users.xml'
xml_path = os.path.join(data_path, xml_file)

In [None]:
user_ids = np.array(users_df['id'])
# Hashed copy for the per-element membership test below; testing against the
# array directly would scan it once for every user in the dump.
known_user_ids = set(user_ids.tolist())

iterparser = etree.iterparse(xml_path, events=('start',))
# Consume the first 'start' event so the loop below begins with the element
# that follows it (presumably the first event is the document root).
next(iterparser)

elem_count = 0
t0 = time()
for event, elem in iterparser:
    # Only users already registered in users_df get their details filled in.
    if elem.get('Id') not in known_user_ids:
        elem.clear()
        continue

    update_user_info(elem)

    # Drop the element's data once it has been copied into the frame.
    elem.clear()

    elem_count += 1
    if elem_count % 5000 == 0:
        print('%d elements processed' % elem_count)
t1 = time()

print('-' * 80)
print('Elements processed: %d' % elem_count)
print('Processing time: %d seconds' % (t1 - t0))
print('-' * 80)

In [None]:
# Print whether the number of users updated from the dump equals the row count of users_df.
print('elem_count == len(users_df) = %s' % (elem_count == len(users_df)))

In [None]:
# Preview the first rows of the users table.
users_df.head()

In [None]:
# Write the users table to CSV under the relative 'data' directory.
users_df.to_csv('data/users.csv')

## Question-Answer

In [None]:
# Keep only the columns needed to build the question-answer table.
qs_df = questions_df[[
    'id', 'author_id', 'title', 'text', 'score', 'view_count',
    'answer_count', 'comment_count', 'code_line_count',
]]
as_df = answers_df[[
    'id', 'author_id', 'question_id', 'text', 'comment_count',
    'code_line_count', 'score', 'accepted',
]]
us_df = users_df[[
    'id', 'reputation', 'total_question_count', 'answered_question_count',
    'total_answer_count', 'accepted_answer_count',
]]

In [None]:
# Attach the author's user statistics to each question, then to each answer;
# the duplicated user id column produced by the join is dropped.
qus_df = qs_df.merge(
    us_df, left_on='author_id', right_on='id', suffixes=('_q', '_u')
).drop('id_u', axis=1)
aus_df = as_df.merge(
    us_df, left_on='author_id', right_on='id', suffixes=('_a', '_u')
).drop('id_u', axis=1)
# Pair every answer with its question; columns present on both sides get
# the '_q' / '_a' suffixes.
qas_df = qus_df.merge(
    aus_df, left_on='id_q', right_on='question_id', suffixes=('_q', '_a')
).drop('question_id', axis=1)

In [None]:
# Derive the final per-pair columns of qas_df and drop the raw inputs.

# id = id_q-id_a
qas_df['id'] = qas_df[['id_q', 'id_a']].apply(lambda x: x['id_q'] + '-' + x['id_a'], axis=1)

# text = title - text_q -- text_a
qas_df['text'] = qas_df[['title', 'text_q', 'text_a']].apply(lambda x: x['title'] + ' - ' + x['text_q'] + ' -- ' + x['text_a'], axis=1)

# percent_answered_questions_q = answered_question_count_q * 100 / total_question_count_q
qas_df['percent_answered_questions_q'] = qas_df[['answered_question_count_q', 'total_question_count_q']].apply(lambda x: round(100 * x['answered_question_count_q'] / x['total_question_count_q']), axis=1)

# percent_accepted_answers_a = accepted_answer_count_a * 100 / total_answer_count_a
# (the column name now matches the formula; it used to repeat the
# 'percent_answered_questions' name of the question-side column)
qas_df['percent_accepted_answers_a'] = qas_df[['accepted_answer_count_a', 'total_answer_count_a']].apply(lambda x: round(100 * x['accepted_answer_count_a'] / x['total_answer_count_a']), axis=1)

# Columns that were only inputs to the derived ones, plus the counters that
# do not apply to the role (asker vs. answerer) of the user they describe.
cols_to_delete = ['author_id_q', 'author_id_a', 'total_answer_count_q', 'accepted_answer_count_q', 'total_question_count_a', 'answered_question_count_a']
cols_to_delete.extend(['id_q', 'id_a'])
cols_to_delete.extend(['title', 'text_q', 'text_a'])
cols_to_delete.extend(['answered_question_count_q', 'total_question_count_q'])
cols_to_delete.extend(['accepted_answer_count_a', 'total_answer_count_a'])

qas_df.drop(cols_to_delete, axis=1, inplace=True)

print('-' * 80)
# Report the size of the pair table itself; elem_count still holds the
# counter of the last parsing loop and says nothing about the pairs.
print('Total q-a pairs: %d' % len(qas_df))
print('-' * 80)

In [None]:
# Preview the first rows of the question-answer table.
qas_df.head()

In [None]:
# Write the question-answer table (not the users table) to its CSV file.
qas_df.to_csv('data/qas.csv')