#StackOverflow answer classifier

In [35]:
import re
import os
from html.parser import HTMLParser
from xml.etree import ElementTree as etree
from xml.etree.ElementTree import Element

import numpy as np
import pandas as pd

In [36]:
class HTML2String(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.strict = False
        self.convert_charrefs= True
        self.text = []
    def handle_data(self, d):
        self.text.append(d)
    def get_data(self):
        return ''.join(self.text)

def html_to_string(html):
    s = HTML2String()
    s.feed(html)
    return s.get_data()

def count_code_lines(html):
    count_lines = 0
    codes = re.findall(r'<code>.*?</code>', html, re.DOTALL)
    for code in codes:
        text = html_to_string(code)
        lines = text.split('\n')
        lines = list(filter(lambda x: len(x) > 0, lines))
        count_lines += len(lines)
    return count_lines

def remove_code_from_html(html):
    nocode = html
    codes = re.findall(r'<code>.*?</code>', html, re.DOTALL)
    for code in codes:
        nocode = nocode.replace(code, '')
    return nocode

In [37]:
url_base = 'http://stackoverflow.com/'
url_base_q = url_base + 'questions/'
url_base_a = url_base + 'questions/'
url_base_u = url_base + 'users/'

In [225]:
qs_columns = ['id', 'author_id', 'date', 'title', 'text', 'score', 'view_count', 'answer_count', 'comment_count', 'code_line_count', 'url']
as_columns = ['id', 'author_id', 'question_id', 'date', 'text', 'comment_count', 'code_line_count', 'score', 'accepted', 'url']
us_columns = ['id', 'date', 'name', 'reputation', 'total_question_count', 'answered_question_count', 'total_answer_count', 'accepted_answer_count', 'url']
qa_columns = ['id', 'text', 'q_score', 'a_score', 'qer_reputation', 'aer_reputation', 'qer_percent_answered_questions', 'aer_percent_accepted_answers', 'aer_badge_total', 'a_code_lenth', 'is_qa']

questions_df = pd.DataFrame(columns=qs_columns)
answers_df = pd.DataFrame(columns=as_columns)
users_df = pd.DataFrame(columns=us_columns)
qas_df = pd.DataFrame(columns=qa_columns)

In [226]:
def add_question(question):
    q_id = question.get('Id')
    author_id = question.get('OwnerUserId')
    date = question.get('CreationDate')
    title = question.get('Title')
    text = html_to_string(remove_code_from_html(question.get('Body')))
    score = int(question.get('Score'))
    view_count = int(question.get('ViewCount'))
    answer_count = int(question.get('AnswerCount'))
    comment_count = int(question.get('CommentCount'))
    code_line_count = count_code_lines(question.get('Body'))
    url = url_base_q + q_id
    
    info = [q_id, author_id, date, title, text, score, view_count, answer_count, comment_count, code_line_count, url]
    q_df = pd.DataFrame([info], columns=qs_columns)
    global questions_df
    questions_df = questions_df.append(q_df, ignore_index=True)

def answer_to_df(answer, accepted=False):
    a_id = answer.get('Id')
    author_id = answer.get('OwnerUserId')
    question_id = answer.get('ParentId')
    date = answer.get('CreationDate')
    text = html_to_string(remove_code_from_html(answer.get('Body')))
    comment_count = int(answer.get('CommentCount'))
    code_line_count = code_line_count(answer.get('Body'))
    score = int(answer.get('Score'))
    accepted = accepted
    url = url_base_a + question_id + '/' + a_id

    info = [a_id, author_id, question_id, date, text, comment_count, code_line_count, score, accepted, url]
    a_df = pd.DataFrame([info], columns=as_columns)
    return a_df

def user_to_df(user_id):
    u_id = user_id
    date = ''
    name = ''
    reputation = 0
    total_question_count = 0
    answered_question_count = 0
    total_answer_count = 0
    accepted_answer_count = 0
    url = url_base_u + u_id

    info = [u_id, date, name, reputation, total_question_count, answered_question_count, total_answer_count, accepted_answer_count, url]
    u_df = pd.DataFrame([info], columns=us_columns)
    return u_df

def update_user_info(user):
    row_index = get_row_index(users_df, user.get('Id'))
    
    users_df.loc[row_index, 'date'] = user.get('CreationDate')
    users_df.loc[row_index, 'name'] = user.get('DisplayName')
    users_df.loc[row_index, 'reputation'] = int(user.get('Reputation'))

def update_user_question_counts(user_id, answered_question=False):
    row_index = get_row_index(users_df, user_id)
    
    users_df.loc[row_index, 'total_question_count'] += 1
    if answered_question:
        users_df.loc[row_index, 'answered_question_count'] += 1

def update_user_answer_counts(user_id, accepted_answer=False):
    row_index = get_row_index(users_df, user_id)
    
    users_df.loc[row_index, 'total_answer_count'] += 1
    if accepted_answer:
        users_df.loc[row_index, 'accepted_answer_count'] += 1

def get_row_index(df, value):
    row_index_list = df[df.id==value].index.tolist()
    assert(len(row_index_list) == 1)
    row_index = row_index_list[0]
    return row_index

def is_new_user(user_id):
    return user_id not in users_df.id.values

In [84]:
data_path = '/media/antonio/92088d7f-1ed4-49dd-b55f-01462ab87ebb/so_data'

##Questions

In [140]:
xml_file = 'Questions-2014.xml'
xml_path = os.path.join(data_path, xml_file)

In [227]:
accepted_answer_ids = np.array([])

count = 0

iterparser = etree.iterparse(xml_path)
iterator = iter(iterparser)
event, root = next(iterator)
root.clear()

for event, elem in iterparser:
    add_question(elem)
    
    accepted_answer_id = elem.get('AcceptedAnswerId')
    is_accepted = accepted_answer_id is not None
    if is_accepted:
        np.append(accepted_answer_ids, accepted_answer_id)
    
    author_id = elem.get('OwnerUserId')
    if is_new_user(author_id):
        u_df = user_to_df(author_id)
        users_df = users_df.append(u_df, ignore_index=True)
    update_user_question_counts(author_id, answered_question=is_accepted)
    
    count +=1
    if count == 100:
        break
    elem.clear()

In [231]:
questions_df

Unnamed: 0,id,author_id,date,title,text,score,view_count,answer_count,comment_count,code_line_count,url
0,20864423,1377324,2014-01-01T00:02:46.990,Sphinx complex queries with mix of AND/OR,I've been looked around and have no luck on th...,0,44,1,0,7,http://stackoverflow.com/questions/20864423
1,20864424,2212490,2014-01-01T00:02:57.477,Google App Scripts get IP,How can I restrict a script on GAS to load onl...,1,127,1,0,0,http://stackoverflow.com/questions/20864424
2,20864427,2779244,2014-01-01T00:03:24.030,PHP form processing issues,I have been trying numerous ways to get the em...,-2,108,1,10,206,http://stackoverflow.com/questions/20864427
3,20864429,382775,2014-01-01T00:03:39.953,IAM policy to allow EC2 instance API access on...,I'm trying to set up an app that configures my...,0,686,1,0,16,http://stackoverflow.com/questions/20864429
4,20864430,2592623,2014-01-01T00:03:42.410,PHP Mailing Address Preg_match not working! Ki...,I am trying to make a regular expression to ma...,0,45,2,1,4,http://stackoverflow.com/questions/20864430
5,20864431,3142972,2014-01-01T00:03:48.130,Java swing timer break,Is there any way to do this? This is for a sim...,0,109,2,5,60,http://stackoverflow.com/questions/20864431
6,20864435,1140270,2014-01-01T00:06:22.700,cefsharp location change event C#,i need to know how can set event when location...,0,482,1,0,9,http://stackoverflow.com/questions/20864435
7,20864449,3017954,2014-01-01T00:09:20.590,"Ruby on Rails ""Undefined method 'encoding'"" wh...",I have a table defined with the following migr...,2,147,1,3,6,http://stackoverflow.com/questions/20864449
8,20864450,1019976,2014-01-01T00:10:14.867,generating json for google charts - adding nulls,I need to generate a null value in json if a r...,0,69,2,0,68,http://stackoverflow.com/questions/20864450
9,20864452,2452938,2014-01-01T00:10:40.443,How to use textPath for text in snap.svg?,Is there a way to use textpath using SnapSVG? ...,1,1170,3,2,8,http://stackoverflow.com/questions/20864452


In [32]:
if x is None: print(1)

1


In [80]:
questions_df.loc['score', 0] = 3

In [81]:
questions_df

Unnamed: 0,id,author_id,date,title,text,score,view_count,answer_count,comment_count,code_line_count,url,0
0,20864423.0,1377324.0,2014-01-01T00:02:46.990,Sphinx complex queries with mix of AND/OR,I've been looked around and have no luck on th...,3.0,44.0,1.0,0.0,7.0,http://stackoverflow.com/questions/20864423,
1,20864423.0,1377324.0,2014-01-01T00:02:46.990,Sphinx complex queries with mix of AND/OR,I've been looked around and have no luck on th...,0.0,44.0,1.0,0.0,7.0,http://stackoverflow.com/questions/20864423,
2,20864423.0,1377324.0,2014-01-01T00:02:46.990,Sphinx complex queries with mix of AND/OR,I've been looked around and have no luck on th...,0.0,44.0,1.0,0.0,7.0,http://stackoverflow.com/questions/20864423,
3,20864423.0,1377324.0,2014-01-01T00:02:46.990,Sphinx complex queries with mix of AND/OR,I've been looked around and have no luck on th...,0.0,44.0,1.0,0.0,7.0,http://stackoverflow.com/questions/20864423,
score,,,,,,,,,,,,3.0


In [15]:
np.array(['1','2'])

array(['1', '2'], 
      dtype='<U1')

In [34]:
elem.get('OwnerId')

In [47]:
dict(elem.items())

{'AboutMe': '<p>I am:</p>\n\n<ul>\n<li>the co-founder and CEO of <a href="http://stackexchange.com">Stack Exchange</a></li>\n<li>the co-founder of <a href="http://www.fogcreek.com" rel="nofollow">Fog Creek Software</a></li>\n<li>the creator and chairman of the board of <a href="http://trello.com" rel="nofollow">Trello</a></li>\n<li>owner of Taco, the most famous Siberian Husky on the Upper West Side.</li>\n</ul>\n\n<p>You can find me on Twitter (as <a href="http://twitter.com/spolsky" rel="nofollow">@spolsky</a>) or on my rarely-updated blog, <a href="http://joelonsoftware.com" rel="nofollow">Joel on Software</a>.</p>\n',
 'AccountId': '4',
 'CreationDate': '2008-07-31T14:22:31.317',
 'DisplayName': 'Joel Spolsky',
 'DownVotes': '94',
 'Id': '4',
 'LastAccessDate': '2015-03-06T22:54:27.657',
 'Location': 'New York, NY',
 'ProfileImageUrl': 'http://i.stack.imgur.com/C5gBG.jpg?s=128&g=1',
 'Reputation': '23699',
 'UpVotes': '760',
 'Views': '58608',
 'WebsiteUrl': 'http://www.joelonsoftw

In [41]:
pd.DataFrame([[1,None,3]]).to_csv('2.csv')

In [12]:
elem.get('Id')

'82956'

In [18]:
elem.items()

[('UserId', '652'),
 ('Id', '82956'),
 ('Date', '2008-09-15T08:55:03.957'),
 ('Name', 'Teacher')]

In [19]:
elem.get('Name')

'Teacher'

In [20]:
elem.tag

'row'

In [24]:
next(elem)

TypeError: 'xml.etree.ElementTree.Element' object is not an iterator

In [35]:
root.tag

'badges'

In [36]:
elem

<Element 'row' at 0x7f0098070368>

In [38]:
root.clear()

In [40]:
root.tag

'badges'

In [9]:
elem.items()

[('LastEditDate', '2014-01-23T22:52:51.697'),
 ('LastActivityDate', '2014-01-23T22:52:51.697'),
 ('LastEditorUserId', '3052751'),
 ('Id', '20864452'),
 ('Body',
  '<p>Is there a way to use textpath using SnapSVG? I tried using textPath as an attribute but it does not seem to add a textpath element in the text node.</p>\n\n<pre><code>var txtpth = s.path("M70 70 Q 80 90 200 150 L 200 400").attr({\n    fill: "none",\n    stroke: "black"\n });\nvar crooked = s.text(0,0,"lorempsum ipsum lorempsum ipsum lorempsum ipsum lorempsum   ipsum").attr({\n     textPath: txtpth,\n     stroke:"black"\n });\n</code></pre>\n\n<p>I do not see a direct API to manipulate text paths in SVG using snap.svg.</p>\n'),
 ('CreationDate', '2014-01-01T00:10:40.443'),
 ('AnswerCount', '3'),
 ('CommentCount', '2'),
 ('PostTypeId', '1'),
 ('Tags', '<javascript><text><svg><path><snap.svg>'),
 ('OwnerUserId', '2452938'),
 ('ViewCount', '1170'),
 ('Title', 'How to use textPath for text in snap.svg?'),
 ('Score', '1')]

In [44]:
elem.get('AcceptedAnswerId') is not None

True