# StackOverflow answer classifier

In [6]:
import re
import os
from html.parser import HTMLParser
from xml.etree import ElementTree as etree
from xml.etree.ElementTree import Element

import numpy as np
import pandas as pd

In [25]:
class HTML2String(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.strict = False
        self.convert_charrefs= True
        self.text = []
    def handle_data(self, d):
        self.text.append(d)
    def get_data(self):
        return ''.join(self.text)

def html_to_string(html):
    s = HTML2String()
    s.feed(html)
    return s.get_data()

def count_code_lines(html):
    count_lines = 0
    codes = re.findall(r'<code>.*?</code>', html, re.DOTALL)
    for code in codes:
        text = html_to_string(code)
        lines = text.split('\n')
        lines = list(filter(lambda x: len(x) > 0, lines))
        count_lines += len(lines)
    return count_lines

def remove_code_from_html(html):
    nocode = html
    codes = re.findall(r'<code>.*?</code>', html, re.DOTALL)
    for code in codes:
        nocode = nocode.replace(code, '')
    return nocode

In [27]:
html_to_string(remove_code_from_html(dict(elem.items())['Body']))

"I'm trying to set up an app that configures my instances upon launch and I want to close down that app's API access as much as possible. My current policy is as follows:\n\n\n\nHowever, this allows the app to perform any of these actions on anything in EC2. Is there a way I can lock down the actions of the app on an ec2 instance to either that specific instance, or to all of the boxes that have the same IAM role?"

In [None]:
url_base = 'http://stackoverflow.com/'
url_base_q = url_base + 'questions/'
url_base_a = url_base + 'questions/'
url_base_u = url_base + 'users/'

In [7]:
qs_columns = ['id', 'author_id', 'date', 'title', 'text', 'score', 'tags', 'view_count', 'answer_count', 'comment_count', 'code_line_count', 'url']
as_columns = ['id', 'author_id', 'question_id', 'date', 'text', 'comment_count', 'code_line_count', 'score', 'accepted', 'url']
us_columns = ['id', 'name', 'total_questions', 'unanswered_questions', 'total_answers', 'accepted_answers', 'reputation', 'badge_total', 'url']
qa_columns = ['id', 'text', 'q_score', 'a_score', 'qer_reputation', 'aer_reputation', 'qer_percent_answered_questions', 'aer_percent_accepted_answers', 'aer_badge_total', 'a_code_lenth', 'is_qa']

questions_df = pd.DataFrame(columns=qs_columns)
answers_df = pd.DataFrame(columns=as_columns)
users_df = pd.DataFrame(columns=us_columns)
qas_df = pd.DataFrame(columns=qa_columns)

In [None]:
def question_to_df(question):
    q_id = question['Id']
    author_id = question['OwnerUserId']
    date = question['CreationDate']
    title = question['Title']
    text = html_to_string(remove_code_from_html(question['Body']))
    score = question['Score']
    tags = question['Tags']
    view_count = question['ViewCount']']
    answer_count = question['ViewCount']
    comment_count = question['CommentCount']
    code_line_count = count_code_lines(question['Body'])
    url = url_base_q + q_id

    info = [q_id, author_id, date, title, text, score, tags, view_count, answer_count, comment_count, code_line_count, url]
    q_df = pd.DataFrame([info], columns=qs_columns)
    return q_df

def answer_to_df(answer, accepted=False):
    a_id = answer['Id']
    author_id = answer['OwnerUserId']
    question_id = answer['ParentId']
    date = answer['CreationDate']
    text = html_to_string(remove_code_from_html(answer['Body']))
    comment_count = answer['CommentCount']
    code_line_count = code_line_count(answer['Body'])
    score = answer['Score']
    accepted = accepted
    url = url_base_a + question_id + '/' + a_id

    info = [a_id, author_id, question_id, date, text, comment_count, code_line_count, score, accepted, url]
    a_df = pd.DataFrame([info], columns=as_columns)
    return a_df

def user_to_df(user):
    u_id = user.id
    name = user.display_name
    total_questions = so.questions(user_id=user.id).total
    unanswered_questions = count_unanswered_questions(user)
    total_answers = so.answers(user_id=user.id).total
    accepted_answers = count_accepted_answers(user)
    reputation = user.reputation.real
    badge_total = user.badge_total
    url = user.url

    info = [u_id, name, total_questions, unanswered_questions, total_answers, accepted_answers, reputation, badge_total, url]
    u_df = pd.DataFrame([info], columns=us_columns)
    return u_df

In [47]:
dict(elem.items())

{'AboutMe': '<p>I am:</p>\n\n<ul>\n<li>the co-founder and CEO of <a href="http://stackexchange.com">Stack Exchange</a></li>\n<li>the co-founder of <a href="http://www.fogcreek.com" rel="nofollow">Fog Creek Software</a></li>\n<li>the creator and chairman of the board of <a href="http://trello.com" rel="nofollow">Trello</a></li>\n<li>owner of Taco, the most famous Siberian Husky on the Upper West Side.</li>\n</ul>\n\n<p>You can find me on Twitter (as <a href="http://twitter.com/spolsky" rel="nofollow">@spolsky</a>) or on my rarely-updated blog, <a href="http://joelonsoftware.com" rel="nofollow">Joel on Software</a>.</p>\n',
 'AccountId': '4',
 'CreationDate': '2008-07-31T14:22:31.317',
 'DisplayName': 'Joel Spolsky',
 'DownVotes': '94',
 'Id': '4',
 'LastAccessDate': '2015-03-06T22:54:27.657',
 'Location': 'New York, NY',
 'ProfileImageUrl': 'http://i.stack.imgur.com/C5gBG.jpg?s=128&g=1',
 'Reputation': '23699',
 'UpVotes': '760',
 'Views': '58608',
 'WebsiteUrl': 'http://www.joelonsoftw

In [41]:
pd.DataFrame([[1,None,3]]).to_csv('2.csv')

In [28]:
data_path = '/media/antonio/92088d7f-1ed4-49dd-b55f-01462ab87ebb/so_data'

##Questions

In [52]:
xml_file = 'Badges.xml'
xml_path = os.path.join(data_path, xml_file)

In [53]:
count = 0
iterparser = etree.iterparse(xml_path)
iterator = iter(iterparser)
event, root = next(iterator)
root.clear()

for event, elem in iterparser:
    print(dict(elem.items()))
    count +=1
    if count == 4:
#     if elem.get('Id') == '22746288':
#         print(dict(elem.items()))
        break
    elem.clear()

{'Name': 'Teacher', 'Date': '2008-09-15T08:55:03.957', 'UserId': '994', 'Id': '82947'}
{'Name': 'Teacher', 'Date': '2008-09-15T08:55:03.957', 'UserId': '3893', 'Id': '82949'}
{'Name': 'Teacher', 'Date': '2008-09-15T08:55:03.957', 'UserId': '4591', 'Id': '82950'}
{'Name': 'Teacher', 'Date': '2008-09-15T08:55:03.957', 'UserId': '5196', 'Id': '82951'}


In [12]:
elem.get('Id')

'82956'

In [18]:
elem.items()

[('UserId', '652'),
 ('Id', '82956'),
 ('Date', '2008-09-15T08:55:03.957'),
 ('Name', 'Teacher')]

In [19]:
elem.get('Name')

'Teacher'

In [20]:
elem.tag

'row'

In [24]:
next(elem)

TypeError: 'xml.etree.ElementTree.Element' object is not an iterator

In [35]:
root.tag

'badges'

In [36]:
elem

<Element 'row' at 0x7f0098070368>

In [38]:
root.clear()

In [40]:
root.tag

'badges'

In [9]:
elem.items()

[('LastEditDate', '2014-01-23T22:52:51.697'),
 ('LastActivityDate', '2014-01-23T22:52:51.697'),
 ('LastEditorUserId', '3052751'),
 ('Id', '20864452'),
 ('Body',
  '<p>Is there a way to use textpath using SnapSVG? I tried using textPath as an attribute but it does not seem to add a textpath element in the text node.</p>\n\n<pre><code>var txtpth = s.path("M70 70 Q 80 90 200 150 L 200 400").attr({\n    fill: "none",\n    stroke: "black"\n });\nvar crooked = s.text(0,0,"lorempsum ipsum lorempsum ipsum lorempsum ipsum lorempsum   ipsum").attr({\n     textPath: txtpth,\n     stroke:"black"\n });\n</code></pre>\n\n<p>I do not see a direct API to manipulate text paths in SVG using snap.svg.</p>\n'),
 ('CreationDate', '2014-01-01T00:10:40.443'),
 ('AnswerCount', '3'),
 ('CommentCount', '2'),
 ('PostTypeId', '1'),
 ('Tags', '<javascript><text><svg><path><snap.svg>'),
 ('OwnerUserId', '2452938'),
 ('ViewCount', '1170'),
 ('Title', 'How to use textPath for text in snap.svg?'),
 ('Score', '1')]

In [44]:
elem.get('AcceptedAnswerId') is not None

True