In [1]:
import html
import json
import re
from collections import defaultdict
from urllib.parse import quote

import nltk
import requests
from nltk.tokenize import RegexpTokenizer



In [2]:
# Download the 'punkt' package into the local nltk_data directory.
# NOTE(review): the visible cells tokenize only with RegexpTokenizer, which
# presumably does not need this package -- confirm before removing the download.
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jitender.singh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
class SEAnswers:
    """
    Fetches questions and answers from Stack Exchange sites.

    ``sites`` maps a site name to the URL fragments used to build the
    questions and answers requests. Each public method prints the URL it is
    about to request, performs a single GET, and returns an empty list when
    the response status is not 200.
    """

    sites = {
        'stackoverflow': {
            'api_url': 'https://api.stackexchange.com/2.3',
            'questions_api': '/questions',
            'question_args': '?site={site}&tagged={tag}&sort={sort_by}&order={sort_order}&page={page_id}',
            'answers_api': '/questions/{question_ids}/answers',
            'answers_args': '?site={site}&filter=withbody',
        },
    }

    # Seconds to wait for the API before giving up; without a timeout a
    # stalled connection would block the notebook cell indefinitely.
    request_timeout = 10

    def __init__(self, site_name='stackoverflow'):
        """
        :param site_name: key into ``sites``; an unknown name raises
            ``KeyError`` when a request method is first called.
        """
        self.site_name = site_name

    # https://api.stackexchange.com/2.3/questions?page=1&order=desc&sort=votes&tagged=nlp&site=stackoverflow
    def get_question_ids(self, tag='nlp', sort_by='votes', sort_order='desc', page_id=1):
        """
        Return the ids of answered questions on one page of a tag listing.

        :param tag: tag to filter by; it is percent-encoded so tags such as
            ``c#`` or ``c++`` survive in the query string (``;`` is kept
            literal so several tags can still be joined with it).
        :param sort_by: value for the ``sort`` query argument.
        :param sort_order: value for the ``order`` query argument.
        :param page_id: value for the ``page`` query argument.
        :return: list of ``question_id`` values for items flagged
            ``is_answered`` with a positive ``answer_count``; empty list on a
            non-200 response.
        """
        site = self.sites[self.site_name]
        question_args = site['question_args'].format(
            site=self.site_name,
            # '#' would otherwise start a URL fragment and '+' would be read as a space.
            tag=quote(str(tag), safe=';'),
            sort_by=sort_by,
            sort_order=sort_order,
            page_id=page_id,
        )

        url = '{api_url}{questions_api}{question_args}'.format(
            api_url=site['api_url'],
            questions_api=site['questions_api'],
            question_args=question_args,
        )
        print(url)

        response = requests.get(url, timeout=self.request_timeout)
        if response.status_code != 200:
            return []

        # A payload without 'items' is treated as an empty page.
        items = response.json().get('items', [])
        return [
            item.get('question_id')
            for item in items
            # 'answer_count' may be absent; treat that as zero instead of comparing None > 0.
            if item.get('is_answered') and (item.get('answer_count') or 0) > 0
        ]

    def get_answers(self, question_ids):
        """
        Return the HTML bodies of the answers to the given questions.

        :param question_ids: iterable of question ids; they are joined with
            ``;`` into a single request path.
        :return: list of non-empty answer ``body`` strings; empty list when no
            ids are given or on a non-200 response.

        NOTE(review): only one request is made, so only the first page of
        answers the API returns is read -- confirm whether paging is needed.
        """
        qid_str = ';'.join(str(qid) for qid in question_ids or ())
        if not qid_str:
            # Nothing to look up; avoid requesting the malformed '/questions//answers'.
            return []

        site = self.sites[self.site_name]
        answers_api = site['answers_api'].format(question_ids=qid_str)
        answers_args = site['answers_args'].format(site=self.site_name)

        url = '{api_url}{answers_api}{answers_args}'.format(
            api_url=site['api_url'],
            answers_api=answers_api,
            answers_args=answers_args,
        )
        print(url)

        response = requests.get(url, timeout=self.request_timeout)
        if response.status_code != 200:
            return []

        items = response.json().get('items', [])
        return [item['body'] for item in items if item.get('body')]


class WordCounter:
    """
    Accumulates lowercase word frequencies from HTML answer bodies.

    Counts are kept in ``self.words`` and grow with every ``count_words``
    call on the same instance; create a new instance to start from zero.
    """

    # Matches a single HTML tag. DOTALL lets '.' cross newlines so a tag whose
    # attributes wrap onto another line is still removed as a whole.
    _TAG_PATTERN = re.compile(r'<.*?>', re.DOTALL)

    def __init__(self):
        self.words = defaultdict(int)
        self.word_tokenizer = RegexpTokenizer(r'\w+')

    def clean_html_tags(self, html_text):
        """
        Return ``html_text`` with every ``<...>`` tag removed.

        Tags are replaced by the empty string, so text on either side of a
        tag is joined without a separator.
        """
        return self._TAG_PATTERN.sub('', html_text)

    def count_words(self, answers):
        """
        Add the words found in ``answers`` to the running frequency table.

        :param answers: iterable of HTML strings; they are joined with a
            single space before processing.
        """
        answers_html_str = ' '.join(answers)

        # Strip tags before unescaping so that escaped text such as
        # '&lt;b&gt;' is not mistaken for markup and deleted.
        answers_str = html.unescape(self.clean_html_tags(answers_html_str))

        for word_token in self.word_tokenizer.tokenize(answers_str):
            if word_token:
                # Case-insensitive count: every token is folded to lowercase.
                self.words[word_token.lower()] += 1

    def get_words(self, min_frequency=1):
        """
        Return a plain dict of the words seen at least ``min_frequency`` times.

        :param min_frequency: inclusive lower bound on the count.
        :return: new ``{word: frequency}`` dict; the internal table is not modified.
        """
        return {
            word: frequency
            for word, frequency in self.words.items()
            if frequency >= min_frequency
        }


# One fetcher and one counter, shared by the cells that follow.
se = SEAnswers()
wc = WordCounter()

# First page of 'nlp' questions (default sort: votes, descending),
# then the answer bodies for those questions.
qids = se.get_question_ids(tag='nlp')
answers = se.get_answers(qids)

# Tally the words and print every (word, count) pair, most frequent first.
# Sorting on the negated count keeps ties in their original order.
wc.count_words(answers)
print(sorted(wc.get_words(min_frequency=1).items(), key=lambda pair: -pair[1]))


https://api.stackexchange.com/2.3/questions?site=stackoverflow&tagged=nlp&sort=votes&order=desc&page=1
https://api.stackexchange.com/2.3/questions/307291;8897593;1833252;34870614;1787110;405161;15547409;9294926;31421413;10401076;22904025;41424;771918;27860652;1288291;10850997;13883277;870460;3522372;573768;27697766;51956000;4951751;19130512;526469;15173225;10554052;17317418;10383044;9706769/answers?site=stackoverflow&filter=withbody
[('the', 237), ('a', 125), ('to', 110), ('is', 104), ('and', 103), ('in', 101), ('of', 100), ('you', 93), ('for', 74), ('it', 74), ('words', 73), ('i', 62), ('word', 58), ('if', 58), ('as', 56), ('1', 55), ('that', 54), ('n', 54), ('this', 52), ('can', 49), ('are', 46), ('t', 41), ('be', 40), ('s', 40), ('or', 39), ('with', 37), ('my', 36), ('from', 35), ('c', 34), ('an', 32), ('0', 30), ('similarity', 29), ('2', 29), ('there', 28), ('have', 27), ('syllables', 27), ('x', 27), ('would', 26), ('on', 26), ('not', 26), ('by', 24), ('use', 24), ('return', 24), (

In [6]:
# First page of 'java' questions and the answer bodies for them.
qids = se.get_question_ids(tag='java')
answers = se.get_answers(qids)

# NOTE(review): `wc` is the same instance used for the 'nlp' run, so these
# counts are added on top of the earlier ones, and re-running this cell adds
# them again. Confirm whether a combined tally is intended.
wc.count_words(answers)
print(sorted(wc.get_words(min_frequency=1).items(), key=lambda pair: -pair[1]))


https://api.stackexchange.com/2.3/questions?site=stackoverflow&tagged=java&sort=votes&order=desc&page=1
https://api.stackexchange.com/2.3/questions/11227809;6841333;40480;309424;271526;40471;157944;13375357;363681;8710619;8881291;46898;1066589;6470651;215497;322715;5585779;285793;1005073;34571;21947452;574594;6343166;65035;1128723;285177;37628;6827752;1200621;541487/answers?site=stackoverflow&filter=withbody
