## Installing the Stack Overflow API client (StackAPI)

In [None]:
%%capture
! pip install StackAPI

Collecting StackAPI
  Downloading StackAPI-0.2.0.tar.gz (5.6 kB)
Building wheels for collected packages: StackAPI
  Building wheel for StackAPI (setup.py) ... [?25l[?25hdone
  Created wheel for StackAPI: filename=StackAPI-0.2.0-py3-none-any.whl size=5857 sha256=7993dd08fd68398f5231de1c8ac2bc3ff5a924a82573df61c2bdebff3c40d998
  Stored in directory: /root/.cache/pip/wheels/ec/db/60/df42a65853e3581c26a2fbb2012a228cb8e267369a3b9ca44d
Successfully built StackAPI
Installing collected packages: StackAPI
Successfully installed StackAPI-0.2.0


## Importing Modules

In [None]:
from stackapi import StackAPI
from datetime import datetime
import pandas as pd

## Configuring the Stack Overflow API client

In [None]:
# Client bound to the Stack Overflow site; every later fetch goes through it.
SITE = StackAPI('stackoverflow')
SITE.page_size = 100  # Fetch 100 results per page, so fewer requests hit the Stack Overflow API
SITE.max_pages = 500  # Upper bound on pages fetched per call. NOTE(review): this comment used to cite a 300-page quota limit, which disagrees with the value 500 — confirm the intended cap

## All questions tagged `pytorch` on Stack Overflow

In [None]:
# Request every question carrying the `pytorch` tag, then report how many
# items came back.
questions = SITE.fetch('questions', tagged='pytorch')
fetched_items = questions['items']
len(fetched_items)

13372

## Using Pandas Dataframe to store the data.

Store two Excel files: one with every question tagged `pytorch` on Stack Overflow, and one with only the answered questions.

In [None]:
# One row per fetched question; written out twice: the complete set, and the
# subset whose `is_answered` flag is True.
Stackoverflow_questions_pytorch = pd.DataFrame.from_records(questions['items'])
Stackoverflow_questions_pytorch.to_excel('Pytorch_questions_stackoverflow.xlsx', index=False)

answered_mask = Stackoverflow_questions_pytorch['is_answered'].eq(True)
Stackoverflow_questions_pytorch.loc[answered_mask].to_excel('Answered_Pytorch_questions_stackoverflow.xlsx', index=False)

## Reading the saved data to fetch each question's body

In [None]:
# Re-load the answered-questions workbook written earlier in this notebook.
# The path is relative (the same one used when writing the file) instead of
# the Colab-only '/content/' prefix, so the notebook also runs elsewhere.
questions = pd.read_excel('Answered_Pytorch_questions_stackoverflow.xlsx')

In [None]:
# Plain Python list of the answered questions' ids, used to batch API calls.
question_id_column = questions['question_id']
question_ids = question_id_column.tolist()

Loop to fetch the question bodies in batches of 100 IDs and store them in `questions_body_dataframe`

In [None]:
# Fetch question bodies for every *complete* batch of 100 ids; any trailing
# partial batch is not requested here. `i` stays the loop variable so that
# later code reading it keeps working.
#
# Each batch is collected in a list and concatenated once at the end:
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
question_body_frames = []
for i in range(len(question_ids) // 100):
  questions_body = SITE.fetch('questions/{ids}', ids=question_ids[100*i:100*i+100], filter='withbody')
  question_body_frames.append(pd.DataFrame.from_records(questions_body['items'], columns=['question_id', 'body']))

# With fewer than 100 ids no batch was fetched; fall back to an empty frame.
questions_body_dataframe = pd.concat(question_body_frames, ignore_index=True) if question_body_frames else pd.DataFrame()

Fetching the remaining questions (the final batch of fewer than 100 IDs)

In [None]:
# Fetch the bodies for the ids left over after the complete batches of 100.
# The start offset is computed from the list length instead of incrementing a
# leftover loop variable, so the slice is the same no matter how often this
# cell runs and the cell still works when there are fewer than 100 ids.
remaining_start = 100 * (len(question_ids) // 100)
remaining_question_ids = question_ids[remaining_start:]

# Skip the request when nothing is left (id count is a multiple of 100).
# NOTE(review): an empty `ids` list would presumably stop restricting the
# request to specific questions — confirm against StackAPI.
if remaining_question_ids:
  questions_body = SITE.fetch('questions/{ids}', ids=remaining_question_ids, filter='withbody')
  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
  questions_body_dataframe = pd.concat(
      [questions_body_dataframe, pd.DataFrame.from_records(questions_body['items'], columns=['question_id', 'body'])],
      ignore_index=True,
  )

Merging with already read dataframe

In [None]:
# Give the fetched body column a question-specific name, then keep only the
# questions present in both frames, joined on their id.
body_column_names = {'body': 'question_body'}
questions_body_dataframe.rename(columns=body_column_names, inplace=True)
questions = pd.merge(questions, questions_body_dataframe, how='inner', on=['question_id'])

Writing the merged data to a new Excel file

In [None]:
# Save the questions, now including their bodies, without the index column.
questions.to_excel(
    excel_writer='Answered_Pytorch_questions_stackoverflow_with_body.xlsx',
    index=False,
)

## Fetching the answer bodies from the Stack Overflow API

In [None]:
# Fetch the answers (with bodies) for every *complete* batch of 100 question
# ids; any trailing partial batch is not requested here. `i` stays the loop
# variable so that later code reading it keeps working.
#
# Batches are collected in a list and concatenated once at the end:
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
answer_body_frames = []
for i in range(len(question_ids) // 100):
  answer_body = SITE.fetch('questions/{ids}/answers', ids=question_ids[100*i:100*i+100], filter='withbody')
  answer_body_frames.append(pd.DataFrame.from_records(answer_body['items']))

# With fewer than 100 ids no batch was fetched; fall back to an empty frame.
answers_body_dataframe = pd.concat(answer_body_frames, ignore_index=True) if answer_body_frames else pd.DataFrame()

Fetching the remaining batch (the final group of fewer than 100 question IDs)

In [None]:
# Fetch the answers for the question ids left over after the complete batches
# of 100. The endpoint is 'questions/{ids}/answers', matching the batch loop;
# the plain 'questions/{ids}' endpoint used here before returned question
# records, which were then mixed into the answers frame.
#
# The start offset is computed from the list length instead of incrementing a
# leftover loop variable, so the slice is the same no matter how often this
# cell runs and the cell still works when there are fewer than 100 ids.
remaining_start = 100 * (len(question_ids) // 100)
remaining_question_ids = question_ids[remaining_start:]

# Skip the request when nothing is left (id count is a multiple of 100).
# NOTE(review): an empty `ids` list would presumably stop restricting the
# request to specific questions — confirm against StackAPI.
if remaining_question_ids:
  answer_body = SITE.fetch('questions/{ids}/answers', ids=remaining_question_ids, filter='withbody')
  # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
  answers_body_dataframe = pd.concat(
      [answers_body_dataframe, pd.DataFrame.from_records(answer_body['items'])],
      ignore_index=True,
  )

Checking the number of distinct questions that have answers

In [None]:
# Number of distinct question ids in the answers frame (a missing id, if any,
# counts as one value, exactly as with `unique()`).
answers_body_dataframe['question_id'].nunique(dropna=False)

7322

Saving the answers to an Excel file (requires the `xlsxwriter` engine)

In [None]:
!pip install xlsxwriter

Collecting xlsxwriter
  Downloading XlsxWriter-1.4.5-py2.py3-none-any.whl (149 kB)
[?25l[K     |██▏                             | 10 kB 22.5 MB/s eta 0:00:01[K     |████▍                           | 20 kB 28.2 MB/s eta 0:00:01[K     |██████▋                         | 30 kB 28.3 MB/s eta 0:00:01[K     |████████▊                       | 40 kB 21.1 MB/s eta 0:00:01[K     |███████████                     | 51 kB 9.7 MB/s eta 0:00:01[K     |█████████████▏                  | 61 kB 10.0 MB/s eta 0:00:01[K     |███████████████▎                | 71 kB 8.2 MB/s eta 0:00:01[K     |█████████████████▌              | 81 kB 9.2 MB/s eta 0:00:01[K     |███████████████████▊            | 92 kB 8.1 MB/s eta 0:00:01[K     |█████████████████████▉          | 102 kB 8.8 MB/s eta 0:00:01[K     |████████████████████████        | 112 kB 8.8 MB/s eta 0:00:01[K     |██████████████████████████▎     | 122 kB 8.8 MB/s eta 0:00:01[K     |████████████████████████████▌   | 133 kB 8.8 MB/s eta

In [None]:
# Save the collected answers without the index column, using the xlsxwriter
# engine for the workbook.
answers_body_dataframe.to_excel(
    excel_writer='Answered_Pytorch_answers_stackoverflow_with_body.xlsx',
    engine='xlsxwriter',
    index=False,
)

## Samples

In [None]:
from pprint import pprint

# Sample raw responses: two questions with their bodies, then the answers
# (with bodies) of the first of those questions.
sample_questions = SITE.fetch('questions/{ids}', ids=[64837376, 55126493], filter='withbody')
pprint(sample_questions)

sample_answers = SITE.fetch('questions/{ids}/answers', ids=[64837376], filter='withbody')
pprint(sample_answers)

{'backoff': 0,
 'has_more': False,
 'items': [{'answer_id': 66814746,
            'body': '<p>For my case system is already set to system managed '
                    'size, yet I have same error, that is because I pass a big '
                    'sized variable to multiple processes within a function. '
                    'Likely I need to set a very large paging file as Windows '
                    'cannot create it on the fly, but instead opt out to '
                    'reduce number of processes as it is not an always to be '
                    'used function.</p>\n'
                    '<p>If you are in Windows it may be better to use 1 (or '
                    'more) core less than total number of <strong>pysical '
                    'cores</strong> as multiprocessing module in python in '
                    'Windows tends to get everything as possible if you use '
                    'all and actually tries to get all '
                    '<strong>logical</strong> cor