# Stack Overflow Python Questions/Answers Building 

## Import and check total dataset

In [1]:
from google.cloud import bigquery

In [2]:
import os
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="config.json"

In [3]:
client = bigquery.Client()

In [4]:
dataset_ref = client.dataset('stackoverflow', project='bigquery-public-data')

In [5]:
type(dataset_ref)

google.cloud.bigquery.dataset.DatasetReference

In [6]:
dset = client.get_dataset(dataset_ref)

In [7]:
type(dset)

google.cloud.bigquery.dataset.Dataset

In [8]:
[x.table_id for x in client.list_tables(dset)]

['badges',
 'comments',
 'post_history',
 'post_links',
 'posts_answers',
 'posts_moderator_nomination',
 'posts_orphaned_tag_wiki',
 'posts_privilege_wiki',
 'posts_questions',
 'posts_tag_wiki',
 'posts_tag_wiki_excerpt',
 'posts_wiki_placeholder',
 'stackoverflow_posts',
 'tags',
 'users',
 'votes']

In [9]:
full_questions = client.get_table(dset.table('posts_questions'))

In [10]:
full_answers = client.get_table(dset.table('posts_answers'))

## Building questions dataset

In [11]:
full_questions.schema

[SchemaField('id', 'INTEGER', 'NULLABLE', None, (), None),
 SchemaField('title', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('body', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('accepted_answer_id', 'INTEGER', 'NULLABLE', None, (), None),
 SchemaField('answer_count', 'INTEGER', 'NULLABLE', None, (), None),
 SchemaField('comment_count', 'INTEGER', 'NULLABLE', None, (), None),
 SchemaField('community_owned_date', 'TIMESTAMP', 'NULLABLE', None, (), None),
 SchemaField('creation_date', 'TIMESTAMP', 'NULLABLE', None, (), None),
 SchemaField('favorite_count', 'INTEGER', 'NULLABLE', None, (), None),
 SchemaField('last_activity_date', 'TIMESTAMP', 'NULLABLE', None, (), None),
 SchemaField('last_edit_date', 'TIMESTAMP', 'NULLABLE', None, (), None),
 SchemaField('last_editor_display_name', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('last_editor_user_id', 'INTEGER', 'NULLABLE', None, (), None),
 SchemaField('owner_display_name', 'STRING', 'NULLABLE', None, (), None),
 Sch

In [12]:
QUERY_QUESTIONS = """
SELECT 
  q.id, 
  q.title, 
  q.body, 
  q.tags,
  q.score,
  q.accepted_answer_id,
  q.answer_count
FROM `bigquery-public-data.stackoverflow.posts_questions` AS q  
WHERE q.tags LIKE '%python%' AND q.answer_count >= 3
LIMIT 200000
"""

data_query = client.query(QUERY_QUESTIONS)
rows = data_query.result()

In [13]:
! rm questions_data.csv answers_data.csv

In [14]:
import csv

In [15]:
q_ids = []
with open('questions_data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['id', 'title', 'body', 'tags', 'score', 'accepted_answer_id', 'answer_count'])
    for row in rows:
      qid = row.id
      q_ids.append(str(qid))
      writer.writerow([qid, row.title, row.body, row.tags, row.score, row.accepted_answer_id, row.answer_count])

In [37]:
print("number of question: " + str(len(q_ids)))

number of question: 200000


## Building corresponding answers dataset

In [17]:
full_answers.schema

[SchemaField('id', 'INTEGER', 'NULLABLE', None, (), None),
 SchemaField('title', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('body', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('accepted_answer_id', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('answer_count', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('comment_count', 'INTEGER', 'NULLABLE', None, (), None),
 SchemaField('community_owned_date', 'TIMESTAMP', 'NULLABLE', None, (), None),
 SchemaField('creation_date', 'TIMESTAMP', 'NULLABLE', None, (), None),
 SchemaField('favorite_count', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('last_activity_date', 'TIMESTAMP', 'NULLABLE', None, (), None),
 SchemaField('last_edit_date', 'TIMESTAMP', 'NULLABLE', None, (), None),
 SchemaField('last_editor_display_name', 'STRING', 'NULLABLE', None, (), None),
 SchemaField('last_editor_user_id', 'INTEGER', 'NULLABLE', None, (), None),
 SchemaField('owner_display_name', 'STRING', 'NULLABLE', None, (), None),
 Schema

In [29]:
# query more than 1024k characters is not supported, so broken it into 20 parts

def get_qids_str(start):
  q_ids_list = ", ".join((q_ids[start: start+10000]))
  return "(" + q_ids_list + ")"

In [30]:
def get_rows(qids_list):
  QUERY_ANSWERS = """
  SELECT 
    a.id,
    a.parent_id,
    a.body, 
    a.score
  FROM `bigquery-public-data.stackoverflow.posts_answers` AS a 
  WHERE a.parent_id IN {}
  """.format(qids_list)

  data_query = client.query(QUERY_ANSWERS)
  return data_query.result()

In [None]:
with open('answers_data.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['id', 'parent_id', 'body', 'tags', 'score'])
    for i in range(0, 200000, 10000):
      rows = get_rows(get_qids_str(i))
      // print(rows.total_rows)
      for row in rows:
        writer.writerow([row.id, row.parent_id, row.body, row.score])

## Checking the built datasets

In [38]:
import pandas as pd

In [39]:
dataset_questions = pd.read_csv('questions_data.csv')

In [40]:
dataset_questions

Unnamed: 0,id,title,body,tags,score,accepted_answer_id,answer_count
0,17605659,Why does it take longer to import a function f...,<p>Consider:</p>\n\n<pre><code>&gt;&gt;&gt; ti...,python|performance|python-import|python-internals,25,17605700.0,3
1,17244049,Finding label location in a DataFrame Index,<p>I have a pandas dataframe:</p>\n\n<pre><cod...,python|pandas,17,17244095.0,3
2,17220308,how to find all ip addresses between 2 ip addr...,<p>Can anyone think of an algorithm to put all...,python|ipv4,-5,17220389.0,3
3,17295086,Python joining current directory and parent di...,<p>I want to do join the current directory pat...,python|os.path,29,58559790.0,3
4,17170752,Python OpenCV load image from byte string,<p>I'm trying to load image from string like a...,python|image|opencv|byte,41,17170855.0,3
...,...,...,...,...,...,...,...
199995,61698471,Reguler Expresion in python,<p>I have this text:</p>\n\n<pre><code>DIAGNOS...,python|regex,-1,61698537.0,3
199996,61697471,How to sort multiple list in python,<p>I am trying to sort multiple lists based on...,python|python-3.x|sorting,-1,61698929.0,4
199997,61823204,Python: How to store multiple values for one key,<p>I am new to Python… I have set of lookup ke...,python,-1,61824759.0,3
199998,61657099,How to call a group of functions,<p>I don't know how to do it so that the resul...,python|function|tkinter,-1,61657427.0,3


In [41]:
dataset_answers = pd.read_csv('answers_data.csv')

In [42]:
dataset_answers

Unnamed: 0,id,parent_id,body,tags,score
0,12523376,4830856,"<p>It also works with <a href=""http://en.wikip...",33,
1,12647556,12647471,<p>As I told you in a comment to a previous an...,34,
2,12064662,2965271,<p>You can force people to use keyword argumen...,37,
3,12056240,4096506,<p>I had trouble with Paulo's method (see my c...,39,
4,12588805,1408940,<p>Oraculum has got it right. You shouldn't be...,41,
...,...,...,...,...,...
793120,35051301,35050938,<p>You started well. Your code:</p>\n\n<pre><c...,0,
793121,35097016,35096817,<p>Here is my solution with recursion. This is...,0,
793122,35003952,35003905,"<p>To answer this directly,</p>\n\n<pre><code>...",0,
793123,35023707,35023651,"<p>how about:</p>\n\n<pre><code>text = ""AANGCT...",0,
