For a better version of the code, please take a look at `scrape_stack_exchange.py`

In [1]:
import os
import subprocess
import xml.etree.cElementTree as et
from multiprocessing import Pool, cpu_count
from pathlib import Path

import pandas as pd
import requests
import wget
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
# Directory that receives the downloaded archives and everything derived from them.
PATH_TO_SX_DUMP = Path('../../input/sx_dump')
# Created from Python rather than via `!mkdir -p`, so the cell does not depend
# on a POSIX shell; like `mkdir -p`, it is a no-op when the directory exists.
PATH_TO_SX_DUMP.mkdir(parents=True, exist_ok=True)

In [3]:
# Fetch the archive.org directory listing of the Stack Exchange data dump.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were
# the listing.
r = requests.get("https://archive.org/download/stackexchange", timeout=60)
r.raise_for_status()
soup = BeautifulSoup(r.content, "html.parser")


In [4]:
# Locate the table that lists the downloadable files.
listing_table = soup.find('table', class_='directory-listing-table')
# soup.find returns None when nothing matches; fail here with a readable
# message instead of an AttributeError in a later cell.
if listing_table is None:
    raise RuntimeError(
        "No <table class='directory-listing-table'> found on the listing page; "
        "the page layout may have changed."
    )

In [5]:
# Collect every anchor that carries an href attribute.
# find_all is the current Beautiful Soup name; findAll is a deprecated alias.
links = listing_table.find_all('a', href=True)
len(links)

715

**Write all links to a file**

In [6]:
# Turn each relative href into an absolute download URL, one URL per line.
# The first anchor is skipped (presumably the parent-directory link - TODO confirm).
archive_urls = [
    'https://archive.org/download/stackexchange/' + anchor['href']
    for anchor in links[1:]
]
link_list = '\n'.join(archive_urls)

In [7]:
# Persist the newline-separated URL list next to the downloaded archives.
link_list_path = PATH_TO_SX_DUMP / 'link_list.txt'
with link_list_path.open('w') as link_file:
    link_file.write(link_list)

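**For demonstration we download only the first two archives (each archive appears twice in the listing, with and without a trailing slash, so the trailing-slash duplicates are skipped)**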

You'll need 7z to extract archives.

In [8]:
# Peek at the first four URLs; only the first four separators need splitting.
link_list.split('\n', 4)[:4]

['https://archive.org/download/stackexchange/3dprinting.meta.stackexchange.com.7z',
 'https://archive.org/download/stackexchange/3dprinting.meta.stackexchange.com.7z/',
 'https://archive.org/download/stackexchange/3dprinting.stackexchange.com.7z',
 'https://archive.org/download/stackexchange/3dprinting.stackexchange.com.7z/']

In [9]:
# Download the first archives from the listing and extract each one with 7z
# into a directory named after the archive (without its ".7z" extension).
for link in tqdm(links[1:5]):
    filename = str(link['href'])
    # Entries ending in '/' are the duplicate "browse archive" links, not files.
    if filename.endswith('/'):
        continue

    print(f"Downloading {filename}...")
    url = 'https://archive.org/download/stackexchange/' + filename
    archive_path = wget.download(url, out=str(PATH_TO_SX_DUMP))

    # str.rstrip('.7z') would strip any trailing '.', '7' or 'z' characters
    # (e.g. "quiz.7z" -> "qui"), so remove the extension as a suffix instead.
    if archive_path.endswith('.7z'):
        extract_dir = archive_path[:-len('.7z')]
    else:
        extract_dir = archive_path

    # An argument list bypasses the shell, so file names containing spaces or
    # shell metacharacters are passed to 7z verbatim; check=True surfaces a
    # failed extraction instead of silently continuing.
    command = ['7z', 'e', archive_path, f'-o{extract_dir}']
    print(' '.join(command))
    subprocess.run(command, check=True)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))

Downloading 3dprinting.meta.stackexchange.com.7z...
7z e ../../input/sx_dump/3dprinting.meta.stackexchange.com.7z -o../../input/sx_dump/3dprinting.meta.stackexchange.com
Downloading 3dprinting.stackexchange.com.7z...
7z e ../../input/sx_dump/3dprinting.stackexchange.com.7z -o../../input/sx_dump/3dprinting.stackexchange.com



**Parsing a single catalog - `3dprinting.stackexchange.com/`**

In [10]:
# Directory of the extracted 3dprinting dump inside the download folder.
stackexchange_dir = PATH_TO_SX_DUMP.joinpath('3dprinting.stackexchange.com/')

In [11]:
def xml_to_pandas(root, columns, row_name='row'):
    """Build a DataFrame from the attributes of matching XML elements.

    Every descendant of ``root`` whose tag equals ``row_name`` becomes one
    row. Each entry of ``columns`` is read as an attribute of that element;
    attributes the element does not carry yield ``None``.

    Any exception raised while collecting the rows or building the frame is
    printed and swallowed, in which case ``None`` is returned.
    """
    try:
        records = []
        for element in root.findall(f'.//{row_name}'):
            records.append([element.get(column) for column in columns])
        return pd.DataFrame(records, columns=columns)
    except Exception as error:
        print('[xml_to_pandas] Exception: {}.'.format(error))
        return None

In [12]:
def parse_xml_dump(pathes):
    """Convert one extracted Stack Exchange dump into question/answer tables.

    Reads ``Users.xml`` and ``Posts.xml`` from the dump directory, splits the
    posts into questions (``PostTypeId == '1'``) and answers
    (``PostTypeId == '2'``), attaches the author's reputation and display
    name to every post and writes ``questions.tsv`` and ``answers.tsv`` into
    the output directory (created if it does not exist yet).

    Parameters
    ----------
    pathes : tuple
        ``(stackexchange_dir, output_dir)``. A single tuple argument is used
        so the function can be mapped directly with ``Pool.imap``. Each entry
        may be a ``str`` or a ``pathlib.Path``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(question_df, answer_df)``.

    Notes
    -----
    Values are the raw XML attribute values, i.e. strings, with missing
    attributes left empty. The user join is an inner merge on
    ``OwnerUserId``, so posts without a matching row in ``Users.xml`` are
    not part of the result.
    """
    stackexchange_dir, output_dir = (Path(p) for p in pathes)

    user_columns = ['Id', 'Reputation', 'DisplayName']
    post_columns = [
        'AcceptedAnswerId',
        'AnswerCount',
        'Body',
        'ClosedDate',
        'CommentCount',
        'CreationDate',
        'FavoriteCount',
        'Id',
        'LastActivityDate',
        'OwnerUserId',
        'ParentId',
        'PostTypeId',
        'Score',
        'Title',
        'ViewCount',
    ]
    question_columns = [
        'Id',
        'CreationDate',
        'Score',
        'ViewCount',
        'Body',
        'OwnerUserId',
        'LastActivityDate',
        'Title',
        'AnswerCount',
        'CommentCount',
        'FavoriteCount',
        'AcceptedAnswerId',
        'ClosedDate',
    ]
    answer_columns = [
        'Id',
        'CreationDate',
        'Score',
        'Body',
        'OwnerUserId',
        'LastActivityDate',
        'CommentCount',
        'ParentId',
    ]

    # Users: rename the columns so that the user id matches the key used by
    # posts and the remaining columns do not clash with post columns.
    users_tree = et.parse(stackexchange_dir / 'Users.xml')
    user_df = xml_to_pandas(users_tree, user_columns).rename(columns={
        'Reputation': 'user_reputation',
        'DisplayName': 'username',
        'Id': 'OwnerUserId',
    })

    posts_tree = et.parse(stackexchange_dir / 'Posts.xml')
    posts_df = xml_to_pandas(posts_tree, post_columns)

    # PostTypeId is compared as a string because XML attributes are strings.
    is_question = posts_df['PostTypeId'] == '1'
    is_answer = posts_df['PostTypeId'] == '2'
    question_df = posts_df.loc[is_question, question_columns]
    answer_df = posts_df.loc[is_answer, answer_columns]

    answer_df = answer_df.merge(user_df, on='OwnerUserId')
    question_df = question_df.merge(user_df, on='OwnerUserId')

    # Make the function usable on its own: do not rely on the caller having
    # created the output directory beforehand.
    output_dir.mkdir(parents=True, exist_ok=True)
    answer_df.to_csv(output_dir / 'answers.tsv', sep='\t', index=False)
    question_df.to_csv(output_dir / 'questions.tsv', sep='\t', index=False)

    return question_df, answer_df

In [13]:
# Root folder for the parsed TSV files.
output_dir = PATH_TO_SX_DUMP / 'stackexchange_parsed'

# Extracted dump folders end in "com"; entries with ".meta" in their name are
# left out.
dumps = [
    dump_path
    for dump_path in PATH_TO_SX_DUMP.glob('*com')
    if '.meta' not in dump_path.name
]

# One output folder per dump, named like the dump folder itself.
outputs = [output_dir / dump_path.name for dump_path in dumps]
for target in outputs:
    if not target.exists():
        os.makedirs(str(target))

In [14]:
# Parse every dump in a worker pool with one process per CPU core. imap yields
# results lazily, so tqdm can show progress; list() drains the iterator.
jobs = zip(dumps, outputs)
with Pool(processes=cpu_count()) as workers:
    progress = tqdm(workers.imap(parse_xml_dump, jobs), total=len(dumps))
    list(progress)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




**Get questions and answers**

In [15]:
# Input (extracted dump) and output (parsed TSVs) folders of the 3dprinting site.
three_d_printing_dump = PATH_TO_SX_DUMP / '3dprinting.stackexchange.com'
three_d_printing_output = (
    PATH_TO_SX_DUMP / 'stackexchange_parsed' / '3dprinting.stackexchange.com'
)

In [16]:
# parse_xml_dump takes its two paths packed into a single tuple.
three_d_printing_paths = (three_d_printing_dump, three_d_printing_output)
question_df, answer_df = parse_xml_dump(three_d_printing_paths)

In [21]:
# Show the first five questions.
question_df.head(n=5)

Unnamed: 0,Id,CreationDate,Score,ViewCount,Body,OwnerUserId,LastActivityDate,Title,AnswerCount,CommentCount,FavoriteCount,AcceptedAnswerId,ClosedDate,user_reputation,username
0,1,2016-01-12T18:45:19.963,10,303,<p>When I've printed an object I've had to cho...,16,2017-10-31T02:31:08.560,How to obtain high resolution prints in a shor...,2,6,,51,,1783,Adam Davis
1,4,2016-01-12T18:50:55.973,17,307,<p>Plastic is used in 3D FDM/FFF printing part...,16,2016-06-10T13:32:20.493,Are there any metals that exhibit a large glas...,4,0,2.0,1289,,1783,Adam Davis
2,2,2016-01-12T18:45:51.287,28,3212,"<p>I would like to buy a 3D printer, but I'm c...",20,2019-06-10T23:18:34.190,Is 3D printing safe for your health?,5,1,3.0,12,,2245,kenorb
3,6,2016-01-12T18:57:13.350,10,539,<p>My MakerBot printer supports only two filam...,20,2018-09-16T12:35:19.097,Multi-color printing with desktop 3D printer?,5,0,1.0,27,,2245,kenorb
4,11,2016-01-12T19:07:53.343,46,69868,<p>The surfaces of my printed parts using PLA ...,20,2019-05-14T19:08:09.893,How do I give 3D-printed parts in PLA a shiny ...,9,0,20.0,34,,2245,kenorb


In [17]:
# Show the first five answers.
answer_df.head(n=5)

Unnamed: 0,Id,CreationDate,Score,Body,OwnerUserId,LastActivityDate,CommentCount,ParentId,user_reputation,username
0,9,2016-01-12T18:58:04.410,17,<p>Almost all 3D printers have issues that cou...,16,2016-01-12T18:58:04.410,2,2,1783,Adam Davis
1,39,2016-01-12T20:00:22.177,15,<p>The files used to print these objects are d...,16,2016-01-12T20:00:22.177,0,33,1783,Adam Davis
2,43,2016-01-12T20:05:22.530,6,"<p>In theory, making filament is easy. You ta...",16,2019-01-18T15:13:48.737,2,38,1783,Adam Davis
3,53,2016-01-12T20:24:10.350,4,<blockquote>\n <p>parts ... I don't want to ....,16,2016-01-13T12:37:29.580,0,14,1783,Adam Davis
4,57,2016-01-12T20:30:29.773,6,<p>Using negative pressure ventilation and a s...,16,2016-01-12T20:30:29.773,0,49,1783,Adam Davis


The corresponding files are saved here:

In [20]:
!ls $three_d_printing_output

answers.tsv   questions.tsv
