# Building Machine Learning Systems with Python - Chapter 4

This code is supporting material for the book `Building Machine Learning Systems with Python` by [Willi Richert](https://www.linkedin.com/in/willirichert/) and [Luis Pedro Coelho](https://www.linkedin.com/in/luispedrocoelho/), published by PACKT Publishing.

It is made available under the MIT License.

All code examples use Python in version...

In [1]:
import sys
sys.version

'3.6.3 |Anaconda custom (64-bit)| (default, Nov  8 2017, 15:10:56) [MSC v.1900 64 bit (AMD64)]'

# Downloading the data
In this chapter we will use the StackOverflow data from https://archive.org/download/stackexchange (while downloading, you have a couple of hours to contemplate whether now would be a good time to donate to the awesome archive.org :-) ).
Since it is updated on a regular basis, you might get slightly different numbers. In this chapter we use this version:
```
stackoverflow.com-Posts.7z                        08-Dec-2017 22:31     11.3G
```

In [2]:
# TODO Should we download and unzip in Python?
# !pip install pylzma

# Extracting it

In [3]:
import os

# Directory that holds the extracted posts dump. Point it at your own copy,
# for example a local "data" directory.
DATA_DIR = r'F:\Stack Exchange Data Dump - Dec 2017\stackoverflow.com-Posts'
CHART_DIR = "charts"

# Derived files, all stored inside DATA_DIR: the two "filtered" files come
# first, followed by the two "chosen" files.
filtered, filtered_meta, chosen, chosen_meta = [
    os.path.join(DATA_DIR, basename)
    for basename in (
        "filtered.tsv",
        "filtered-meta.json",
        "chosen.tsv",
        "chosen-meta.json",
    )
]

In [32]:
import os
import re
from dateutil import parser as dateparser

from operator import itemgetter
from collections import defaultdict
from xml.etree import cElementTree as etree

from tqdm import tqdm_notebook as tqdm # we all love nice progress bars, don't we?

try:
    import ujson as json  # UltraJSON if available
except ImportError:
    # ujson is only an optional accelerator: the standard library module
    # offers the same dump()/load() interface, so fall back to it instead of
    # aborting the whole notebook.
    import json
    print("ujson is not installed, falling back to the (slower) standard json module.")
    
# The raw XML dump is expected inside DATA_DIR, next to the files derived
# from it, so that changing DATA_DIR relocates the input as well.
filename = os.path.join(DATA_DIR, "posts.xml")
# Number of elements in the dump (counted by hand); it is only used as the
# total of the progress bar while parsing.
NUM_ROWS = 38485045
print("Reading from xml %s" % filename)
filename_filtered = os.path.join(DATA_DIR, "filtered.tsv")
print("Filtered: %s" % filename_filtered)
filename_filtered_meta = os.path.join(DATA_DIR, "filtered-meta.json")
print("Meta: %s" % filename_filtered_meta)

# Bookkeeping that is filled in while the XML dump is being streamed.
q_creation = dict()  # question Id -> creation datetime of that question
q_accepted = dict()  # question Id -> Id of its accepted answer

# question Id -> list of (answer Id, IsAccepted, TimeToAnswer, Score) tuples
meta = defaultdict(list)

# Regular expressions used to dissect the HTML body of a post.
# NOTE(review): link_match only recognises plain "http://" targets, so links
# to "https://" pages are not counted -- confirm whether that is intended.
code_match = re.compile(r'<pre>(.*?)</pre>', re.MULTILINE | re.DOTALL)
link_match = re.compile(r'<a href="http://.*?".*?>(.*?)</a>', re.MULTILINE | re.DOTALL)
img_match = re.compile(r'<img(.*?)/>', re.MULTILINE | re.DOTALL)
tag_match = re.compile(r'<[^>]*>', re.MULTILINE | re.DOTALL)


def filter_html(s):
    """Strip code, markup and bare links from an HTML string and count features.

    Returns the tuple ``(text, num_text_tokens, num_code_lines, link_count,
    num_images)`` where

    * ``text`` is ``s`` without ``<pre>`` blocks and tags, with runs of
      spaces collapsed and newlines removed (the lines are joined without a
      separator), and without link texts that are themselves "http://" URLs,
    * ``num_text_tokens`` is the number of spaces left in ``text``,
    * ``num_code_lines`` is the number of newlines inside ``<pre>`` blocks,
    * ``link_count`` is the number of links outside of ``<pre>`` blocks,
    * ``num_images`` is the number of ``<img .../>`` tags.
    """
    num_images = len(img_match.findall(s))

    # Collect the source code snippets once; they are counted and then removed
    # from the text in a single substitution instead of once per snippet.
    code_snippets = code_match.findall(s)
    num_code_lines = sum(snippet.count('\n') for snippet in code_snippets)
    # sometimes source code contains links, which we don't want to count
    link_count_in_code = sum(len(link_match.findall(snippet))
                             for snippet in code_snippets)
    code_free_s = code_match.sub("", s)

    links = link_match.findall(s)
    link_count = len(links) - link_count_in_code

    link_free_s = re.sub(" +", " ", tag_match.sub('', code_free_s)).replace("\n", "")

    # A link whose visible text is just the URL carries no textual content.
    for link in links:
        if link.lower().startswith("http://"):
            link_free_s = link_free_s.replace(link, '')

    num_text_tokens = link_free_s.count(" ")

    return link_free_s, num_text_tokens, num_code_lines, link_count, num_images

years = defaultdict(int)  # creation year -> number of questions seen
num_questions = 0
num_answers = 0


def parsexml(filename):
    """Stream the posts XML file and yield one tuple of features per post.

    Only questions (PostTypeId 1) and answers (PostTypeId 2) are yielded.
    Answers whose question has not been seen earlier in the file are counted
    but skipped. While iterating, the module-level ``years``,
    ``num_questions``, ``num_answers``, ``q_creation``, ``q_accepted`` and
    ``meta`` are updated as a side effect.

    Each yielded tuple is ``(Id, ParentId, IsAccepted, TimeToAnswer, Score,
    Text, NumTextTokens, NumCodeLines, LinkCount, NumImages)``. For questions
    ``ParentId`` is -1 and ``TimeToAnswer`` is 0. For answers ``TimeToAnswer``
    is the full time between question and answer in seconds, truncated to an
    integer.

    NOTE(review): ``Text`` is yielded as UTF-8 encoded bytes; formatting it
    with ``str()`` produces its ``b'...'`` repr rather than the decoded text.
    """
    global num_questions, num_answers

    counter = 0

    # etree.iterparse() returns a tuple (event, element). Since we request only
    # 'start' events, we pipe the result through an itemgetter that always returns
    # the 2nd result.
    it = map(itemgetter(1), etree.iterparse(filename, events=('start',)))

    # The first element is the enclosing root element. We keep a handle on it
    # so that already processed rows can be dropped from memory.
    root = next(it)

    with tqdm(iterable=it, total=NUM_ROWS, file=sys.stdout) as pbar:

        for counter, elem in enumerate(it):
            pbar.update(1)

            if elem.tag != 'row':
                continue

            creation_date = dateparser.parse(elem.get('CreationDate'))

            Id = int(elem.get('Id'))
            PostTypeId = int(elem.get('PostTypeId'))
            Score = int(elem.get('Score'))

            if PostTypeId == 1:
                # a question
                num_questions += 1
                years[creation_date.year] += 1

                ParentId = -1
                TimeToAnswer = 0
                q_creation[Id] = creation_date
                accepted = elem.get('AcceptedAnswerId')
                if accepted:
                    q_accepted[Id] = int(accepted)
                IsAccepted = 0

            elif PostTypeId == 2:
                # an answer
                num_answers += 1

                ParentId = int(elem.get('ParentId'))
                if ParentId not in q_creation:
                    # question was too far in the past
                    continue

                # timedelta.seconds only holds the part below one day, so
                # total_seconds() is needed to keep answers that arrived days
                # later from looking like quick ones.
                time_delta = creation_date - q_creation[ParentId]
                TimeToAnswer = int(time_delta.total_seconds())

                if ParentId in q_accepted:
                    IsAccepted = int(q_accepted[ParentId] == Id)
                else:
                    IsAccepted = 0

                meta[ParentId].append((Id, IsAccepted, TimeToAnswer, Score))

            else:
                # neither question nor answer
                continue

            Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = filter_html(elem.get('Body'))

            values = (Id, ParentId,
                      IsAccepted,
                      TimeToAnswer, Score,
                      Text.encode("utf-8"),
                      NumTextTokens, NumCodeLines, LinkCount, NumImages)

            yield values

            root.clear()  # preserve memory

    print(counter)

# Run the (very long) conversion only when one of its two result files is
# missing. Either way `filtered_meta` ends up being the mapping
# question Id -> list of (answer Id, IsAccepted, TimeToAnswer, Score).
if any(not os.path.exists(fn) for fn in [filename_filtered, filename_filtered_meta]):
    with open(filename_filtered, "w") as f:
        for values in parsexml(filename):
            line = "\t".join(map(str, values))
            f.write(line + "\n")
    with open(filename_filtered_meta, "w") as f:
        json.dump(meta, f)

    print("years:", years)
    print("#qestions: %i" % num_questions)
    print("#answers: %i" % num_answers)

    # Hand the freshly collected meta data on directly. Without this,
    # `filtered_meta` would only be a dict when the conversion was skipped.
    # Note that the keys are ints here, but strings after a JSON round trip.
    filtered_meta = meta

else:
    print("Skipping the conversion step, using %s" % ' and '.join([filename_filtered, filename_filtered_meta]))
    with open(filename_filtered_meta, "r") as f:
        filtered_meta = json.load(f)

Reading from xml F:\Stack Exchange Data Dump - Dec 2017\stackoverflow.com-Posts\posts.xml
Filtered: F:\Stack Exchange Data Dump - Dec 2017\stackoverflow.com-Posts\filtered.tsv
Meta: F:\Stack Exchange Data Dump - Dec 2017\stackoverflow.com-Posts\filtered-meta.json
Skipping the conversion step, using F:\Stack Exchange Data Dump - Dec 2017\stackoverflow.com-Posts\filtered.tsv and F:\Stack Exchange Data Dump - Dec 2017\stackoverflow.com-Posts\filtered-meta.json


Now we need to grab a useful sample.

In [None]:
def data(filename, col=None):
    """Iterate over the rows of a tab-separated file with ten columns.

    Every line is stripped and split at tabs. If ``col`` is None the whole
    row is yielded as a list of strings, otherwise only the string found at
    index ``col`` (0 selects the first column).

    Raises ValueError if a line does not consist of exactly ten columns.
    """
    with open(filename, "r") as f:
        for line in f:
            fields = line.strip().split("\t")

            # check format: the unpacking fails unless there are ten columns
            (Id, ParentId, IsAccepted, TimeToAnswer, Score, Text,
             NumTextTokens, NumCodeLines, LinkCount, NumImages) = fields

            # compare against None so that column 0 can be requested as well
            if col is not None:
                yield fields[col]
            else:
                yield fields

posts_to_keep = set()
found_questions = 0

# number of questions to sample; a falsy value disables the limit for all
# methods except "half-half"
num_qestion_sample = 1000

# Using filter_method, a couple of different sampling methods can be specified.
# "all": keep the question together with all of its answers
# "negative_positive": keep the best and worst, but only if we have one with
#                      positive and one with negative score
# "only_one_per_class": only keep the lowest scoring unaccepted answer in
#                       addition to the accepted one
# "sample_per_question": keep the accepted answer plus up to
#                        MaxAnswersPerQuestions unaccepted ones per question
# "half-half": equal share of questions with and without an accepted answer
#
# "negative_positive" is the method that produces output lines of the form
# "<question>: <best>/<score> <worst>/<score>".
filter_method = "negative_positive"

MaxAnswersPerQuestions = 10  # filter_method == "sample_per_question"

unaccepted_scores = {}

# only used by filter_method == "half-half"
has_q_accepted_a = {}
num_q_with_accepted_a = 0
num_q_without_accepted_a = 0

# Walk over all questions and collect, according to filter_method, the Ids of
# the posts (questions and answers) that make up the sample.
for ParentId, posts in tqdm(filtered_meta.items()):
    # The keys are strings when the meta data was read back from JSON, so
    # normalise first; otherwise the sanity check below could never fail.
    ParentId = int(ParentId)
    assert ParentId != -1

    # questions with fewer than two answers are not considered
    if len(posts) < 2:
        continue

    AllIds = {ParentId}
    AcceptedId = None
    UnacceptedId = None
    UnacceptedIds = []
    UnacceptedScore = sys.maxsize

    NegativeScoreIds = []
    PositiveScoreIds = []

    if filter_method == "half-half":

        has_accepted_a = any(IsAccepted
                             for Id, IsAccepted, TimeToAnswer, Score in posts)

        has_q_accepted_a[ParentId] = has_accepted_a

        if has_accepted_a:
            if num_q_with_accepted_a < num_qestion_sample / 2:
                num_q_with_accepted_a += 1
                posts_to_keep.add(ParentId)
        else:
            if num_q_without_accepted_a < num_qestion_sample / 2:
                num_q_without_accepted_a += 1
                posts_to_keep.add(ParentId)

        # Both halves are capped at num_qestion_sample / 2, so the sample is
        # complete as soon as the sum reaches num_qestion_sample (a strict
        # ">" would never trigger).
        if num_q_without_accepted_a + num_q_with_accepted_a >= num_qestion_sample:
            assert -1 not in posts_to_keep
            break

    else:

        for post in posts:
            Id, IsAccepted, TimeToAnswer, Score = post

            if filter_method == "all":
                AllIds.add(int(Id))

            elif filter_method == "only_one_per_class":
                if IsAccepted:
                    AcceptedId = Id
                elif Score < UnacceptedScore:
                    # remember the lowest scoring unaccepted answer
                    UnacceptedScore = Score
                    UnacceptedId = Id

            elif filter_method == "sample_per_question":
                if IsAccepted:
                    AcceptedId = Id
                else:
                    UnacceptedIds.append(Id)

            elif filter_method == "negative_positive":
                if Score < 0:
                    NegativeScoreIds.append((Score, Id))
                elif Score > 0:
                    PositiveScoreIds.append((Score, Id))

            else:
                raise ValueError(filter_method)

        added = False
        if filter_method == "all":
            posts_to_keep.update(AllIds)
            added = True
        elif filter_method == "only_one_per_class":
            if AcceptedId is not None and UnacceptedId is not None:
                posts_to_keep.add(ParentId)
                posts_to_keep.add(AcceptedId)
                posts_to_keep.add(UnacceptedId)
                added = True

        elif filter_method == "sample_per_question":
            # UnacceptedIds is always a list, so test for "non-empty" rather
            # than "is not None" to require at least one unaccepted answer.
            if AcceptedId is not None and UnacceptedIds:
                posts_to_keep.add(ParentId)
                posts_to_keep.add(AcceptedId)
                posts_to_keep.update(UnacceptedIds[:MaxAnswersPerQuestions])
                added = True

        elif filter_method == "negative_positive":
            if PositiveScoreIds and NegativeScoreIds:
                posts_to_keep.add(ParentId)

                # best and worst answer; ties are broken by the answer Id
                posScore, posId = max(PositiveScoreIds)
                posts_to_keep.add(posId)

                negScore, negId = min(NegativeScoreIds)
                posts_to_keep.add(negId)
                print("%i: %i/%i %i/%i" % (ParentId, posId,
                      posScore, negId, negScore))
                added = True

        if added:
            found_questions += 1

    if num_qestion_sample and found_questions >= num_qestion_sample:
        break

# Copy the sampled posts into the "chosen" file and collect their meta data.
total = 0
kept = 0

already_written = set()
chosen_meta_dict = defaultdict(dict)

with open(chosen, "w") as f:
    for line in tqdm(data(filtered)):
        strId, ParentId, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = line
        Text = Text.strip()

        total += 1

        Id = int(strId)
        if Id in posts_to_keep:
            if Id in already_written:
                print(Id, "is already written")
                continue

            if kept % 100 == 0:
                print(kept)

            # setting meta info
            post = chosen_meta_dict[Id]
            post['ParentId'] = int(ParentId)
            post['IsAccepted'] = int(IsAccepted)
            post['TimeToAnswer'] = int(TimeToAnswer)
            post['Score'] = int(Score)
            post['NumTextTokens'] = int(NumTextTokens)
            post['NumCodeLines'] = int(NumCodeLines)
            post['LinkCount'] = int(LinkCount)
            post['NumImages'] = int(NumImages)
            post['idx'] = kept  # index into the file

            if int(ParentId) == -1:
                # a question: make sure it has a list for its answers
                post.setdefault('Answers', [])

                if filter_method == "half-half":
                    post['HasAcceptedAnswer'] = has_q_accepted_a[Id]

            else:
                # an answer: register it with its question
                q = chosen_meta_dict[int(ParentId)]

                if int(IsAccepted) == 1:
                    assert 'HasAcceptedAnswer' not in q
                    q['HasAcceptedAnswer'] = True

                q.setdefault('Answers', []).append(Id)

            f.write("%s\t%s\n" % (Id, Text))
            # remember the Id, otherwise the duplicate check above is dead code
            already_written.add(Id)
            kept += 1

with open(chosen_meta, "w") as fm:
    json.dump(chosen_meta_dict, fm)

print("total=", total)
print("kept=", kept)

9: 1404/1509 24161692/-2
48: 31910/127 2452/-3
174: 768/28 108827/-1
289: 1332/463 11630804/-2
622: 623/41 10492469/-1
657: 669/29 46930302/-1
683: 57833/35 735/-1
746: 747/16 751/-1
609: 759/8 307235/-1
826: 97294/11 834/-4
879: 885/194 880/-6
972: 2982/688 984/-7
1005: 37042/270 4311142/-7
1304: 3202085/157 34437734/-1
1329: 1331/57 4061622/-3
1376: 28034/10 417427/-1
1476: 13107/231 1478/-1
1762: 1771/38 23702630/-1
1854: 1857/449 34014856/-4
2027: 2028/31 4685077/-1
2123: 2133/41 2184/-2
2134: 390627/131 1804953/-8
2232: 2400/1067 26369473/-1
2349: 2360/23 2376/-2
2120: 2382/125 2284546/-3
2488: 2495/328 2517/-1
2481: 33692/3 500734/-2
2509: 2548/91 23767371/-1
2525: 5083/48 18663339/-2
2530: 2531/1976 50501/-13
2524: 151931/16 15617688/-1
2647: 2685/323 17203119/-1
2658: 2668/80 26150/-1
2688: 2741/7 2696/-1
2785: 2839/60 2794/-1
2861: 2866/53 4321549/-1
2970: 5448/14 13221/-1
3033: 3360/151 4489750/-2
3284: 3294/142 16019866/-9
3319: 3321/126 33646923/-1
3385: 3390/37 24180505/-1

26877: 26884/25 26895/-1
26825: 26912/3 28340499/-1
26947: 103554/47 26951/-2
27009: 29545/13 378439/-1
27220: 27296/93 41757896/-1
27258: 27270/6 27275/-1
27267: 9015858/141 4884006/-1
27407: 114673/8 3765184/-3
27435: 27440/117 103326/-2
27509: 416327/2184 27510/-14
27578: 19061110/146 27591/-12
26842: 30538/3 27602/-1
27745: 27755/120 45708666/-1
27758: 29068/16 27766/-1
27894: 64891/353 33367049/-2
28150: 43746/5 28191/-1
28212: 28240/4 28239/-1
28542: 293385/7 28569/-2
28713: 28728/13 17811701/-1
28739: 447740/3 28752/-1
20696: 37438/4 28801/-1
28894: 28947/21 28907/-6
29044: 29205/118 682909/-1
29061: 29083/26 29097/-2
29099: 29274/26 29217/-2
29142: 29172/224 6464547/-2
29157: 13484101/21 29215/-1
29311: 72080/4 29339/-1
29383: 29571/97 29396/-7
28029: 10627752/3 29442/-4
29699: 29727/30 29703/-2
28796: 29770/47 14046877/-4
29624: 194969/7 29771/-1
29775: 29830/6 29872/-1
29869: 29985/37 29911/-1
29890: 108075/3 29912/-1
29943: 29966/67 8797741/-1
29988: 30001/8 30007/-1
30152: 

50096: 13955428/280 22007042/-8
50389: 50401/7 5859247/-7
50450: 50459/7 50569/-1
50499: 1955163/390 14493064/-1
50467: 2487402/563 27625246/-3
50605: 50632/171 3202886/-14
50652: 51018/1 50658/-1
50744: 3677960/47 1171540/-3
50769: 50988/14 73242/-1
50831: 204085/2 50857/-2
50864: 50875/7 51204/-1
50945: 50980/216 50955/-10
50954: 50960/7 50959/-1
51010: 51042/16 263451/-3
50995: 796938/7 51312/-2
51185: 4717855/231 33127871/-2
51224: 51231/121 221748/-1
51264: 51285/5 51265/-1
51276: 51558/4 18665561/-2
51288: 51295/10 51371/-2
51320: 51331/37 51327/-1
51352: 52597/39 51384/-1
51470: 93633/121 22639704/-1
51502: 52090/23 52111/-1
51520: 51523/585 43691204/-2
51564: 51573/2 51681/-1
51582: 51623/26 765241/-1
51589: 7619803/34 17868210/-1
51574: 52691/96 2980129/-2
51572: 901660/57 51616/-2
51684: 51820/6 51789/-3
51837: 51875/9 51861/-1
51464: 99664/7 51947/-2
51927: 62082/184 39934815/-1
51964: 1632995/10 51987/-1
52080: 52101/24 52083/-1
52103: 52106/21 2642415/-1
51262: 117079/4 52

69497: 69555/25 79158/-1
69188: 69563/7 69535/-2
69565: 69600/5 69588/-1
69591: 69615/38 69593/-2
69627: 69639/13 69631/-2
69637: 69677/2 69709/-1
69843: 69988/42 69851/-1
69979: 69995/5 70025/-1
70013: 70630/447 70021/-14
69645: 782768/59 20219666/-3
70143: 70591/15 1031921/-1
70161: 653098/42 654386/-2
70272: 70296/94 17342392/-2
70303: 70375/59 70512/-1
70232: 434205/3 78440/-3
70405: 70436/110 70463/-2
70402: 90477/211 79149/-1
70455: 70525/32 39628957/-1
70453: 70507/27 70571/-4
70471: 70535/59 18496772/-3
70537: 71087/20 46753756/-1
70579: 79022/1386 25827288/-5
70575: 70634/8 11664488/-1
70529: 70657/155 8073228/-1
70653: 80008/10 10495626/-1
70689: 71399/685 19849576/-5
70781: 70810/7 155468/-1
70785: 84955/6 70920/-1
70947: 70976/130 10434668/-2
67685: 779513/4 107523/-2
70855: 15501449/319 19151582/-2
71022: 6871572/606 31914496/-3
71077: 71099/12 71101/-1
70992: 71499/14 74284/-4
71180: 71310/39 71197/-2
71074: 199319/594 37717454/-1
71254: 277661/70 71717/-2
71328: 71444/58

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800


In [None]:
from tqdm import tqdm

In [25]:
# Iterating over the tqdm object advances the bar by itself, so no manual
# pbar.update() call is needed inside the loop (it would count steps twice).
pbar = tqdm(range(100), file=sys.stdout)
for char in pbar:
    pbar.set_description("Processing %s" % char)

Processing 99: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 159.23it/s]


In [22]:
pbar.update?

In [24]:

from IPython.core.display import display, HTML
import pandas as pd  # used below but not imported by any cell visible above

# Let the notebook use the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Lift the limit on the displayed column width.
# NOTE(review): -1 meant "no limit" in older pandas releases; newer ones
# expect None instead -- confirm against the installed version.
pd.set_option('display.max_colwidth', -1)