# Building Machine Learning Systems with Python - Chapter 4

This code is supporting material for the book `Building Machine Learning Systems with Python` by [Willi Richert](https://www.linkedin.com/in/willirichert/) and [Luis Pedro Coelho](https://www.linkedin.com/in/luispedrocoelho/)  published by PACKT Publishing.

It is made available under the MIT License.

All code examples use Python in version...

In [1]:
import sys
sys.version

'3.6.3 |Anaconda custom (64-bit)| (default, Nov  8 2017, 15:10:56) [MSC v.1900 64 bit (AMD64)]'

In [3]:
# Use all screen real-estate
import pandas as pd  # `pd` is used below but is not imported anywhere in the visible notebook
from IPython.core.display import display, HTML

# Let the notebook container span the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Do not truncate the content of wide DataFrame columns.
# NOTE(review): recent pandas releases expect `None` instead of -1 here - confirm against the installed version.
pd.set_option('display.max_colwidth', -1)

# Downloading the data
In this chapter we will use the StackOverflow data from https://archive.org/download/stackexchange (while downloading, you have a couple of hours to contemplate whether now would be a good time to donate to the awesome archive.org :-) )
Since it is updated on a regular basis, you might get slightly different numbers. In this chapter we use this version:
```
stackoverflow.com-Posts.7z                        08-Dec-2017 22:31     11.3G
```
After downloading it, you need to unzip it with [7-Zip](http://www.7-zip.de/download.html).

# Extracting and filtering it

In [16]:
import os
import re

# TODO change before merging to master
#DATA_DIR = "data"  # put your posts-2012.xml into this directory
DATA_DIR = r'F:\Stack Exchange Data Dump - Dec 2017\stackoverflow.com-Posts'
CHART_DIR = "charts"
# Create the chart directory in a single step; `exist_ok` avoids the race
# between checking for the directory and creating it.
os.makedirs(CHART_DIR, exist_ok=True)

# The complete dump and the extract that only holds the posts of one year
fn_posts_all = os.path.join(DATA_DIR, "posts.xml")
fn_posts = os.path.join(DATA_DIR, "posts-2012.xml")
print("Reading from xml %s" % fn_posts)

# Features of all parsed posts plus the accompanying meta data
fn_filtered = os.path.join(DATA_DIR, "filtered.tsv")
print("Filtered: %s" % fn_filtered)
fn_filtered_meta = os.path.join(DATA_DIR, "filtered-meta.json")
print("Meta: %s" % fn_filtered_meta)

# The posts that survive the answer selection, again with their meta data
fn_chosen = os.path.join(DATA_DIR, "chosen.tsv")
fn_chosen_meta = os.path.join(DATA_DIR, "chosen-meta.json")

Reading from xml F:\Stack Exchange Data Dump - Dec 2017\stackoverflow.com-Posts\posts-2012.xml
Filtered: F:\Stack Exchange Data Dump - Dec 2017\stackoverflow.com-Posts\filtered.tsv
Meta: F:\Stack Exchange Data Dump - Dec 2017\stackoverflow.com-Posts\filtered-meta.json


The 59GB posts.xml contains posts from 2008 to 2017. We will use only the posts from the year 2012, which provides enough fun for now. We could simply grep on the command line, but that would take quite a while.

In [32]:
# Captures the year of the CreationDate attribute of an indented <row ...> line
year_match = re.compile(r'^\s+<row [^>]*CreationDate="(\d+)-')
# Size of the complete dump in bytes
size = os.path.getsize(fn_posts_all)

def get_year(line):
    '''
    Returns the creation year (as int) of the post stored in `line`.

    Raises a ValueError if `line` is not a <row> line carrying a CreationDate
    attribute (e.g. the XML header, the closing tag, or the empty string that
    readline() returns at the end of the file).
    '''
    m = year_match.match(line)
    if m is None:
        # fail with a readable message instead of dropping into the debugger
        raise ValueError("Not a <row> line with a CreationDate: %r" % line[:80])
    return int(m.group(1))

YEAR = 2012

# Both files are handled as raw bytes: seeking to an arbitrary byte offset is
# only well defined in binary mode (in text mode we could land in the middle of
# a multi-byte UTF-8 sequence), and the selected rows can be copied verbatim.
# The search assumes that the rows are ordered by creation date.
with open(fn_posts_all, 'rb') as fa, open(fn_posts, 'wb') as f2012:
    # first two lines are the xml header and <posts> tag
    f2012.write(b'<?xml version="1.0" encoding="utf-8"?><posts>\n')

    def first_line_after(offset):
        '''Returns the first complete line starting after byte `offset` as str.'''
        fa.seek(offset)
        fa.readline()  # go to next newline
        return fa.readline().decode('utf-8')

    right = size // 2
    delta = right

    # first find some post of YEAR
    while True:
        year = get_year(first_line_after(right))
        if year == YEAR:
            break

        delta //= 2
        if delta == 0:
            raise ValueError("No post of year %i found in %s" % (YEAR, fn_posts_all))

        if year > YEAR:
            right -= delta
        else:
            right += delta

    # then find where it starts by bisecting on the byte offset.
    # Invariant: the line probed at `lo` is older than YEAR (or `lo` is the
    # start of the file), the line probed at `hi` is of YEAR or newer.
    lo, hi = 0, right
    while hi - lo > 1:
        mid = (lo + hi) // 2
        m = year_match.match(first_line_after(mid))
        # A non-row line left of a valid row can only belong to the file
        # header, which counts as "before YEAR".
        if m is None or int(m.group(1)) < YEAR:
            lo = mid
        else:
            hi = mid

    # and write all posts of that year, scanning forward from just before the
    # first one so that no row at the boundary is lost
    fa.seek(lo)
    if lo > 0:
        fa.readline()  # skip the line we landed in, it is older than YEAR
    for raw in fa:
        m = year_match.match(raw.decode('utf-8'))
        if m is None:
            continue  # header or closing tag
        year = int(m.group(1))
        if year == YEAR:
            f2012.write(raw)
        elif year > YEAR:
            break

    # and write the closing tag
    f2012.write(b'</posts>\n')

In [None]:
from dateutil import parser as dateparser

from operator import itemgetter
from collections import defaultdict
from lxml import etree

from tqdm import tqdm_notebook as tqdm # we all love nice progress bars, don't we?

try:
    import ujson as json  # UltraJSON if available
except ImportError:
    # ujson is only a faster drop-in replacement, so fall back to the standard
    # library instead of aborting the notebook.
    import json
    print("ujson is not installed, falling back to the (slower) standard json module.")

# Number of rows, counted by hand
NUM_ROWS = 4511696

# Lookup tables that are filled while parsing
q_accepted = dict()  # id of accepted answer
q_creation = dict()  # creation datetimes of questions

# 'question': question -> [(answer Id, IsAccepted, TimeToAnswer, Score), ...]
# 'total':    questions and answers finally written
meta = dict(question=defaultdict(list), total=0)

# Regular expressions to find code snippets, links, images, and tags, which might help in 
# designing useful features
code_match = re.compile('<pre>(.*?)</pre>', re.MULTILINE | re.DOTALL)
# matches http:// as well as https:// targets
link_match = re.compile('<a href="https?://.*?".*?>(.*?)</a>', re.MULTILINE | re.DOTALL)
img_match = re.compile('<img(.*?)/>', re.MULTILINE | re.DOTALL)
tag_match = re.compile('<[^>]*>', re.MULTILINE | re.DOTALL)
whitespace_match = re.compile(r'\s+', re.MULTILINE | re.DOTALL)

def extract_features_from_body(s):
    '''
    This method creates features from the raw post. It already contains all 
    features that we will use throughout the chapter.

    `s` is the HTML body of a post; None is treated like an empty body.

    Returns the tuple (text, num_text_tokens, num_code_lines, link_count, num_images):
      * text:            body without code blocks, HTML tags and link texts that
                         are URLs themselves
      * num_text_tokens: number of spaces in the whitespace-normalized text
      * num_code_lines:  number of newlines inside <pre> blocks
      * link_count:      number of links outside of <pre> blocks
      * num_images:      number of <img .../> tags
    '''
    if s is None:
        s = ''

    num_images = len(img_match.findall(s))

    # remove source code and count how many lines
    code_blocks = code_match.findall(s)
    num_code_lines = sum(block.count('\n') for block in code_blocks)
    # sometimes source code contain links, which we don't want to count
    link_count_in_code = sum(len(link_match.findall(block)) for block in code_blocks)
    # one substitution removes all code blocks at once
    code_free_s = code_match.sub(' ', s)

    links = link_match.findall(s)
    link_count = len(links) - link_count_in_code

    html_free_s = tag_match.sub(' ', code_free_s)
    link_free_s = html_free_s

    # a link text that is a URL itself carries no useful words
    for link in links:
        if link.lower().startswith(('http://', 'https://')):
            link_free_s = link_free_s.replace(link, ' ')

    whitespace_cleaned_s = whitespace_match.sub(' ', link_free_s)
    num_text_tokens = whitespace_cleaned_s.count(' ')

    return link_free_s, num_text_tokens, num_code_lines, link_count, num_images

years = defaultdict(int)
num_questions = 0
num_answers = 0

def parsexml(fn, year=2012):
    '''
    Generator that parses the posts XML file `fn` and yields one tuple per
    question or answer created in `year`:

        (Id, ParentId, IsAccepted, TimeToAnswer, Score, Text,
         NumTextTokens, NumCodeLines, LinkCount, NumImages)

    Questions get ParentId -1, TimeToAnswer 0 and IsAccepted 0. Answers whose
    question has not been seen before are skipped. TimeToAnswer is the number
    of seconds between the creation of the question and of the answer.

    As a side effect it fills the module-level q_creation, q_accepted, years,
    num_questions, num_answers and meta['question'].
    '''
    global num_questions, num_answers

    counter = 0

    # etree.iterparse() returns a tuple (event, element). Since we request only
    # 'start' events, we pipe the result through an itemgetter that always returns
    # the 2nd result.
    it = map(itemgetter(1), etree.iterparse(fn, events=('start',)))

    # Skip the first element (the enclosing <posts> element), so that the loop
    # below only sees what is inside of it
    next(it)

    # counting from 1 makes `counter` the number of elements seen so far
    for counter, elem in enumerate(tqdm(it, total=NUM_ROWS), 1):

        if elem.tag != 'row':
            continue

        creation_date = dateparser.parse(elem.get('CreationDate'))

        # to speed up our journey, we restrict the data to posts of one year
        if creation_date.year != year:
            continue

        Id = int(elem.get('Id'))
        PostTypeId = int(elem.get('PostTypeId'))
        Score = int(elem.get('Score'))

        if PostTypeId == 1:
            num_questions += 1
            years[creation_date.year] += 1

            ParentId = -1
            TimeToAnswer = 0
            q_creation[Id] = creation_date
            accepted = elem.get('AcceptedAnswerId')
            if accepted:
                q_accepted[Id] = int(accepted)
            IsAccepted = 0

        elif PostTypeId == 2:
            num_answers += 1

            ParentId = int(elem.get('ParentId'))
            if ParentId not in q_creation:
                # question was too far in the past
                continue

            # total_seconds() also accounts for whole days, whereas .seconds
            # only holds the remainder within the last day
            TimeToAnswer = int((creation_date - q_creation[ParentId]).total_seconds())

            if ParentId in q_accepted:
                IsAccepted = int(q_accepted[ParentId] == Id)
            else:
                IsAccepted = 0

            meta['question'][ParentId].append((Id, IsAccepted, TimeToAnswer, Score))

        else:
            continue

        Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = extract_features_from_body(elem.get('Body'))

        # https://www.ibm.com/developerworks/xml/library/x-hiperfparse/
        elem.clear()
        while elem.getprevious() is not None:
            del elem.getparent()[0]

        # NOTE(review): under Python 3 the encoded Text is a bytes object, so
        # str() of it yields "b'...'" with escaped newlines - confirm that the
        # code reading the TSV expects exactly that before changing it.
        values = (Id, ParentId,
                  IsAccepted,
                  TimeToAnswer, Score,
                  Text.encode("utf-8"),
                  NumTextTokens, NumCodeLines, LinkCount, NumImages)

        yield values


    print("Found %i posts" % counter)

# Convert the XML into the TSV/JSON pair only if one of the two files is missing
if any(not os.path.exists(fn) for fn in [fn_filtered, fn_filtered_meta]):
    total = 0
    with open(fn_filtered, "w") as f:
        for values in parsexml(fn_posts):
            line = "\t".join(map(str, values))
            f.write(line + "\n")
            total += 1
    meta['total'] = total

    with open(fn_filtered_meta, "w") as f:
        json.dump(meta, f)

    # The following cells read `filtered_meta`; provide it after a fresh
    # conversion as well, not only when loading from disk
    filtered_meta = meta

    print("years:", years)
    print("#qestions: %i" % num_questions)
    print("#answers: %i" % num_answers)

else:
    print("Skipping the conversion step, loading data from %s ..." % fn_filtered_meta)
    # the context manager closes the file again after loading
    with open(fn_filtered_meta, "r") as f:
        filtered_meta = json.load(f)
    print("... done!")

Now we need to select the answers that we want to keep per question. We do this in two stages:
 * Stage 1: Select which answers to keep per question (`filter_method` lets you choose among different methods)
 * Stage 2: Filter the previously stored features according to the answers selected in stage 1

In [7]:
# In the book chapter we will use "negative_positive".
#
# "all":                 keep the question together with all of its answers
# "negative_positive":   keep the best and worst, but only if we have one with 
#                        positive and one with non-positive score
# "only_one_per_class":  only keep the lowest scoring unaccepted answer in addition to the 
#                        accepted one
# "sample_per_question": keep the accepted answer and at most MAX_ANSWERS_PER_QUESTIONS
#                        unaccepted ones per question
# "half-half":           equal share of questions without and with an accepted answer

filter_method = "negative_positive"

MAX_ANSWERS_PER_QUESTIONS = 10  # used by filter_method "sample_per_question"

NUM_QUESTION_SAMPLE = 0  # 0 switches off the sampling limit checked at the end of the loop

posts_to_keep = set()
found_questions = 0

unaccepted_scores = {}

has_q_accepted_a = {}
num_q_with_accepted_a = 0
num_q_without_accepted_a = 0

question = filtered_meta['question']
for ParentId, posts in tqdm(question.items(), total=len(question), desc="Stage 1:"):
    # keys loaded from JSON are strings, so convert before checking the value
    ParentId = int(ParentId)
    assert ParentId != -1

    if len(posts) < 2:
        continue

    AllIds = {ParentId}
    AcceptedId = None
    UnacceptedId = None
    UnacceptedIds = []
    UnacceptedScore = sys.maxsize

    NegativeScoreIds = []
    PositiveScoreIds = []

    if filter_method == "half-half":

        # each post is (Id, IsAccepted, TimeToAnswer, Score)
        has_accepted_a = any(IsAccepted for _, IsAccepted, _, _ in posts)

        has_q_accepted_a[ParentId] = has_accepted_a

        if has_accepted_a:
            if num_q_with_accepted_a < NUM_QUESTION_SAMPLE / 2:
                num_q_with_accepted_a += 1
                posts_to_keep.add(ParentId)
        else:
            if num_q_without_accepted_a < NUM_QUESTION_SAMPLE / 2:
                num_q_without_accepted_a += 1
                posts_to_keep.add(ParentId)

        if num_q_without_accepted_a + num_q_with_accepted_a > NUM_QUESTION_SAMPLE:
            assert -1 not in posts_to_keep
            break

    else:

        # first pass: collect the candidate answers of this question
        for Id, IsAccepted, TimeToAnswer, Score in posts:

            if filter_method == "all":
                AllIds.add(int(Id))

            elif filter_method == "only_one_per_class":
                if IsAccepted:
                    AcceptedId = Id
                elif Score < UnacceptedScore:
                    UnacceptedScore = Score
                    UnacceptedId = Id

            elif filter_method == "sample_per_question":
                if IsAccepted:
                    AcceptedId = Id
                else:
                    UnacceptedIds.append(Id)

            elif filter_method == "negative_positive":
                if Score <= 0:
                    NegativeScoreIds.append((Score, Id))
                else:
                    PositiveScoreIds.append((Score, Id))

            else:
                raise ValueError(filter_method)

        # second pass: decide which of the candidates to keep
        added = False
        if filter_method == "all":
            posts_to_keep.update(AllIds)
            added = True

        elif filter_method == "only_one_per_class":
            if AcceptedId is not None and UnacceptedId is not None:
                posts_to_keep.add(ParentId)
                posts_to_keep.add(AcceptedId)
                posts_to_keep.add(UnacceptedId)
                added = True

        elif filter_method == "sample_per_question":
            # UnacceptedIds is a list and never None, so test for emptiness
            if AcceptedId is not None and UnacceptedIds:
                posts_to_keep.add(ParentId)
                posts_to_keep.add(AcceptedId)
                posts_to_keep.update(UnacceptedIds[:MAX_ANSWERS_PER_QUESTIONS])
                added = True

        elif filter_method == "negative_positive":
            if PositiveScoreIds and NegativeScoreIds:
                posts_to_keep.add(ParentId)

                # (Score, Id) tuples: best positive and worst non-positive answer
                posScore, posId = max(PositiveScoreIds)
                posts_to_keep.add(posId)

                negScore, negId = min(NegativeScoreIds)
                posts_to_keep.add(negId)

                added = True

        if added:
            found_questions += 1

    if NUM_QUESTION_SAMPLE and found_questions >= NUM_QUESTION_SAMPLE:
        print("Using only a sample of %i questions" % NUM_QUESTION_SAMPLE)
        break




read: 0
kept: 0


In [9]:
already_written = set()
chosen_meta = defaultdict(dict)

# counters for the summary printed at the end
total = 0
kept = 0

with open(fn_chosen, "w") as f:
    # NOTE(review): `data` and `filtered` are not defined in the visible part of
    # the notebook; presumably `data` yields the ten tab-separated columns of
    # every line of the filtered TSV - confirm.
    for line in data(filtered):
        strId, ParentId, IsAccepted, TimeToAnswer, Score, Text, NumTextTokens, NumCodeLines, LinkCount, NumImages = line
        Text = Text.strip()

        total += 1

        Id = int(strId)
        if Id not in posts_to_keep:
            continue

        if Id in already_written:
            print(Id, "is already written")
            continue

        if kept % 100 == 0:
            print(kept)

        # setting meta info (in `chosen_meta`, the dict that is dumped below)
        post = chosen_meta[Id]
        post['ParentId'] = int(ParentId)
        post['IsAccepted'] = int(IsAccepted)
        post['TimeToAnswer'] = int(TimeToAnswer)
        post['Score'] = int(Score)
        post['NumTextTokens'] = int(NumTextTokens)
        post['NumCodeLines'] = int(NumCodeLines)
        post['LinkCount'] = int(LinkCount)
        # NOTE(review): misspelled_fraction() is not defined in the visible part of the notebook
        post['MisSpelledFraction'] = misspelled_fraction(Text)
        post['NumImages'] = int(NumImages)
        post['idx'] = kept  # index into the file

        if int(ParentId) == -1:
            # a question: make sure it has a list for its answers
            q = post
            q.setdefault('Answers', [])

            if filter_method == "half-half":
                q['HasAcceptedAnswer'] = has_q_accepted_a[Id]

        else:
            # an answer: register it with its question
            q = chosen_meta[int(ParentId)]

            if int(IsAccepted) == 1:
                assert 'HasAcceptedAnswer' not in q
                q['HasAcceptedAnswer'] = True

            q.setdefault('Answers', []).append(Id)

        f.write("%s\t%s\n" % (Id, Text))
        # remember the Id so that a duplicate line is reported and skipped
        already_written.add(Id)
        kept += 1

with open(fn_chosen_meta, "w") as fm:
    json.dump(chosen_meta, fm)

print("read:", total)
print("kept:", kept)


0

## Utility functions to access the sampled data

In [8]:
def load_meta(filename):
    '''
    Loads the meta data stored as JSON in `filename`.

    Returns the tuple (meta, id_to_idx, idx_to_id):
      * meta:      post Id (int) -> info dict of that post
      * id_to_idx: post Id -> value of the post's 'idx' entry
      * idx_to_id: the reverse mapping
    '''
    # the context manager closes the file again after loading
    with open(filename, "r") as f:
        raw_meta = json.load(f)

    # JSON only allows string keys, changing that to int
    meta = {int(key): info for key, info in raw_meta.items()}

    # map post Id to index in vectorized
    id_to_idx = {}
    # and back
    idx_to_id = {}

    for PostId, Info in meta.items():
        idx = Info['idx']
        id_to_idx[PostId] = idx
        idx_to_id[idx] = PostId

    return meta, id_to_idx, idx_to_id

# Load the meta data of the chosen posts plus the two lookup tables between
# post Id and 'idx'. This rebinds the module-level name `meta`.
meta, id_to_idx, idx_to_id = load_meta(fn_chosen_meta)

# Loading the features and labeling them

In [None]:
# Posts with ParentId -1 are questions, all other posts are answers;
# both lists hold the post ids in ascending order.
all_questions = sorted(pid for pid, info in meta.items() if info['ParentId'] == -1)
all_answers = sorted(pid for pid, info in meta.items() if info['ParentId'] != -1)

An answer is labeled as positive if it has a score greater than zero.

In [None]:
import numpy as np  # `np` is used from here on but not imported in the visible notebook

# One boolean label per answer, in the order of all_answers: True if the score is positive
Y_orig = np.asarray([meta[aid]['Score'] > 0 for aid in all_answers])

# Creating our first classifier: kNN using only LinkCount as a feature

## kNN

In [4]:
from sklearn import neighbors 
# A nearest neighbor classifier that is configured with n_neighbors=2
knn = neighbors.KNeighborsClassifier(n_neighbors=2) 
# Printing the estimator shows its parameters
print(knn)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=2, p=2,
           weights='uniform')

In [None]:
# toy training data: map {1,2,3} to 0 and {4,5,6} to 1
knn.fit([[1],[2],[3],[4],[5],[6]], [0,0,0,1,1,1])
knn.predict(1.5) 

In [None]:
# one sample with one feature, passed as 2D array as scikit-learn expects it
knn.predict([[37]])

In [None]:
# one sample with one feature, passed as 2D array as scikit-learn expects it
knn.predict([[3]])

In [None]:
# class probabilities for one sample, passed as 2D array as scikit-learn expects it
knn.predict_proba([[1.5]])

In [None]:
# class probabilities for one sample, passed as 2D array as scikit-learn expects it
knn.predict_proba([[37]])

In [None]:
# class probabilities for one sample, passed as 2D array as scikit-learn expects it
knn.predict_proba([[3.5]])

## Train a model using only LinkCount

So how good is `LinkCount`? Let's look at its histogram.

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

# one row per answer with LinkCount as its single feature
X = np.asarray([[meta[aid]['LinkCount']] for aid in all_answers])

plt.figure(figsize=(5,4), dpi=300) # width and height of the plot in inches

plt.title('LinkCount')
plt.xlabel('Value')
plt.ylabel('Occurrence')

# `density` replaces the `normed` keyword, which current matplotlib releases
# no longer accept
n, bins, patches = plt.hist(X, bins=50, density=True, alpha=0.75)
plt.xlim(0, 50)

plt.grid(True)
plt.savefig(os.path.join(CHART_DIR, 'feat_hist_linkcount.png'), bbox_inches="tight")

Ok, so most posts don't contain a link at all, but let's try nevertheless...

## Training on LinkCount

In [None]:
try:
    # Legacy location of KFold; only imported so that cells still calling the
    # old `KFold(n=..., n_folds=...)` API keep working on old installations.
    from sklearn.cross_validation import KFold
except ImportError:
    pass  # removed from current scikit-learn; this cell does not need it
from sklearn import model_selection
from sklearn.utils import shuffle

scores = []
N_FOLDS = 5
X, Y = shuffle(X, Y_orig, random_state=0)
# model_selection.KFold only takes the number of folds; the train/test indices
# are generated by split()
cv = model_selection.KFold(n_splits=N_FOLDS)

for train, test in tqdm(cv.split(X), total=N_FOLDS):
    clf = neighbors.KNeighborsClassifier()
    clf.fit(X[train], Y[train])
    scores.append(clf.score(X[test], Y[test]))

print("Mean(scores)=%.5f\tStddev(scores)=%.5f"%(np.mean(scores), np.std(scores))) 

# Using more features

In [None]:
def plot_feat_hist(data_name_list, filename=None):
    '''
    Plots one histogram per (data, name) pair of data_name_list into a grid
    with at most two columns and saves the figure into CHART_DIR.

    `filename` is mandatory for more than one histogram; for a single one it
    defaults to "feat_hist_<name>.png". Raises a ValueError if data_name_list
    is empty or if the filename is missing.
    '''
    # imported here, because `pylab` is not imported in the visible notebook
    import matplotlib.pyplot as plt

    num_plots = len(data_name_list)
    if num_plots == 0:
        raise ValueError("data_name_list must not be empty")
    if num_plots > 1 and filename is None:
        raise ValueError("filename is required when plotting more than one histogram")

    num_cols = 1 if num_plots == 1 else 2
    num_rows = (num_plots + num_cols - 1) // num_cols
    plt.figure(figsize=(5 * num_cols, 4 * num_rows), dpi=300)

    # Enumerating the data itself (instead of all grid cells) also works for
    # an odd number of histograms
    for idx, (x, name) in enumerate(data_name_list):
        plt.subplot(num_rows, num_cols, idx + 1)
        plt.title(name)
        plt.xlabel('Value')
        plt.ylabel('Occurrence')
        # the histogram of the data
        max_val = np.max(x)
        if max_val <= 1.0 or max_val > 50:
            bins = 50
        else:
            bins = int(max_val)  # the number of bins has to be an integer
        # `density` replaces the `normed` keyword of older matplotlib releases
        plt.hist(x, bins=bins, density=True, alpha=0.75)

        plt.grid(True)

    if not filename:
        filename = "feat_hist_%s.png" % data_name_list[-1][1].replace(" ", "_")

    plt.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")


plot_feat_hist([(np.asarray([[meta[aid]['NumCodeLines']] for aid in all_answers]), 'NumCodeLines'),
                (np.asarray([[meta[aid]['NumTextTokens']] for aid in all_answers]), 'NumTextTokens')],
              'feat_hist_CodeLinkes_TextTokens.png');

In [None]:
def get_features(aid, feature_names):
    '''Returns the values of `feature_names` of post `aid` as tuple, in the given order.'''
    return tuple(meta[aid][name] for name in feature_names)

X = np.asarray([get_features(aid, ['LinkCount', 'NumCodeLines', 'NumTextTokens']) for aid in all_answers])

from sklearn import model_selection
from sklearn.utils import shuffle

scores = []
N_FOLDS = 5
X, Y = shuffle(X, Y_orig, random_state=0)
# model_selection.KFold replaces the removed sklearn.cross_validation.KFold: it
# only takes the number of folds, the indices are generated by split()
cv = model_selection.KFold(n_splits=N_FOLDS)

for train, test in tqdm(cv.split(X), total=N_FOLDS):
    clf = neighbors.KNeighborsClassifier()
    clf.fit(X[train], Y[train])
    scores.append(clf.score(X[test], Y[test]))

print("Mean(scores)=%.5f\tStddev(scores)=%.5f"%(np.mean(scores), np.std(scores))) 

# If more features are good, even more features should be even better
Let's create some more text based features like average sentence and word length, how many words are CAPITALIZED or contain exclamation marks.

We simply add them to the features used so far.

In [None]:
X = np.asarray([get_features(aid, ['LinkCount', 'NumCodeLines', 'NumTextTokens', 
                                   'AvgSentLen', 'AvgWordLen', 'NumAllCaps', 
                                   'NumExclams',]) for aid in all_answers])

from sklearn import model_selection
from sklearn.utils import shuffle

scores = []
N_FOLDS = 5
X, Y = shuffle(X, Y_orig, random_state=0)
# model_selection.KFold replaces the removed sklearn.cross_validation.KFold: it
# only takes the number of folds, the indices are generated by split()
cv = model_selection.KFold(n_splits=N_FOLDS)

for train, test in tqdm(cv.split(X), total=N_FOLDS):
    clf = neighbors.KNeighborsClassifier()
    clf.fit(X[train], Y[train])
    scores.append(clf.score(X[test], Y[test]))

print("Mean(scores)=%.5f\tStddev(scores)=%.5f"%(np.mean(scores), np.std(scores))) 

In [None]:
import nltk

# Names of the features, stored as a NumPy array of strings
feature_names = np.array([
    'NumTextTokens', 'NumCodeLines', 'LinkCount',
    'AvgSentLen', 'AvgWordLen',
    'NumAllCaps', 'NumExclams',
    'NumImages',
])

def fetch_posts(fn, with_index=True, line_count=-1):
    '''
    Generator over the posts stored in the file `fn`, which holds one
    "<Id><TAB><Text>" entry per line.

    Yields (Id, Text) tuples if with_index is True, otherwise only Text. A
    positive line_count limits the number of lines that are read.
    '''
    # the context manager closes the file once the generator is done
    with open(fn, "r") as posts_file:
        for count, line in enumerate(posts_file, 1):
            if line_count > 0 and count > line_count:
                break

            # only the first tab separates Id and Text, so tabs inside of the
            # text do not break the unpacking
            Id, Text = line.split("\t", 1)
            Text = Text.strip()

            if with_index:
                yield int(Id), Text

            else:
                yield Text
            
def prepare_sent_features():
    '''
    Adds the text based features AvgSentLen, AvgWordLen, NumAllCaps and
    NumExclams to the `meta` entry of every post stored in fn_chosen.
    '''
    # fn_chosen is the "<Id><TAB><Text>" file written while filtering the posts
    for pid, text in fetch_posts(fn_chosen, with_index=True):
        # tokenize the whole text only once, it is needed for two features
        words = nltk.word_tokenize(text)

        if not text:
            meta[pid]['AvgSentLen'] = meta[pid]['AvgWordLen'] = 0
        else:
            sent_lens = [len(nltk.word_tokenize(
                sent)) for sent in nltk.sent_tokenize(text)]
            meta[pid]['AvgSentLen'] = np.mean(sent_lens)
            meta[pid]['AvgWordLen'] = np.mean([len(w) for w in words])

        meta[pid]['NumAllCaps'] = np.sum([word.isupper() for word in words])

        meta[pid]['NumExclams'] = text.count('!')


prepare_sent_features()

# get_features() requires the names of the features to extract
qa_X = np.asarray([get_features(aid, feature_names) for aid in all_answers])

In [None]:
def plot_feat_hist(data_name_list, filename=None):
    '''
    Plots feature histograms for all features specified in data_name_list

    data_name_list holds (data, name) pairs; the histograms are arranged in a
    grid with at most two columns and the figure is saved into CHART_DIR.
    If `filename` is not given, a single histogram is saved as
    "feat_hist_<name>.png" and several ones as "featu_hist.png".
    Raises a ValueError if data_name_list is empty.
    '''
    # imported here, because `pylab` is not imported in the visible notebook
    import matplotlib.pyplot as plt

    num_plots = len(data_name_list)
    if num_plots == 0:
        raise ValueError("data_name_list must not be empty")

    num_cols = 1 if num_plots == 1 else 2
    num_rows = (num_plots + num_cols - 1) // num_cols
    # only one figure is created, so no empty figure is left behind
    plt.figure(figsize=(5 * num_cols, 4 * num_rows))

    # Enumerating the data itself (instead of all grid cells) also works for
    # an odd number of histograms
    for idx, (x, name) in enumerate(data_name_list):
        plt.subplot(num_rows, num_cols, idx + 1)
        plt.title(name)
        plt.xlabel('Value')
        plt.ylabel('Fraction')
        # the histogram of the data
        max_val = np.max(x)
        if max_val <= 1.0 or max_val > 50:
            bins = 50
        else:
            bins = int(max_val)  # the number of bins has to be an integer
        # `density` replaces the `normed` keyword of older matplotlib releases
        plt.hist(x, bins=bins, density=True, alpha=0.75)

        plt.grid(True)

    if filename is None:
        if num_plots == 1:
            filename = "feat_hist_%s.png" % data_name_list[0][1].replace(" ", "_")
        else:
            filename = "featu_hist.png"

    plt.savefig(os.path.join(CHART_DIR, filename), bbox_inches="tight")
