In [1]:
# Assignment 1_3 Character-Level Language Modeling

# -*- coding: utf-8 -*-
import os
import re
import sys
import math
import random

In [2]:
"""Get all files name under path

Args:
    path: folder path to retrieve files' name.
    ratio: propotion of training data. Default value is 1 (100%).
    suffix: type of files
    shuffle: a boolean value. TRUE: shuffle list; False: order list.

Returns:
    filesName[:train]: a list of all files end with ".txt" for training set. For example:

    ["dir/a.txt", "dir/b.txt"].

    filesName[train:]: a list of all files end with ".txt" for held-out set. For example:

    ["dir/a.txt", "dir/b.txt"].
"""
def getFilesName( path, ratio = 1, suffix = ".txt", shuffle = False ):
    print( "Retrieving files name from folder %s..." % ( path ) )
    filesName = []
    files = os.listdir( path )
    for file in files:
        if os.path.splitext( file )[1] == suffix:
            name = '/'.join( [path, file] )
            filesName.append( name )
    if shuffle:
        random.shuffle( filesName )
    else:
        filesName.sort()
    total = len( filesName )
    train = int( total * ratio )
    return filesName[:train], filesName[train:]

In [3]:
"""Preprocess data

1. Remove blank lines from each file.
2. Replace newline characters with spaces.
3. Remove duplicate spaces.

Args:
    fileName: fileName indicating which file that need to be processed.
    encoding: the encoding of inputing file. Default value is "Latin-1".

Returns:
    content: a string containing processed content from file. For example:
    
    "A cat"
"""
def preprocess( fileName, encoding = "Latin-1" ):
    print( "Preprocessing file %s..." % ( fileName ) )
    content = ""
    with open( fileName, 'r', encoding = encoding ) as f:
        line = f.readline()
        while line:
            line = re.sub( "\s", " ", line )
            line = re.sub( "_", "", line )
            line = re.sub( "[^\w\s]", "", line )
            content += line
            line = f.readline()
        content = re.sub( "\s+", " ", content ).strip()
    return content

"""Preprocess data

4. Replace characters in training set that appear ≤ 5 times as “UNK”.

Note: The function will figure out all characters which are need to be replaced by "UNK"
      and they will be replaced when building n-gram character-level language model.

Args:
    content: string to be processed.

Returns:
    repc: a string of characters that need to be replaced with "UNK". For example:
    
    "[abc]"
"""
def unk( content ):
    d = {}
    for c in content:
        if c not in d:
            d[c] = 0
        d[c] += 1
    repc = []
    for ( k, v ) in d.items():
        if v <= 5:
            repc.append( k )
    repc = '[' + "".join( repc ) + ']'
    return repc

In [4]:
# Problem 3.1
"""Generate n-gram dictionary.

Generate n-gram dictionary based on fed string and n.

Note: replace characters in content with "?" which represents "UNK" in character-level.

Args:
    content: content used to calculate ngrams.
    repc: characters that need to be replaced with "UNK".
    n: n-gram.
    d: dictionary correspond to n-gram.

Returns:
    None.
"""
def ngrams( content, repc, n, d ):
    if len( repc ) > 2:
        content = re.sub( repc, "?", content )
    length = len( content )
    for i in range( 0, length - n + 1 ):
        k = content[i:i + n]
        if k not in d:
            d[k] = 0
        d[k] += 1

"""Build language model

Preprocess files and across all files in the directory (counted together), report the 
unigram, bigram, and trigram character counts.

Args:
    trainFiles: a list contains all training files' name.
    encoding: train data files' encoding.

Returns:
    lm: a dictionary of language model when savePath equals empty string. Its structure is:
    
    {"unigram": {"c": unigram, "t": total unigram characters},
     "bigram" : {"c": bigram,  "t": total bigram  characters},
     "trigram": {"c": trigram, "t": total trigram characters}}.
"""
def LM( trainFiles, encoding = "Latin-1"  ):
    print( "Building language modeling..." )
    lm = {"unigram": {"c": {}, "t": 0},
          "bigram" : {"c": {}, "t": 0},
          "trigram": {"c": {}, "t": 0}}
    ngram = ["unigram", "bigram", "trigram"]
    
    # preprocess data and find UNK
    print( "Counting for finding UNK.")
    content = ""
    for fileName in trainFiles:
        content += preprocess( fileName, encoding )
    repc = unk( content )
    
    # Calculate unigram, bigram, and trigram
    print( "Calculating n-grams..." )
    for fileName in trainFiles:
        content = preprocess( fileName, encoding )
        ngrams( content, repc, 1, lm["unigram"]["c"] )
        ngrams( content, repc, 2, lm["bigram" ]["c"] )
        ngrams( content, repc, 3, lm["trigram"]["c"] )
    for name in ngram:
        lm[name]["t"] = sum( lm[name]["c"].values() )
    return lm

"""Main function of problem 3.1

Across all files in the directory (counted together), report the unigram, bigram, and trigram
character counts and save them in seperate files.

Args:
    trainDataPath: train data path.
    encoding: train data files' encoding
    savePath: path to save language model.
    ratio: the proportion of the real training set comparing to whole training set

Returns:
    None
"""
def buildLM( trainDataPath = "./gutenberg", encoding = "Latin-1", savePath = "./lm", ratio = 1 ):
    ngram = ["unigram", "bigram", "trigram"]
    trainFiles, heldOutFiles = getFilesName( trainDataPath )
    lm = LM( trainFiles, encoding = encoding )
    for name in ngram:
        with open( savePath + "/" + name, "w" ) as f:
            f.write( str( lm[name]["t"] ) + "\n" )
            for ( k, v ) in lm[name]["c"].items():
                f.write( k + " " + str( v ) + "\n" )

In [5]:
# Problem 3.2
"""Linear Interplotation Smoothing

P(w_{n}|w_{n-2}w_{n-1}) = lambda3 * P(w_{n}|w_{n-2}w_{n-1}) +
                          lambda2 * P(w_{n}|w_{n-1}) +
                          lambda1 * P(w_{n})
    where lambda1 + lambda2 + lambda3 = 1.

Args:
    lm: a dictionary contains language model. Its structure is:
    
    {"unigram": {"c": unigram, "t": total unigram characters},
     "bigram" : {"c": bigram,  "t": total bigram  characters},
     "trigram": {"c": trigram, "t": total trigram characters}}.

    lambdas: a dictionary of lambda for interplotation or addLambda. Its structure is:
    
    {1: lambdaForUnigram, 2: lambdaForBigram, 3: lambdaForTrigram}.

    s: string wating for calculating unigram, bigram, and trigram.

Returns:
    p: a double number represents the final probability of P(w_{n}|w_{n-2}w_{n-1}).
"""
def interplotation( lm, lambdas, s ):
    s1 = s[2:]
    s2 = s[1:]
    s3 = s[0:]
    if s1 not in lm["unigram"]["c"]:
        p1 = lm["unigram"]["c"]["?"] / lm["unigram"]["t"]
    else:
        p1 = lm["unigram"]["c"][s1] / lm["unigram"]["t"]
    if s2 not in lm["bigram"]["c"] or s1 not in lm["unigram"]["c"]:
        p2 = 0
    else:
        p2 = lm["bigram" ]["c"][s2] / lm["unigram" ]["c"][s1]
    if s3 not in lm["trigram"]["c"] or s2 not in lm["bigram"]["c"]:
        p3 = 0
    else:
        p3 = lm["trigram"]["c"][s3] / lm["bigram"]["c"][s2]
    p = lambdas[1] * p1 + lambdas[2] * p2 + lambdas[3] * p3
    return p

"""Calculate perplexity

PP(W) = P(w_1w_2 ... w_n)^(-1/n)
      = 2^{-1 / n * sum_{i=1:n}(log2(LM(w_i|w_{i-2}w_{i-1})))}

Note: Since here is no <SOS> and <EOS> in language model, n would be the length of
      the content - 2.

Args:
    content: string content.
    lm: a dictionary contains language model. Its structure is:
    
    {"unigram": {"c": unigram, "t": total unigram characters},
     "bigram" : {"c": bigram,  "t": total bigram  characters},
     "trigram": {"c": trigram, "t": total trigram characters}}.

    **kwargs:
        func: smoothing function name on calculating P(w_i|w_{i-2}w_{i-1}), including
              func = "Interplotation" and func = "AddLambda".
        
        lambdas: a dictionary of lambda for interplotation or addLambda. Its structure is:
    
        {1: lambdaForUnigram, 2:lambdaForBigram, 3:lambdaForTrigram}
        
        When using addLambda function, only need to feed one specific lambda.
    
Returns:
    ppw: a double number represents the perplexity of the content.

Raise:
    KeyError: an error when trying to find smoothing function.
"""
def perplexity( content, lm, **kwargs ):
    length = len( content )
    log2p = 0
    if( length <= 2 ):
        raise Exception( "Too short content." )
    if "func" in kwargs:
        if kwargs["func"] == "Interplotation":
            for i in range( length - 1 ):
                p = interplotation( lm, kwargs["lambdas"], content[i:i + 3] )
                log2p += math.log2( p )
        elif kwargs["func"] == "AddLambda":
            for i in range( length - 1 ):
                p = addLambda( lm, kwargs["lambdas"], content[i:i + 3] )
                log2p += math.log2( p )
        else:
            raise Exception( "Cannot find the smoothing function." )
    else:
        raise Exception( "No smoothing function." )
    log2p *= -1 / ( length - 2 )
    ppw = 2 ** log2p
    return ppw

"""Grid search

Using grid search and held-out data set find the best lambdas for
linear interplotation smoothing.

Args:
    lambdas: a generator of lambdas which generate a dictionary of lambdas
             for unigram, bigram, trigram each time. For example:
    
    {1:0.1, 2:0.1, 3:0.8}
    
    lm: a dictionary of language model. Its structure is:
    
    {"unigram": {"c": unigram, "t": total unigram characters},
     "bigram" : {"c": bigram,  "t": total bigram  characters},
     "trigram": {"c": trigram, "t": total trigram characters}}.
    
    heldOutFiles: files name belongs to held-out data set.

Returns:
    lambdas: the best lambdas combination.
"""
def gridSearch( lambdas, lm, heldOutFiles ):
    print( "Applying grid search..." )
    minAvg = float( "inf" )
    for lambd in lambdas:
        avg = 0
        for name in heldOutFiles:
            content = preprocess( name )
            avg += perplexity( content, lm, func = "Interplotation", lambdas = lambd )
        if( avg < minAvg ):
            minAvg = avg
            bestLambda = lambd
    return bestLambda

"""Main function for Problem 3.2

Calculate the perplexity for each le in the test set using linear interpolation smoothing
method.

Note: Here I sperate the training data and held-out data by seperating number of files rather
      than seperating all content after concatenating all together and then dividing them.
      
      There are two reasons to divide data in this way
      1. It is hard and cumbersome to measure 80% of the content just on name list.
      2. If loading all content into memory at the same time, it is too time consuming and
         wastes time without obviously improvment on final language model.

Args:
    trainDataPath: train data path.
    encoding: train data files' encoding
    savePath: path to save language model. If it equals to empty string, the function returns
              language model.
    testDataPath: test data path.
    ratio: the proportion of the real training set comparing to whole training set.

Returns:
    None.
"""
def interplotationPPW( trainDataPath = "./gutenberg", encoding = "Latin-1", savePath = "./3_2",
                       testDataPath = "./test_data", ratio = 0.8 ):
    # Get new language model
    trainFiles, heldOutFiles = getFilesName( trainDataPath, ratio = ratio, shuffle = True )
    lm = LM( trainFiles, encoding = encoding )

    # Choose lambdas by grid search and perplexity
    lambdas = ( {1: x / 10, 2: y / 10, 3: ( 10 - x - y ) / 10}
                   for x in range( 1, 10, 1 ) for y in range( 1, 10 - x, 1 ) )
    lambdas = gridSearch( lambdas, lm, heldOutFiles )

    # File-PPW pair dictionary
    dfp = {}
    filesName, _ = getFilesName( testDataPath, suffix = "" )
    for fileName in filesName:
        content = preprocess( fileName )
        ppw = perplexity( content, lm, func = "Interplotation", lambdas = lambdas )
        dfp[fileName] = ppw
    fps = sorted( dfp.items(), key = lambda x: x[1], reverse = True )
    with open( savePath + "/" + "filesPerplexity.txt", 'w' ) as f:
        for fp in fps:
            f.write( fp[0].split( '/' )[-1] + ", " + str( fp[1] ) + "\n" )

In [6]:
# Problem 3.3
"""Load language model

Load language model from folder "lm" and save them into dictionary "lm".

Args:
    loadPath: language model load path.
    encoding: language model files' encoding

Returns:
    lm: a dictionary of language model. Its structure is:
    
    {"unigram": {"c": unigram, "t": total unigram characters},
     "bigram" : {"c": bigram,  "t": total bigram  characters},
     "trigram": {"c": trigram, "t": total trigram characters}}.
"""
def loadLM( loadPath = "./lm", encoding = "utf-8" ):
    lm = {}
    ngram = ["unigram", "bigram", "trigram"]
    n = 0
    # load unigram, bigram, and trigram
    for name in ngram:
        n += 1
        with open( loadPath + "/" + name, "r", encoding = encoding ) as f:
            ngram = {}
            total = 0
            line = f.readline()
            while line:
                kv = line.split( ' ' )
                if len( kv ) > 1:
                    kv[0] = line[:n]
                    kv[1] = line[n + 1:]
                    ngram[kv[0]] = int( kv[1] )
                else:
                    total = int( kv[0] )
                line = f.readline()
            lm[name] = {"c": ngram, "t": total}
    return lm

"""Add lambda smoothing

P(w_{n}|w_{n-1}w_{n-2}) = ( c(w_{n-1}w_{n-2}, w_{n}) + lambda ) /
                            ( c(w_{n-1}w_{n-1}) + lambda * V )

Args:
    lm: a dictionary of language model. Its structure is:
    
    {"unigram": {"c": unigram, "t": total unigram characters},
     "bigram" : {"c": bigram,  "t": total bigram  characters},
     "trigram": {"c": trigram, "t": total trigram characters}}.
    
    lambdas: a generator of lambdas which generate a dictionary of lambdas
             for unigram, bigram, trigram each time. For example:
    
    {1:0.1, 2:0.1, 3:0.8}
    
    s: string wating for calculating unigram, bigram, and trigram.

Returns:
    p: a double number represents the final probability of P(w_{n}|w_{n-2}w_{n-1}).
"""
def addLambda( lm, lambdas, s ):
    if s not in lm["trigram"]["c"]:
        cnt1 = 0
    else:
        cnt1 = lm["trigram"]["c"][s]
    if s[:2] not in lm["bigram"]["c"]:
        cnt2 = 0
    else:
        cnt2 = lm["bigram"]["c"][s[:2]]
    p = ( cnt1 + lambdas[3] ) / ( cnt2 + len( lm["bigram"]["c"] ) * lambdas[3] )
    return p

"""Main function for problem 3.3

Calculate the perplexity for each le in the test set using linear interpolation smoothing
method.

Args:
    trainDataPath: train data path.
    encoding: train data files' encoding
    savePath: path to save language model. If it equals to empty string, the function returns
              language model.
    testDataPath: test data path.
    ratio: the proportion of the real training set comparing to whole training set.

Returns:
    None.
"""
def addLambdaPPW( lmPath = "./lm", encoding = "Latin-1", savePath = "./3_3",
                  testDataPath = "./test_data" ):
    # Get new language model
    lm = loadLM( lmPath, encoding )
    lambdas = {3: 0.1}
    # File-PPW pair dictionary
    dfp = {}
    filesName, _ = getFilesName( testDataPath, suffix = "" )
    for fileName in filesName:
        content = preprocess( fileName )
        ppw = perplexity( content, lm, func = "AddLambda", lambdas = lambdas )
        dfp[fileName] = ppw
    fps = sorted( dfp.items(), key = lambda x: x[1], reverse = True )
    with open( savePath + "/" + "filesPerplexity.txt", 'w' ) as f:
        for fp in fps:
            f.write( fp[0].split( '/' )[-1] + ", " + str( fp[1] ) + "\n" )

In [7]:
def assignment1_3():
    # problem 3.1
    print( "Running problem 3.1" )
    buildLM( trainDataPath = "./gutenberg", encoding = "Latin-1", savePath = "./lm",
             ratio = 1 )
    # problem 3.2
    print( "Running problem 3.2" )
    interplotationPPW( trainDataPath = "./gutenberg", encoding = "Latin-1",
                       savePath = "./3_2", testDataPath = "./test_data", ratio = 0.8 )
    # problem 3.3
    print( "Running problem 3.3" )
    addLambdaPPW( lmPath = "./lm", encoding = "Latin-1", savePath = "./3_3",
                  testDataPath = "./test_data" )

In [8]:
assignment1_3()

Running problem 3.1
Retrieving files name from folder ./gutenberg...
Building language modeling...
Counting for finding UNK.
Preprocessing file ./gutenberg/austen-emma.txt...
Preprocessing file ./gutenberg/austen-persuasion.txt...
Preprocessing file ./gutenberg/austen-sense.txt...
Preprocessing file ./gutenberg/bible-kjv.txt...
Preprocessing file ./gutenberg/blake-poems.txt...
Preprocessing file ./gutenberg/bryant-stories.txt...
Preprocessing file ./gutenberg/burgess-busterbrown.txt...
Preprocessing file ./gutenberg/carroll-alice.txt...
Preprocessing file ./gutenberg/chesterton-ball.txt...
Preprocessing file ./gutenberg/chesterton-brown.txt...
Preprocessing file ./gutenberg/chesterton-thursday.txt...
Preprocessing file ./gutenberg/edgeworth-parents.txt...
Preprocessing file ./gutenberg/melville-moby_dick.txt...
Preprocessing file ./gutenberg/milton-paradise.txt...
Preprocessing file ./gutenberg/shakespeare-caesar.txt...
Preprocessing file ./gutenberg/shakespeare-hamlet.txt...
Preproces

Preprocessing file ./gutenberg/chesterton-brown.txt...
Preprocessing file ./gutenberg/austen-sense.txt...
Preprocessing file ./gutenberg/bryant-stories.txt...
Preprocessing file ./gutenberg/carroll-alice.txt...
Preprocessing file ./gutenberg/chesterton-brown.txt...
Preprocessing file ./gutenberg/austen-sense.txt...
Preprocessing file ./gutenberg/bryant-stories.txt...
Preprocessing file ./gutenberg/carroll-alice.txt...
Preprocessing file ./gutenberg/chesterton-brown.txt...
Preprocessing file ./gutenberg/austen-sense.txt...
Preprocessing file ./gutenberg/bryant-stories.txt...
Preprocessing file ./gutenberg/carroll-alice.txt...
Preprocessing file ./gutenberg/chesterton-brown.txt...
Preprocessing file ./gutenberg/austen-sense.txt...
Preprocessing file ./gutenberg/bryant-stories.txt...
Preprocessing file ./gutenberg/carroll-alice.txt...
Preprocessing file ./gutenberg/chesterton-brown.txt...
Preprocessing file ./gutenberg/austen-sense.txt...
Preprocessing file ./gutenberg/bryant-stories.txt.

Preprocessing file ./test_data/1362...
Preprocessing file ./test_data/1370...
Preprocessing file ./test_data/1377...
Preprocessing file ./test_data/1380...
Preprocessing file ./test_data/1386...
Preprocessing file ./test_data/1390...
Preprocessing file ./test_data/1405...
Preprocessing file ./test_data/1406...
Preprocessing file ./test_data/1417...
Preprocessing file ./test_data/1437...
Preprocessing file ./test_data/1448...
Preprocessing file ./test_data/1454...
Preprocessing file ./test_data/1481...
Preprocessing file ./test_data/1499...
Preprocessing file ./test_data/1504...
Preprocessing file ./test_data/1505...
Preprocessing file ./test_data/1515...
Preprocessing file ./test_data/1518...
Preprocessing file ./test_data/1555...
Preprocessing file ./test_data/1557...
Preprocessing file ./test_data/1564...
Preprocessing file ./test_data/1566...
Preprocessing file ./test_data/1573...
Preprocessing file ./test_data/1575...
Preprocessing file ./test_data/1605...
Preprocessing file ./test

Preprocessing file ./test_data/3475...
Preprocessing file ./test_data/3492...
Preprocessing file ./test_data/3493...
Preprocessing file ./test_data/3517...
Preprocessing file ./test_data/3539...
Preprocessing file ./test_data/3541...
Preprocessing file ./test_data/3554...
Preprocessing file ./test_data/3560...
Preprocessing file ./test_data/3564...
Preprocessing file ./test_data/3575...
Preprocessing file ./test_data/3582...
Preprocessing file ./test_data/3623...
Preprocessing file ./test_data/3660...
Preprocessing file ./test_data/3668...
Preprocessing file ./test_data/3673...
Preprocessing file ./test_data/3687...
Preprocessing file ./test_data/3689...
Preprocessing file ./test_data/3709...
Preprocessing file ./test_data/3727...
Preprocessing file ./test_data/3734...
Preprocessing file ./test_data/3765...
Preprocessing file ./test_data/3773...
Preprocessing file ./test_data/3784...
Preprocessing file ./test_data/3794...
Preprocessing file ./test_data/3804...
Preprocessing file ./test

Preprocessing file ./test_data/5675...
Preprocessing file ./test_data/5689...
Preprocessing file ./test_data/5704...
Preprocessing file ./test_data/5706...
Preprocessing file ./test_data/5716...
Preprocessing file ./test_data/5723...
Preprocessing file ./test_data/5725...
Preprocessing file ./test_data/5735...
Preprocessing file ./test_data/5740...
Preprocessing file ./test_data/5747...
Preprocessing file ./test_data/5750...
Preprocessing file ./test_data/5753...
Preprocessing file ./test_data/5773...
Preprocessing file ./test_data/5782...
Preprocessing file ./test_data/5813...
Preprocessing file ./test_data/5822...
Preprocessing file ./test_data/5824...
Preprocessing file ./test_data/5828...
Preprocessing file ./test_data/5833...
Preprocessing file ./test_data/5840...
Preprocessing file ./test_data/5847...
Preprocessing file ./test_data/5860...
Preprocessing file ./test_data/5877...
Preprocessing file ./test_data/5886...
Preprocessing file ./test_data/5901...
Preprocessing file ./test

Preprocessing file ./test_data/7603...
Preprocessing file ./test_data/7611...
Preprocessing file ./test_data/7613...
Preprocessing file ./test_data/7670...
Preprocessing file ./test_data/7693...
Preprocessing file ./test_data/7699...
Preprocessing file ./test_data/7718...
Preprocessing file ./test_data/7737...
Preprocessing file ./test_data/7757...
Preprocessing file ./test_data/7785...
Preprocessing file ./test_data/7793...
Preprocessing file ./test_data/7810...
Preprocessing file ./test_data/7821...
Preprocessing file ./test_data/7848...
Preprocessing file ./test_data/7872...
Preprocessing file ./test_data/7873...
Preprocessing file ./test_data/7879...
Preprocessing file ./test_data/7880...
Preprocessing file ./test_data/7889...
Preprocessing file ./test_data/7921...
Preprocessing file ./test_data/7929...
Preprocessing file ./test_data/7932...
Preprocessing file ./test_data/7938...
Preprocessing file ./test_data/7940...
Preprocessing file ./test_data/7950...
Preprocessing file ./test

Preprocessing file ./test_data/9813...
Preprocessing file ./test_data/9820...
Preprocessing file ./test_data/9823...
Preprocessing file ./test_data/9827...
Preprocessing file ./test_data/9831...
Preprocessing file ./test_data/9854...
Preprocessing file ./test_data/9856...
Preprocessing file ./test_data/9860...
Preprocessing file ./test_data/9864...
Preprocessing file ./test_data/9874...
Preprocessing file ./test_data/9881...
Preprocessing file ./test_data/9894...
Preprocessing file ./test_data/9897...
Preprocessing file ./test_data/9905...
Preprocessing file ./test_data/9915...
Preprocessing file ./test_data/9916...
Preprocessing file ./test_data/9925...
Preprocessing file ./test_data/9926...
Preprocessing file ./test_data/9932...
Preprocessing file ./test_data/9939...
Preprocessing file ./test_data/9940...
Preprocessing file ./test_data/9952...
Preprocessing file ./test_data/9959...
Preprocessing file ./test_data/9965...
Preprocessing file ./test_data/9973...
Preprocessing file ./test

Preprocessing file ./test_data/1893...
Preprocessing file ./test_data/1895...
Preprocessing file ./test_data/1898...
Preprocessing file ./test_data/1902...
Preprocessing file ./test_data/1912...
Preprocessing file ./test_data/1915...
Preprocessing file ./test_data/1927...
Preprocessing file ./test_data/1931...
Preprocessing file ./test_data/1944...
Preprocessing file ./test_data/1949...
Preprocessing file ./test_data/1958...
Preprocessing file ./test_data/1967...
Preprocessing file ./test_data/1970...
Preprocessing file ./test_data/1978...
Preprocessing file ./test_data/1994...
Preprocessing file ./test_data/2006...
Preprocessing file ./test_data/2013...
Preprocessing file ./test_data/2057...
Preprocessing file ./test_data/2065...
Preprocessing file ./test_data/2074...
Preprocessing file ./test_data/2093...
Preprocessing file ./test_data/2097...
Preprocessing file ./test_data/2149...
Preprocessing file ./test_data/2151...
Preprocessing file ./test_data/2153...
Preprocessing file ./test

Preprocessing file ./test_data/4061...
Preprocessing file ./test_data/4073...
Preprocessing file ./test_data/4079...
Preprocessing file ./test_data/4095...
Preprocessing file ./test_data/4102...
Preprocessing file ./test_data/4111...
Preprocessing file ./test_data/4125...
Preprocessing file ./test_data/4128...
Preprocessing file ./test_data/4133...
Preprocessing file ./test_data/4138...
Preprocessing file ./test_data/4145...
Preprocessing file ./test_data/4151...
Preprocessing file ./test_data/4165...
Preprocessing file ./test_data/4169...
Preprocessing file ./test_data/4182...
Preprocessing file ./test_data/4186...
Preprocessing file ./test_data/4199...
Preprocessing file ./test_data/4208...
Preprocessing file ./test_data/4216...
Preprocessing file ./test_data/4217...
Preprocessing file ./test_data/4223...
Preprocessing file ./test_data/4243...
Preprocessing file ./test_data/4254...
Preprocessing file ./test_data/4262...
Preprocessing file ./test_data/4264...
Preprocessing file ./test

Preprocessing file ./test_data/6117...
Preprocessing file ./test_data/6118...
Preprocessing file ./test_data/6139...
Preprocessing file ./test_data/6140...
Preprocessing file ./test_data/6157...
Preprocessing file ./test_data/6160...
Preprocessing file ./test_data/6172...
Preprocessing file ./test_data/6191...
Preprocessing file ./test_data/6193...
Preprocessing file ./test_data/6197...
Preprocessing file ./test_data/6214...
Preprocessing file ./test_data/6243...
Preprocessing file ./test_data/6252...
Preprocessing file ./test_data/6260...
Preprocessing file ./test_data/6264...
Preprocessing file ./test_data/6275...
Preprocessing file ./test_data/6292...
Preprocessing file ./test_data/6297...
Preprocessing file ./test_data/6298...
Preprocessing file ./test_data/6341...
Preprocessing file ./test_data/6348...
Preprocessing file ./test_data/6352...
Preprocessing file ./test_data/6358...
Preprocessing file ./test_data/6363...
Preprocessing file ./test_data/6370...
Preprocessing file ./test

Preprocessing file ./test_data/8250...
Preprocessing file ./test_data/8251...
Preprocessing file ./test_data/8255...
Preprocessing file ./test_data/8275...
Preprocessing file ./test_data/8277...
Preprocessing file ./test_data/8288...
Preprocessing file ./test_data/8301...
Preprocessing file ./test_data/8332...
Preprocessing file ./test_data/8335...
Preprocessing file ./test_data/8339...
Preprocessing file ./test_data/8341...
Preprocessing file ./test_data/8344...
Preprocessing file ./test_data/8352...
Preprocessing file ./test_data/8360...
Preprocessing file ./test_data/8375...
Preprocessing file ./test_data/8380...
Preprocessing file ./test_data/8388...
Preprocessing file ./test_data/8415...
Preprocessing file ./test_data/8430...
Preprocessing file ./test_data/8431...
Preprocessing file ./test_data/8437...
Preprocessing file ./test_data/8439...
Preprocessing file ./test_data/8445...
Preprocessing file ./test_data/8447...
Preprocessing file ./test_data/8450...
Preprocessing file ./test