In [1]:
import os
import sys
import random
import re
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_validate
import gensim
import nltk
import numpy as np
from collections import Counter

In [2]:
# Load the pretrained word2vec vectors once, up front, so that word2vecData()
# can reuse the module-level `w2vModel` instead of re-reading the binary file
# on every call.
w2vModel = gensim.models.KeyedVectors.load_word2vec_format(
                "Data/GoogleNews-vectors-negative300.bin", binary = True )

In [3]:
# Data preprocessing related functions.
"""Get all files name under path

Args:
    path: folder path to retrieve files' name.
    suffix: type of files
    shuffle: a boolean value. TRUE: shuffle list; False: order list.

Returns:
    filesName: a list of all files end with suffix. For example:

    ["dir/a.txt", "dir/b.txt"].
"""
def getFilesName( path, suffix = ".txt", shuffle = False ):
    print( "Retrieving files name from folder %s..." % ( path ) )
    filesName = []
    files = os.listdir( path )
    for file in files:
        if os.path.splitext( file )[1] == suffix:
            name = ''.join( [path, file] )
            filesName.append( name )
    if shuffle:
        random.shuffle( filesName )
    else:
        filesName.sort()
    return filesName

"""Preprocess data

1. Remove blank lines from each file.
2. Replace newline characters with spaces.
3. Remove duplicate spaces.

Args:
    fileName: fileName indicating which file that need to be processed.
    encoding: the encoding of inputing file. Default value is "Latin-1".

Returns:
    content: a string containing processed content from file. For example:
    
    "A cat"
"""
def preprocess( fileName, encoding = "Latin-1" ):
    print( "Preprocessing file %s..." % ( fileName ) )
    content = ""
    with open( fileName, 'r', encoding = encoding ) as f:
        line = f.readline()
        while line:
            line = re.sub( "\s", " ", line )
            content += line
            line = f.readline()
        content = re.sub( "\s+", " ", content ).strip()
    return content

"""Remove puntuations in sentences

Args:
    contents: a dictionary contains labeled sentences. For example:
    
              {"sentence 1": 1, "sentence 2": -1, ...}

Returns:
    sentences: a dictionary contains labeled sentences without puctuations.
               For example:
    
               {"sentence 1": 1, "sentence 2": -1, ...}
"""
def removePuctuation( contents, encoding = "Latin-1" ):
    sentences = {}
    for ( k, v ) in contents.items():
        k = re.sub( "_", "", k )
        k = re.sub( "[^\w\s]", "", k )
        sentences[k] = v
    return sentences

"""Get data from files

Args:
    filesName: a list of all files end with suffix.
    encoding: a string represents the encoding of files.
    
Returns:
    contents: a list of string contains all contents in files.
"""
def getData( filesName, encoding = "Latin-1" ):
    contents = []
    for fileName in filesName:
        content = preprocess( fileName, encoding = encoding )
        contents.append( content )
    return contents

In [4]:
# Data vectorization related functions.
"""Generate vectorized data from sentences by counting

In this function, here are two ways to vectorize sentences.

CountVectorizer: convert a collection of text documents to a matrix
                 of token counts.
TfidfVectorizer: convert a collection of raw documents to a matrix
                 of TF-IDF features.

By training on data, the latter one shows better accuracy and is more
stable than former one. Thus, TfidfVectorizer is used.

Args:
    contents: a dictionary contains labeled sentences. For example:
    
              {"sentence 1": 1, "sentence 2": -1, ...}.
    
    encoding: the string encoding of content. Default value is Latin-1.

Returns:
    data: a 2-D list contains vectorized sentences and label.
                  For examle:
                  
                  [[vecSentence 1, vecSentence 2],
                   [label 1, label 2]].
    
    vectorizer: a TfidfVectorizer with tunned parameters.
"""
def vectorizeData( contents, encoding = "Latin-1" ):
    contents = removePuctuation( contents )
    vectorizer = TfidfVectorizer( encoding = encoding, min_df = 3,
                                  stop_words = "english" )
    contentsVector = vectorizer.fit_transform( list( contents.keys() ) )
    data = [contentsVector.toarray(), list( contents.values() )]
    return data, vectorizer

"""Generate vectorized data from sentences by word2vec

Calculate the average word embeddings for sentences.

Args:
    contents: a dictionary contains labeled sentences. For example:
    
              {"sentence 1": 1, "sentence 2": -1, ...}
    
    encoding: the string encoding of content. Default value is Latin-1.

Returns:
    data: a 2-D list contains vectorized sentences and label.
                  For examle:
                  
                  [[vecSentence 1, vecSentence 2],
                   [label 1, label 2]].
"""
def word2vecData( contents, encoding = "Latin-1" ):
#     I put the code in the front of all code in order to speed up the
#     whole program.
#     w2vModel = gensim.models.KeyedVectors.load_word2vec_format(
#                 "Data/GoogleNews-vectors-negative300.bin", binary = True )
    w2vDict = w2vModel.vocab.keys()
    contents = removePuctuation( contents )
    sentences = list( contents.keys() )
    vsentences = []
    for sentence in sentences:
        words = sentence.split()
        vwords = []
        for word in words:
            if word in w2vDict:
                vwords.append( w2vModel[word] )
        vwords = np.mean( vwords, axis = 0 )
        vsentences.append( vwords )
    data = [np.array( vsentences ), list( contents.values() )]
    return data

"""Generate vectorized data from sentences by pos-tagging

Args:
    contents: a dictionary contains labeled sentences. For example:
    
              {"sentence 1": 1, "sentence 2": -1, ...}
    
    encoding: the string encoding of content. Default value is Latin-1.

Returns:
    data: a 2-D list contains vectorized sentences and label.
                  For examle:
                  
                  [[vecSentence 1, vecSentence 2],
                   [label 1, label 2]].
"""
def posTagData( contents, encoding = "Latin-1" ):
    sentences = list( contents.keys() )
    vPosTags = []
    allPosTags = ["CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS",
                  "LS", "MD", "NN", "NNS", "NNP", "NNPS", "PDT", "POS",
                  "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM", "TO",
                  "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT",
                  "WP", "WP$", "WRB"]
    allPosTags = {allPosTags[i]: i for i in range( len( allPosTags ) )}
    for sentence in sentences:
        posTag = nltk.pos_tag( nltk.word_tokenize( sentence ) )
        vPosTag = np.zeros( ( len( allPosTags ), ) )
        for pt in posTag:
            if pt[1] in allPosTags:
                vPosTag[allPosTags[pt[1]]] += 1
        vPosTag = vPosTag / np.sum( vPosTag )
#         vPosTag = np.tanh( ( ( vPosTag - vPosTag.mean() ) / vPosTag.std() ) )
#         vPosTag = ( vPosTag - np.min( vPosTag ) ) / \
#                     ( np.max( vPosTag ) - np.min( vPosTag ) )
        vPosTags.append( vPosTag )
    data = [np.array( vPosTags ), list( contents.values() )]
    return data

In [5]:
def gridSearch( data, noOfFirstHiddenLayer, maxIter, learningRateInit ):
    """Grid search over MLP hyper-parameters with 10-fold cross validation

    Every combination of first-hidden-layer size, max_iter and initial
    learning rate is scored by its mean cross-validated accuracy. The
    second hidden layer always has 10 units.

    Args:
        data: a 2-element list [features, labels].
        noOfFirstHiddenLayer: [start, stop, step] for the size of the
                              first hidden layer; stop is inclusive and
                              never exceeded.
        maxIter: [start, stop, step] for max_iter; stop is inclusive and
                 never exceeded.
        learningRateInit: [lowest, highest, exponentStep]. The tried
                          values are the powers of ten from lowest to
                          highest, moving exponentStep exponents at a
                          time. For example [0.001, 0.1, 1] tries 0.001,
                          0.01 and 0.1.

    Returns:
        mlp: the MLPClassifier configured with the best parameters. It is
             the instance handed to cross_validate, not one fitted here.
    """
    nf = noOfFirstHiddenLayer
    mi = maxIter
    lr = learningRateInit
    # round() before int(): int() truncates toward zero, so a log10 result
    # like -2.9999999999999996 would silently become -2.
    lowExp = int( round( np.log10( lr[0] ) ) )
    highExp = int( round( np.log10( lr[1] ) ) )
    # "stop + 1" keeps the upper bound inclusive without overshooting it
    # when ( stop - start ) is not a multiple of step.
    params = [ [x, y, 10 ** z]
                    for x in range( nf[0], nf[1] + 1, nf[2] )
                    for y in range( mi[0], mi[1] + 1, mi[2] )
                    for z in range( lowExp, highExp + 1, lr[2] ) ]
    scores = []
    mlps = []
    for param in params:
        print( param )
        mlp = MLPClassifier( ( param[0], 10 ), max_iter = param[1],
                             learning_rate_init = param[2] )
        score = cross_validate( mlp, data[0], data[1], cv = 10,
                                scoring = "accuracy" )
        print( score )
        scores.append( np.array( score["test_score"] ).mean() )
        mlps.append( mlp )
    index = np.argmax( scores )
    print( "Grid search result:" )
    print( params[index] )
    print( scores[index] )
    return mlps[index]

In [6]:
"""Main part of problem 3.1

Args:
    contents: a dictionary contains labeled sentences. For example:
    
              {"sentence 1": 1, "sentence 2": -1, ...}.

Returns:
    None.
"""
def problem3_1( contents ):
    print( "Problem 3.1:" )
    print( "Training..." )
    
    # Get word distribution.
    data, _ = vectorizeData( contents )
    mlp = gridSearch( data, [370, 390, 10], [200, 600, 200], [0.001, 0.1, 1] )
    print( "Problem 3.1 best parameters:" )
    print( mlp.get_params() )
    return mlp

In [7]:
"""Main part of problem 3.2

In this problem, word distribution is used to train the model.

Args:
    contents: a dictionary contains labeled sentences. For example:
    
              {"sentence 1": 1, "sentence 2": -1, ...}.

Returns:
    mlp: a trained MLPClassifier with tunned parameters.
"""
def problem3_2( contents, params ):
    print( "Problem 3.2:" )
    print( "Training..." )
    
    # Get word distribution.
    data, _ = vectorizeData( contents )
    
    mlp = MLPClassifier()
    mlp.set_params( **params )
    mlp.fit( data[0], data[1] )
    score = mlp.score( data[0], data[1] )
    print( "problem 3.2 accuracy:" )
    print( score )
    return mlp

"""Predict result based on model in problem 3.2

Args:
    content: a dictionary contains labeled sentences. For example:
    
              {"sentence 1": 1, "sentence 2": -1, ...}.
    
    mlp: a trained MLPClassifier with tunned parameters.
    vectorizer: a TfidfVectorizer with tunned parameters.

Returns:
    res: a list of predicted result with the same order as input.
"""
def predict3_2( contents, mlp, vectorizer ):
    print( "Problem 3.2:" )
    print( "Predicting..." )
    data = vectorizer.transform( contents.keys() )
    res = mlp.predict( data ).tolist()
    return res

In [8]:
"""Main part of problem 3.3

Args:
    contents: a dictionary contains labeled sentences. For example:
    
              {"sentence 1": 1, "sentence 2": -1, ...}.

Returns:
    mlp: a trained MLPClassifier with tunned parameters.
"""
def problem3_3( contents ):
    print( "Problem 3.3:" )
    print( "Training..." )
    data = word2vecData( contents )
    
    mlp = gridSearch( data, [50, 65, 3], [200, 600, 200], [0.001, 0.1, 1] )
    mlp.fit( data[0], data[1] )
    score = mlp.score( data[0], data[1] )
    print( "Problem 3.3 accuracy:" )
    print( score )
    print( "Problem 3.3 best parameters:" )
    print( mlp.get_params() )
    return mlp

"""Predict result based on model in problem 3.3

Args:
    content: a dictionary contains labeled sentences. For example:
    
              {"sentence 1": 1, "sentence 2": -1, ...}.
    
    mlp: a trained MLPClassifier with tunned parameters.

Returns:
    res: a list of predicted result with the same order as input.
"""
def predict3_3( contents, mlp ):
    print( "Problem 3.3:" )
    print( "Predicting..." )
    data = word2vecData( contents )
    res = mlp.predict( data[0] ).tolist()
    return res

In [9]:
"""Main part of problem 3.4

Args:
    contents: a dictionary contains labeled sentences. For example:
    
              {"sentence 1": 1, "sentence 2": -1, ...}.

Returns:
    mlp: a trained MLPClassifier with tunned parameters.
"""
def problem3_4( contents, params ):
    print( "Problem 3.4:" )
    print( "Training..." )
    data1 = word2vecData( contents )
    data2 = posTagData( contents )
    data = [np.concatenate( ( data1[0], data2[0] ),
                                    axis = 1 ),
                    data1[1]]
    mlp = MLPClassifier()
    mlp.set_params( **params )
    mlp.fit( data[0], data[1] )
    score = mlp.score( data[0], data[1] )
    print( "Problem 3.4 accuracy:" )
    print( score )
    return mlp

"""Predict result based on model in problem 3.4

Args:
    content: a dictionary contains labeled sentences. For example:
    
              {"sentence 1": 1, "sentence 2": -1, ...}.
    
    mlp: a trained MLPClassifier with tunned parameters.

Returns:
    res: a list of predicted result with the same order as input.
"""
def predict3_4( contents, mlp ):
    print( "Problem 3.4:" )
    print( "Predicting..." )
    data1 = word2vecData( contents )
    data2 = posTagData( contents )
    data = np.concatenate( ( data1[0], data2[0] ), axis = 1 )
    res = mlp.predict( data ).tolist()
    return res

In [10]:
"""Main part of problem 3.5

Predict classification by model in problem 3.2, 3.3, 3.4 and compare
them. Finally, it choose the model from problem 3.2 to predict final
result and write in files.

Args:
    contents: a dictionary contains labeled sentences. For example:
    
              {"sentence 1": 1, "sentence 2": -1, ...}.
    
    mlp: a trained MLPClassifier with tunned parameters.
    vectorizer: a TfidfVectorizer with tunned parameters.
    testFilesName: a list of string contains all test files name.

Returns:
    None.
"""
def problem3_5( contents, mlps, vectorizer, testFilesName ):
    print( "Predict result on model 3.2:" )
    pred2 = predict3_2( contents, mlps[0], vectorizer )
    print( len( [x for x in pred2 if x == 1] ) / len( pred2 ) )
    print( "Predict result on model 3.3:" )
    pred3 = predict3_3( contents, mlps[1] )
    print( len( [x for x in pred3 if x == 1] ) / len( pred3 ) )
    print( "Predict result on model 3.4:" )
    pred4 = predict3_4( contents, mlps[2] )
    print( len( [x for x in pred4 if x == 1] ) / len( pred4 ) )
    
    # Select model 3.2 due to its performance on training set
    pred = pred2
    
    # Save to files
    savePath = "Data/pred/"
    if not os.path.isdir( savePath ):
        os.makedirs( savePath )
    fpos = open( savePath + "pos.txt", "w", encoding = "UTF-8" )
    fneg = open( savePath + "neg.txt", "w", encoding = "UTF-8" )
    for file, p in zip( testFilesName, pred ):
        if p == 1:
            fpos.write( file + "\n" )
        else:
            fneg.write( file + "\n" )
    fpos.close()
    fneg.close()

In [11]:
"""Main function of the problem 3

Args:
    None.

Returns:
    None.
"""
def main():
    # Get train data from files.
    posFilesName = getFilesName( "Data/train/pos/" )
    negFilesName = getFilesName( "Data/train/neg/" )
    posContents  = getData( posFilesName )
    negContents  = getData( negFilesName )
    contents = posContents + negContents
    contents  = dict( zip( contents, [1] * len( posContents ) +
                              [-1] * len( negContents ) ) )
    _, vectorizer = vectorizeData( contents )
    
    # Problem 3.1, 3.2, 3.3, 3.4
    mlp1 = problem3_1( contents )
    mlp2 = problem3_2( contents, mlp1.get_params() )
    mlp3 = problem3_3( contents )
    mlp4 = problem3_4( contents, mlp3.get_params() )
    mlps = [mlp2, mlp3, mlp4]
    
    # Get test data from files.
    testFilesName = getFilesName( "Data/test/" )
    contents = getData( testFilesName )
    contents = dict( zip( contents, [0] * len( contents ) ) )
    
    # Problem 3.5
    pred = problem3_5( contents, mlps, vectorizer, testFilesName )

In [12]:
# Run the whole pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()

Retrieving files name from folder Data/train/pos/...
Retrieving files name from folder Data/train/neg/...
Preprocessing file Data/train/pos/cv001_tok-10180.txt...
Preprocessing file Data/train/pos/cv002_tok-12931.txt...
Preprocessing file Data/train/pos/cv003_tok-8338.txt...
Preprocessing file Data/train/pos/cv004_tok-29856.txt...
Preprocessing file Data/train/pos/cv005_tok-26110.txt...
Preprocessing file Data/train/pos/cv007_tok-14417.txt...
Preprocessing file Data/train/pos/cv009_tok-6385.txt...
Preprocessing file Data/train/pos/cv010_tok-29740.txt...
Preprocessing file Data/train/pos/cv011_tok-9168.txt...
Preprocessing file Data/train/pos/cv012_tok-13106.txt...
Preprocessing file Data/train/pos/cv013_tok-13924.txt...
Preprocessing file Data/train/pos/cv014_tok-16534.txt...
Preprocessing file Data/train/pos/cv015_tok-13846.txt...
Preprocessing file Data/train/pos/cv016_tok-9867.txt...
Preprocessing file Data/train/pos/cv017_tok-29801.txt...
Preprocessing file Data/train/pos/cv018_tok

Preprocessing file Data/train/pos/cv285_tok-13000.txt...
Preprocessing file Data/train/pos/cv286_tok-9530.txt...
Preprocessing file Data/train/pos/cv287_tok-14252.txt...
Preprocessing file Data/train/pos/cv288_tok-10926.txt...
Preprocessing file Data/train/pos/cv290_tok-25396.txt...
Preprocessing file Data/train/pos/cv291_tok-20114.txt...
Preprocessing file Data/train/pos/cv293_tok-5883.txt...
Preprocessing file Data/train/pos/cv294_tok-12972.txt...
Preprocessing file Data/train/pos/cv295_tok-13521.txt...
Preprocessing file Data/train/pos/cv296_tok-11353.txt...
Preprocessing file Data/train/pos/cv297_tok-13394.txt...
Preprocessing file Data/train/pos/cv298_tok-16139.txt...
Preprocessing file Data/train/pos/cv299_tok-19259.txt...
Preprocessing file Data/train/pos/cv301_tok-18134.txt...
Preprocessing file Data/train/pos/cv302_tok-29013.txt...
Preprocessing file Data/train/pos/cv303_tok-18517.txt...
Preprocessing file Data/train/pos/cv304_tok-10308.txt...
Preprocessing file Data/train/pos

Preprocessing file Data/train/pos/cv574_tok-28835.txt...
Preprocessing file Data/train/pos/cv576_tok-12064.txt...
Preprocessing file Data/train/pos/cv577_tok-9749.txt...
Preprocessing file Data/train/pos/cv578_tok-5046.txt...
Preprocessing file Data/train/pos/cv579_tok-7082.txt...
Preprocessing file Data/train/pos/cv580_tok-6559.txt...
Preprocessing file Data/train/pos/cv581_tok-20785.txt...
Preprocessing file Data/train/pos/cv583_tok-19290.txt...
Preprocessing file Data/train/pos/cv585_tok-29721.txt...
Preprocessing file Data/train/pos/cv586_tok-20318.txt...
Preprocessing file Data/train/pos/cv587_tok-4670.txt...
Preprocessing file Data/train/pos/cv588_tok-10492.txt...
Preprocessing file Data/train/pos/cv589_tok-17484.txt...
Preprocessing file Data/train/pos/cv591_tok-12721.txt...
Preprocessing file Data/train/pos/cv592_tok-21878.txt...
Preprocessing file Data/train/pos/cv593_tok-12180.txt...
Preprocessing file Data/train/pos/cv594_tok-11418.txt...
Preprocessing file Data/train/pos/cv

Preprocessing file Data/train/neg/cv167_tok-27407.txt...
Preprocessing file Data/train/neg/cv168_tok-4250.txt...
Preprocessing file Data/train/neg/cv169_tok-10229.txt...
Preprocessing file Data/train/neg/cv171_tok-12766.txt...
Preprocessing file Data/train/neg/cv172_tok-9076.txt...
Preprocessing file Data/train/neg/cv173_tok-11316.txt...
Preprocessing file Data/train/neg/cv174_tok-19826.txt...
Preprocessing file Data/train/neg/cv175_tok-18960.txt...
Preprocessing file Data/train/neg/cv176_tok-15880.txt...
Preprocessing file Data/train/neg/cv177_tok-29625.txt...
Preprocessing file Data/train/neg/cv178_tok-18689.txt...
Preprocessing file Data/train/neg/cv179_tok-14006.txt...
Preprocessing file Data/train/neg/cv180_tok-29281.txt...
Preprocessing file Data/train/neg/cv181_tok-29326.txt...
Preprocessing file Data/train/neg/cv183_tok-18186.txt...
Preprocessing file Data/train/neg/cv184_tok-20654.txt...
Preprocessing file Data/train/neg/cv185_tok-22786.txt...
Preprocessing file Data/train/neg

Preprocessing file Data/train/neg/cv479_tok-23674.txt...
Preprocessing file Data/train/neg/cv480_tok-15230.txt...
Preprocessing file Data/train/neg/cv481_tok-10977.txt...
Preprocessing file Data/train/neg/cv482_tok-7136.txt...
Preprocessing file Data/train/neg/cv483_tok-11054.txt...
Preprocessing file Data/train/neg/cv484_tok-25274.txt...
Preprocessing file Data/train/neg/cv486_tok-25043.txt...
Preprocessing file Data/train/neg/cv487_tok-11827.txt...
Preprocessing file Data/train/neg/cv488_tok-13847.txt...
Preprocessing file Data/train/neg/cv489_tok-4226.txt...
Preprocessing file Data/train/neg/cv490_tok-4442.txt...
Preprocessing file Data/train/neg/cv491_tok-29357.txt...
Preprocessing file Data/train/neg/cv492_tok-24854.txt...
Preprocessing file Data/train/neg/cv494_tok-11693.txt...
Preprocessing file Data/train/neg/cv495_tok-18551.txt...
Preprocessing file Data/train/neg/cv496_tok-17126.txt...
Preprocessing file Data/train/neg/cv499_tok-6649.txt...
Preprocessing file Data/train/neg/c

Problem 3.1:
Training...
[370, 200, 0.001]
{'fit_time': array([68.99185348, 73.29942989, 86.33474493, 74.02608085, 71.59301114,
       61.85407114, 76.36049485, 75.47003412, 71.96917701, 75.9751513 ]), 'score_time': array([0.03436947, 0.06968951, 0.04265308, 0.04255033, 0.07072711,
       0.04854035, 0.04228663, 0.06649232, 0.04137516, 0.04207683]), 'test_score': array([0.70175439, 0.81578947, 0.83333333, 0.8245614 , 0.76315789,
       0.78761062, 0.77678571, 0.83928571, 0.76785714, 0.83035714]), 'train_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}
[370, 200, 0.01]
{'fit_time': array([14.4040103 , 55.46999288, 60.46035361, 61.18811178, 61.02495742,
       14.14787984, 18.63991737, 15.85197926, 16.07105637, 61.73246074]), 'score_time': array([0.0352962 , 0.04225016, 0.04327726, 0.04679751, 0.03512359,
       0.04165268, 0.0336349 , 0.04268837, 0.03368592, 0.04146647]), 'test_score': array([0.75438596, 0.8245614 , 0.8245614 , 0.77192982, 0.78070175,
       0.79646018, 0.803571



{'fit_time': array([ 76.54809976,  68.11126232,  67.07739043,  73.8888905 ,
        81.84683466,  92.9786737 , 429.52104449,  69.88658929,
        88.84013295,  72.34328914]), 'score_time': array([0.02855444, 0.04505682, 0.02849483, 0.04554105, 0.03691721,
       0.04026079, 0.04482436, 0.03914785, 0.03507996, 0.0893867 ]), 'test_score': array([0.70175439, 0.80701754, 0.8245614 , 0.8245614 , 0.77192982,
       0.80530973, 0.74107143, 0.86607143, 0.75892857, 0.84821429]), 'train_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}
[380, 200, 0.01]
{'fit_time': array([16.71775627, 17.55177379, 57.78228593, 58.66104054, 17.07541704,
       17.3507967 , 53.93849063, 61.11691403, 16.88284397, 17.16318083]), 'score_time': array([0.04540753, 0.03999591, 0.07100534, 0.03726339, 0.06357408,
       0.0453403 , 0.02894449, 0.09597063, 0.02607203, 0.07580853]), 'test_score': array([0.71929825, 0.79824561, 0.8245614 , 0.79824561, 0.72807018,
       0.79646018, 0.8125    , 0.85714286, 0.80357143

{'fit_time': array([73.06801534, 80.42555976, 87.94766665, 68.78902173, 75.65273571,
       62.19180989, 71.87356544, 68.95102024, 78.8070941 , 81.32789373]), 'score_time': array([0.05322123, 0.03814507, 0.06298423, 0.02972341, 0.0535934 ,
       0.03901267, 0.03806138, 0.03789806, 0.03880072, 0.03876686]), 'test_score': array([0.70175439, 0.81578947, 0.84210526, 0.83333333, 0.78070175,
       0.79646018, 0.77678571, 0.86607143, 0.78571429, 0.83928571]), 'train_score': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])}
[390, 600, 0.01]
{'fit_time': array([17.90034771, 17.79113555, 65.87222862, 17.70132089, 17.71899486,
       15.59794688, 66.12052298, 17.61051702, 15.17219472, 17.88937497]), 'score_time': array([0.03762054, 0.02989674, 0.05605626, 0.05395174, 0.0920167 ,
       0.04740191, 0.04652858, 0.03195262, 0.05170918, 0.0377183 ]), 'test_score': array([0.76315789, 0.79824561, 0.79824561, 0.79824561, 0.78070175,
       0.80530973, 0.79464286, 0.875     , 0.75892857, 0.78571429]), '

{'fit_time': array([0.26423788, 0.50380993, 0.3533144 , 0.26896644, 0.23104739,
       0.17920399, 0.19542289, 0.36770201, 0.26642942, 0.35613322]), 'score_time': array([0.00056386, 0.00079775, 0.00057006, 0.00075865, 0.00072002,
       0.00093246, 0.00084162, 0.00080824, 0.00054479, 0.0008285 ]), 'test_score': array([0.77192982, 0.74561404, 0.85964912, 0.77192982, 0.76315789,
       0.7079646 , 0.72321429, 0.83035714, 0.75892857, 0.80357143]), 'train_score': array([0.81415929, 0.84857424, 0.84267453, 0.83185841, 0.81415929,
       0.76129666, 0.77232581, 0.82531894, 0.77919529, 0.84396467])}
[53, 200, 0.1]
{'fit_time': array([0.17317915, 0.31639624, 0.06615686, 0.32532263, 0.12080789,
       0.24380279, 0.13004351, 0.21613383, 0.09012389, 0.10506463]), 'score_time': array([0.0005796 , 0.00086284, 0.00081325, 0.00097895, 0.00072265,
       0.00056481, 0.00115776, 0.00063896, 0.00057888, 0.00055647]), 'test_score': array([0.85087719, 0.6754386 , 0.5       , 0.74561404, 0.5       ,
     

{'fit_time': array([1.48132849, 1.17749405, 0.88741302, 0.75684667, 1.29411459,
       0.77522564, 0.91146922, 0.87458658, 1.12430668, 0.96980047]), 'score_time': array([0.00056648, 0.00057006, 0.00124288, 0.00076056, 0.00116062,
       0.00075698, 0.00099707, 0.00080705, 0.00057364, 0.00057459]), 'test_score': array([0.85087719, 0.77192982, 0.84210526, 0.74561404, 0.78947368,
       0.73451327, 0.83928571, 0.80357143, 0.78571429, 0.79464286]), 'train_score': array([0.83775811, 0.83185841, 0.81120944, 0.81809243, 0.84267453,
       0.81434185, 0.83709519, 0.82630029, 0.85279686, 0.8498528 ])}
[56, 600, 0.01]
{'fit_time': array([0.20855403, 0.27238131, 0.40159965, 0.37324119, 0.15211749,
       0.23038507, 0.19999862, 0.27460861, 0.27795863, 0.37259269]), 'score_time': array([0.00071597, 0.00086045, 0.00068903, 0.00074506, 0.00069404,
       0.00058079, 0.0011797 , 0.00064039, 0.0011816 , 0.00078535]), 'test_score': array([0.74561404, 0.74561404, 0.88596491, 0.75438596, 0.70175439,
    

{'fit_time': array([0.16292691, 0.30920601, 0.0583806 , 0.04360557, 0.09852052,
       0.42761827, 0.09752941, 0.12991977, 0.08738875, 0.16848683]), 'score_time': array([0.00073862, 0.00087929, 0.00084329, 0.00060511, 0.00061011,
       0.00062728, 0.00076532, 0.00081754, 0.00109577, 0.00070667]), 'test_score': array([0.83333333, 0.76315789, 0.5       , 0.5       , 0.68421053,
       0.77876106, 0.60714286, 0.5       , 0.5       , 0.5       ]), 'train_score': array([0.81809243, 0.77974435, 0.49950836, 0.49950836, 0.67354966,
       0.8388998 , 0.60745829, 0.49950932, 0.49950932, 0.50049068])}
[62, 400, 0.001]
{'fit_time': array([1.09457588, 1.03776026, 1.3479867 , 0.99121428, 0.72011805,
       1.05068779, 1.02535987, 0.773283  , 0.52662039, 0.90827775]), 'score_time': array([0.00125647, 0.00094795, 0.00061584, 0.00064206, 0.00078988,
       0.00059462, 0.00060463, 0.00081825, 0.00079894, 0.00058651]), 'test_score': array([0.83333333, 0.73684211, 0.85087719, 0.70175439, 0.74561404,
   

{'fit_time': array([0.36411023, 0.40138197, 0.24092412, 0.2640295 , 0.30690932,
       0.243325  , 0.49747014, 0.35746241, 0.24391961, 0.04432893]), 'score_time': array([0.0008235 , 0.00059485, 0.00120878, 0.00055099, 0.00110698,
       0.00076985, 0.0007875 , 0.00121593, 0.00067592, 0.00077128]), 'test_score': array([0.8245614 , 0.72807018, 0.85087719, 0.71929825, 0.80701754,
       0.72566372, 0.83035714, 0.8125    , 0.74107143, 0.5       ]), 'train_score': array([0.83775811, 0.77187807, 0.80432645, 0.82300885, 0.85152409,
       0.8064833 , 0.84102061, 0.84887144, 0.77919529, 0.50245339])}
[65, 600, 0.1]
{'fit_time': array([0.08425379, 0.09555197, 0.13286066, 0.08640051, 0.05903721,
       0.32967854, 0.10445094, 0.26952672, 0.10746574, 0.26604533]), 'score_time': array([0.00078607, 0.00079107, 0.00082374, 0.00062084, 0.00077152,
       0.00105929, 0.00083327, 0.00074172, 0.00127912, 0.00062275]), 'test_score': array([0.5       , 0.5       , 0.5       , 0.5       , 0.5       ,
     

Preprocessing file Data/test/cv473_tok-14810.txt...
Preprocessing file Data/test/cv476_tok-12926.txt...
Preprocessing file Data/test/cv484_tok-20319.txt...
Preprocessing file Data/test/cv485_tok-4248.txt...
Preprocessing file Data/test/cv493_tok-10732.txt...
Preprocessing file Data/test/cv493_tok-9478.txt...
Preprocessing file Data/test/cv497_tok-17948.txt...
Preprocessing file Data/test/cv498_tok-12371.txt...
Preprocessing file Data/test/cv498_tok-28077.txt...
Preprocessing file Data/test/cv507_tok-22941.txt...
Preprocessing file Data/test/cv515_tok-19774.txt...
Preprocessing file Data/test/cv518_tok-11610.txt...
Preprocessing file Data/test/cv521_tok-28221.txt...
Preprocessing file Data/test/cv523_tok-14293.txt...
Preprocessing file Data/test/cv524_tok-20616.txt...
Preprocessing file Data/test/cv527_tok-28453.txt...
Preprocessing file Data/test/cv532_tok-11471.txt...
Preprocessing file Data/test/cv536_tok-10580.txt...
Preprocessing file Data/test/cv538_tok-10197.txt...
Preprocessing 