In [64]:
import os
import sys
import random
import re
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_validate
import gensim
from collections import Counter

In [84]:
def getFilesName( path, suffix = ".txt", shuffle = False ):
    """List the files directly under a folder that carry a given extension.

    Args:
        path: folder to scan. A trailing path separator is optional.
        suffix: file extension to keep, including the leading dot.
        shuffle: a boolean value. True: the list is put in random order with
                 random.shuffle; False: the list is sorted.

    Returns:
        filesName: a list with `path` joined to the name of every entry whose
        extension equals `suffix`. For example:

        ["dir/a.txt", "dir/b.txt"].
    """
    print( "Retrieving files name from folder %s..." % ( path ) )
    # os.path.join adds a separator only when `path` does not already end with
    # one, so both "dir" and "dir/" give "dir/a.txt".
    filesName = [os.path.join( path, entry )
                 for entry in os.listdir( path )
                 if os.path.splitext( entry )[1] == suffix]
    if shuffle:
        random.shuffle( filesName )
    else:
        filesName.sort()
    return filesName

def preprocess( fileName, encoding = "Latin-1" ):
    """Read a text file and normalise it into a single line of words.

    1. Remove underscores and every character that is neither a word
       character nor whitespace (i.e. punctuation).
    2. Collapse every run of whitespace, newlines and blank lines included,
       into one space.
    3. Strip leading and trailing spaces.

    Args:
        fileName: path of the file that needs to be processed.
        encoding: the encoding of the input file. Default value is "Latin-1".

    Returns:
        content: a string containing the processed content of the file.
        For example:

        "A cat"
    """
    print( "Preprocessing file %s..." % ( fileName ) )
    with open( fileName, 'r', encoding = encoding ) as f:
        content = f.read()
    # Raw strings keep the regex escapes away from Python's own string-escape
    # handling, which warns about sequences such as a backslash followed by s.
    content = re.sub( r"[^\w\s]|_", "", content )
    return re.sub( r"\s+", " ", content ).strip()

def getTrainingData( filesName, encoding = "Latin-1" ):
    """Load and preprocess every file in a list.

    Args:
        filesName: a list of file paths to read.
        encoding: a string naming the encoding of the files; it is forwarded
                  to preprocess for each file.

    Returns:
        contents: a list of strings, one preprocessed document per entry of
        filesName, in the same order.
    """
    return [preprocess( path, encoding = encoding ) for path in filesName]


def generateTrainingData( contents, encoding = "Latin-1" ):
    """Vectorize labeled sentences into bag-of-words training data.

    Args:
        contents: a dictionary containing labeled sentences. For example:

                  {"sentence 1": 1, "sentence 2": -1, ...}

        encoding: the string encoding of content. Default value is Latin-1.

    Returns:
        A pair (trainingData, featureNames).

        trainingData: a 2-element list holding the vectorized sentences and
                      their labels, row i of the first matching label i of
                      the second. For example:

                      [[vecSentence 1, vecSentence 2],
                       [label 1, label 2]]

        featureNames: a list with the vocabulary terms kept by the
                      vectorizer, one per column of the vectorized sentences.
    """
    vectorizer = CountVectorizer( encoding = encoding, min_df = 3, stop_words = "english" )
    sentences = list( contents.keys() )
    # Labels are looked up per sentence so they stay aligned with the rows.
    labels = [contents[sentence] for sentence in sentences]
    contentsVector = vectorizer.fit_transform( sentences )
    # get_feature_names() is gone from current scikit-learn releases; prefer
    # its replacement and fall back only when running against an old version.
    if hasattr( vectorizer, "get_feature_names_out" ):
        featureNames = [str( name ) for name in vectorizer.get_feature_names_out()]
    else:
        featureNames = list( vectorizer.get_feature_names() )
    return [contentsVector, labels], featureNames

In [85]:
def problem3_1( trainingData, traingDict ):
    """Cross-validate a multi-layer perceptron on the training data and print the result.

    Args:
        trainingData: a 2-element sequence; element 0 is passed to
                      cross_validate as the samples, element 1 as the labels.
        traingDict: not used by this function.

    Returns:
        None. The dictionary returned by cross_validate is printed.
    """
    print( "Problem 3.1:" )
    print( "Training..." )
    samples, labels = trainingData[0], trainingData[1]
    hiddenLayerSizes = ( 380, 10 )
    classifier = MLPClassifier( hiddenLayerSizes )
    # 10-fold cross-validation scored by accuracy, train scores included.
    cvResults = cross_validate( classifier, samples, labels, cv = 10,
                                scoring = "accuracy", return_train_score = True )
    print( cvResults )

In [86]:
def main():
    """Build the labeled training set from the pos/neg folders and run problem 3.1.

    Files under "Data/train/pos/" are labeled 1 and files under
    "Data/train/neg/" are labeled -1. The size of the vectorizer vocabulary is
    printed before the classifier is evaluated.
    """
    # Both folders are listed before any file is read.
    posFiles = getFilesName( "Data/train/pos/" )
    negFiles = getFilesName( "Data/train/neg/" )
    posDocs = getTrainingData( posFiles )
    negDocs = getTrainingData( negFiles )
    # NOTE: documents are used as dictionary keys, so identical texts collapse
    # into one entry, and a text present in both folders keeps the -1 label.
    labeledDocs = dict.fromkeys( posDocs, 1 )
    labeledDocs.update( dict.fromkeys( negDocs, -1 ) )
    trainingData, featureNames = generateTrainingData( labeledDocs )
    print( len( featureNames ) )
    problem3_1( trainingData, featureNames )

In [None]:
# Run the whole pipeline only when this code is executed as the top-level
# module; nothing is executed when the definitions are merely imported.
if __name__ == "__main__":
    main()

Retrieving files name from folder Data/train/pos/...
Retrieving files name from folder Data/train/neg/...
Preprocessing file Data/train/pos/cv001_tok-10180.txt...
Preprocessing file Data/train/pos/cv002_tok-12931.txt...
Preprocessing file Data/train/pos/cv003_tok-8338.txt...
Preprocessing file Data/train/pos/cv004_tok-29856.txt...
Preprocessing file Data/train/pos/cv005_tok-26110.txt...
Preprocessing file Data/train/pos/cv007_tok-14417.txt...
Preprocessing file Data/train/pos/cv009_tok-6385.txt...
Preprocessing file Data/train/pos/cv010_tok-29740.txt...
Preprocessing file Data/train/pos/cv011_tok-9168.txt...
Preprocessing file Data/train/pos/cv012_tok-13106.txt...
Preprocessing file Data/train/pos/cv013_tok-13924.txt...
Preprocessing file Data/train/pos/cv014_tok-16534.txt...
Preprocessing file Data/train/pos/cv015_tok-13846.txt...
Preprocessing file Data/train/pos/cv016_tok-9867.txt...
Preprocessing file Data/train/pos/cv017_tok-29801.txt...
Preprocessing file Data/train/pos/cv018_tok

Preprocessing file Data/train/pos/cv270_tok-29235.txt...
Preprocessing file Data/train/pos/cv271_tok-11240.txt...
Preprocessing file Data/train/pos/cv272_tok-13223.txt...
Preprocessing file Data/train/pos/cv273_tok-10626.txt...
Preprocessing file Data/train/pos/cv275_tok-12167.txt...
Preprocessing file Data/train/pos/cv276_tok-24215.txt...
Preprocessing file Data/train/pos/cv277_tok-10654.txt...
Preprocessing file Data/train/pos/cv278_tok-17924.txt...
Preprocessing file Data/train/pos/cv279_tok-15969.txt...
Preprocessing file Data/train/pos/cv280_tok-27724.txt...
Preprocessing file Data/train/pos/cv282_tok-7439.txt...
Preprocessing file Data/train/pos/cv284_tok-10073.txt...
Preprocessing file Data/train/pos/cv285_tok-13000.txt...
Preprocessing file Data/train/pos/cv286_tok-9530.txt...
Preprocessing file Data/train/pos/cv287_tok-14252.txt...
Preprocessing file Data/train/pos/cv288_tok-10926.txt...
Preprocessing file Data/train/pos/cv290_tok-25396.txt...
Preprocessing file Data/train/pos

Preprocessing file Data/train/pos/cv583_tok-19290.txt...
Preprocessing file Data/train/pos/cv585_tok-29721.txt...
Preprocessing file Data/train/pos/cv586_tok-20318.txt...
Preprocessing file Data/train/pos/cv587_tok-4670.txt...
Preprocessing file Data/train/pos/cv588_tok-10492.txt...
Preprocessing file Data/train/pos/cv589_tok-17484.txt...
Preprocessing file Data/train/pos/cv591_tok-12721.txt...
Preprocessing file Data/train/pos/cv592_tok-21878.txt...
Preprocessing file Data/train/pos/cv593_tok-12180.txt...
Preprocessing file Data/train/pos/cv594_tok-11418.txt...
Preprocessing file Data/train/pos/cv595_tok-7281.txt...
Preprocessing file Data/train/pos/cv596_tok-29066.txt...
Preprocessing file Data/train/pos/cv597_tok-16324.txt...
Preprocessing file Data/train/pos/cv598_tok-18159.txt...
Preprocessing file Data/train/pos/cv600_tok-23338.txt...
Preprocessing file Data/train/pos/cv601_tok-10066.txt...
Preprocessing file Data/train/pos/cv602_tok-22232.txt...
Preprocessing file Data/train/pos

Preprocessing file Data/train/neg/cv200_tok-14899.txt...
Preprocessing file Data/train/neg/cv201_tok-16077.txt...
Preprocessing file Data/train/neg/cv202_tok-15873.txt...
Preprocessing file Data/train/neg/cv203_tok-7773.txt...
Preprocessing file Data/train/neg/cv204_tok-6937.txt...
Preprocessing file Data/train/neg/cv205_tok-9909.txt...
Preprocessing file Data/train/neg/cv206_tok-15604.txt...
Preprocessing file Data/train/neg/cv207_tok-29808.txt...
Preprocessing file Data/train/neg/cv208_tok-17138.txt...
Preprocessing file Data/train/neg/cv209_tok-16563.txt...
Preprocessing file Data/train/neg/cv211_tok-20701.txt...
Preprocessing file Data/train/neg/cv212_tok-29271.txt...
Preprocessing file Data/train/neg/cv213_tok-10363.txt...
Preprocessing file Data/train/neg/cv214_tok-11934.txt...
Preprocessing file Data/train/neg/cv215_tok-21565.txt...
Preprocessing file Data/train/neg/cv216_tok-27832.txt...
Preprocessing file Data/train/neg/cv217_tok-29465.txt...
Preprocessing file Data/train/neg/

Preprocessing file Data/train/neg/cv450_tok-13677.txt...
Preprocessing file Data/train/neg/cv452_tok-18656.txt...
Preprocessing file Data/train/neg/cv453_tok-17410.txt...
Preprocessing file Data/train/neg/cv454_tok-11816.txt...
Preprocessing file Data/train/neg/cv455_tok-12222.txt...
Preprocessing file Data/train/neg/cv456_tok-29420.txt...
Preprocessing file Data/train/neg/cv457_tok-28466.txt...
Preprocessing file Data/train/neg/cv458_tok-19400.txt...
Preprocessing file Data/train/neg/cv460_tok-5367.txt...
Preprocessing file Data/train/neg/cv461_tok-17064.txt...
Preprocessing file Data/train/neg/cv462_tok-9843.txt...
Preprocessing file Data/train/neg/cv464_tok-26838.txt...
Preprocessing file Data/train/neg/cv465_tok-29619.txt...
Preprocessing file Data/train/neg/cv469_tok-15267.txt...
Preprocessing file Data/train/neg/cv470_tok-17819.txt...
Preprocessing file Data/train/neg/cv471_tok-18174.txt...
Preprocessing file Data/train/neg/cv472_tok-11943.txt...
Preprocessing file Data/train/neg

14123
