In [1]:
import os
import sys
import random
import subprocess

In [2]:
"""Get all files name under path

Args:
    path: folder path to retrieve files' name.
    ratio: propotion of training data. Default value is 1 (100%).
    suffix: type of files
    shuffle: a boolean value. TRUE: shuffle list; False: order list.

Returns:
    filesName: a list of all files end with suffix. For example:

    ["dir/a.txt", "dir/b.txt"].
"""
def getFilesName( path, ratio = 1, suffix = ".txt", shuffle = False ):
    print( "Retrieving files name from folder %s..." % ( path ) )
    filesName = []
    files = os.listdir( path )
    for file in files:
        if os.path.splitext( file )[1] == suffix:
            name = ''.join( [path, file] )
            filesName.append( name )
    if shuffle:
        random.shuffle( filesName )
    else:
        filesName.sort()
    return filesName

In [3]:
"""Get content from files

Extract all sentences from files, which words are less than
or equal to 50 and return a list containing all of them.

Args:
    filesName: a list of all files containing contents.
    encoding: the encoding of all files.

Returns:
    contents: a list of string contains all sentences.
"""
def getContents( filesName, encoding = "UTF-8" ):
    print( "Extracting contents..." )
    contents = []
    for fileName in filesName:
        with open( fileName, 'r', encoding = encoding ) as f:
            line = f.readline()
            while line:
                words = line.split()
                length = len( words )
                # Some lines contain only a '.' or even nothing.
                if 2 <= length and length <= 50:
                    contents.append( line.strip() )
                line = f.readline()
    return contents

In [4]:
"""Write contents to files

Write contents to specific files.

Args:
    contents: a list of string contains all sentences.
    savePath: path that contents will save in.
    fileName: the file name that contents will save to.

Returns:
    None.
"""
def writeContents( contents, savePath = "./save/", fileName = "sentences.txt",
                   encoding = "UTF-8" ):
    print( "Writing contents to " + savePath + fileName + "..." )
    if not os.path.isdir( savePath ):
        os.makedirs( savePath )
    with open( savePath + fileName, 'w', encoding = encoding ) as f:
        for content in contents:
            if content[-1] != '.' and content[-1] != '?':
                content += "."
            f.write( content + "\n" )

In [5]:
"""Parse 

Parse contents in files by calling script from standford parse tools.

Args:
    inputFileName: file name which is wating to parsed.
    inputPath: path of input file.
    saveFileName: file name which result will write to.
    savePath: path of save file.

Returns:
    None.
"""
def parse( inputFileName, inputPath = "../../Problem1/save/", saveFileName = "output",
           savePath = "../../Problem1/save" ):
    print( "Parsing" )
    sh_pos = ['"./lexparser-POS.sh"', '"' + inputPath + inputFileName + '"',
              '> ', '"' + savePath + saveFileName + '-POS"']
    sh_pen = ['"./lexparser-PENN.sh"', '"' + inputPath + inputFileName + '"',
              '> ', '"' + savePath + saveFileName + '-PENN"']
    sh_dep = ['"./lexparser-DEPENDENCY.sh"', '"' + inputPath + inputFileName + '"',
              '> ', '"' + savePath + saveFileName + '-DEPENDENCY"']
    ret_pos = subprocess.Popen( ' '.join( sh_pos ), shell = True,
                                  cwd = "../Tools/stanford-parser-full-2018-10-17/" )
    ret_pen = subprocess.Popen( ' '.join( sh_pen ), shell = True,
                                  cwd = "../Tools/stanford-parser-full-2018-10-17/" )
    ret_dep = subprocess.Popen( ' '.join( sh_dep ), shell = True,
                                  cwd = "../Tools/stanford-parser-full-2018-10-17/" )
    ret_pos.wait()
    ret_pen.wait()
    ret_dep.wait()

In [6]:
def problem1_3( filesName, encoding = "UTF-8" ):
    for fileName in filesName:
        contents = getContents( [fileName], encoding = encoding )
        fileName = fileName.split( '/' )[-1].split( '.' )[0]
        newFileName = fileName + "-less50"
        writeContents( contents, "./save/less50/", newFileName )
        print( "Parsing" )
        path = "../../Problem1/save/less50/"
        sh_pen = ['"./lexparser-DEPENDENCY.sh"', '"' + path + newFileName + '"',
                  '> ', '"' + path + newFileName + '-DEPENDENCY"']
        print( ' '.join( sh_pen ) )
        ret_pen = subprocess.Popen( ' '.join( sh_pen ), shell = True,
                                      cwd = "../Tools/stanford-parser-full-2018-10-17/" )
        ret_pen.wait()
#         with open( "save/less50/" + newFileName + "-DEPENDENCY", 'r' ) as f:
#             line = f.readline()
#             while line:
#                 words = line.strip().split()
#                 lien = f.readline()
#             pass

In [7]:
def verbPercentage( fileName, encoding = "UTF-8" ):
    verbs = {}
    with open( fileName, 'r', encoding = encoding ) as f:
        line = f.readline()
        total = 0
        cnt = 0
        while line:
            if line.strip():
                cnt += 1
            words = line.strip().split()
            for word in words:
                word_tag = word.split( '/' )
                if len( word_tag ) == 2 and word_tag[1][0] == "V":
                    if word_tag[1] not in verbs:
                        verbs[word_tag[1]] = 1
                    total += 1
            line = f.readline()
    return total / cnt, list( verbs.keys() )

In [8]:
def noOfSentenceParsed( fileName, encoding = "UTF-8" ):
    with open( fileName, 'r', encoding = encoding ) as f:
        cnt = 0
        line = f.readline()
        while line:
            if line.strip()[:5] == "(ROOT":
                cnt += 1
            line = f.readline()
    return cnt

In [9]:
def main():
    filesName = getFilesName( "Data/corpus/" )
#     contents = getContents( filesName )
#     writeContents( contents )
#     parse( "sentences.txt" )
    problem1_3( filesName )
#     print( verbPercentage( "./save/output-POS-UTF8" ) )
#     print( noOfSentenceParsed( "./save/output-PENN-UTF8" ) )

In [10]:
if __name__ == "__main__":
    main()

Retrieving files name from folder Data/corpus/...
Extracting contents...
Writing contents to ./save/less50/0006-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/0006-less50" >  "../../Problem1/save/less50/0006-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/0219-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/0219-less50" >  "../../Problem1/save/less50/0219-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/0381-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/0381-less50" >  "../../Problem1/save/less50/0381-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/0403-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/0403-less50" >  "../../Problem1/save/less50/0403-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/0404-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less5

Extracting contents...
Writing contents to ./save/less50/3884-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/3884-less50" >  "../../Problem1/save/less50/3884-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/4195-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/4195-less50" >  "../../Problem1/save/less50/4195-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/4362-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/4362-less50" >  "../../Problem1/save/less50/4362-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/4469-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/4469-less50" >  "../../Problem1/save/less50/4469-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/4580-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/4580-less50" >  "../../Problem1/save/less50/4580

Extracting contents...
Writing contents to ./save/less50/8376-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/8376-less50" >  "../../Problem1/save/less50/8376-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/8576-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/8576-less50" >  "../../Problem1/save/less50/8576-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/8649-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/8649-less50" >  "../../Problem1/save/less50/8649-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/8756-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/8756-less50" >  "../../Problem1/save/less50/8756-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/8764-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/8764-less50" >  "../../Problem1/save/less50/8764