This code runs only on Linux; it does not run on any version of Windows.

In [1]:
import os
import sys
import random
import subprocess

In [2]:
"""Get all files name under path

Args:
    path: folder path to retrieve files' name.
    suffix: type of files
    shuffle: a boolean value. TRUE: shuffle list; False: order list.

Returns:
    filesName: a list of all files end with suffix. For example:

    ["dir/a.txt", "dir/b.txt"].
"""
def getFilesName( path, suffix = ".txt", shuffle = False ):
    print( "Retrieving files name from folder %s..." % ( path ) )
    filesName = []
    files = os.listdir( path )
    for file in files:
        if os.path.splitext( file )[1] == suffix:
            name = ''.join( [path, file] )
            filesName.append( name )
    if shuffle:
        random.shuffle( filesName )
    else:
        filesName.sort()
    return filesName

In [3]:
"""Get content from files

Extract all sentences from files, which words are less than
or equal to 50 and return a list containing all of them.

Args:
    filesName: a list of all files containing contents.
    encoding: the encoding of all files.

Returns:
    contents: a list of string contains all sentences.
"""
def getContents( filesName, encoding = "UTF-8" ):
    print( "Extracting contents..." )
    contents = []
    for fileName in filesName:
        with open( fileName, 'r', encoding = encoding ) as f:
            line = f.readline()
            while line:
                words = line.split()
                length = len( words )
                # Some lines contain only a '.' or even nothing.
                if 2 <= length and length <= 50:
                    contents.append( line.strip() )
                line = f.readline()
    return contents

In [4]:
"""Write contents to files

Write contents to specific files.

Args:
    contents: a list of string contains all sentences.
    savePath: path that contents will save in.
    fileName: the file name that contents will save to.

Returns:
    None.
"""
def writeContents( contents, savePath = "./save/", fileName = "sentences.txt",
                   encoding = "UTF-8" ):
    print( "Writing contents to " + savePath + fileName + "..." )
    if not os.path.isdir( savePath ):
        os.makedirs( savePath )
    with open( savePath + fileName, 'w', encoding = encoding ) as f:
        for content in contents:
            if content[-1] != '.' and content[-1] != '?':
                content += "."
            f.write( content + "\n" )

In [5]:
"""Parse 

Parse contents in files by calling script from standford parse tools.

Args:
    inputFileName: file name which is wating to parsed.
    inputPath: path of input file.
    saveFileName: file name which result will write to.
    savePath: path of save file.

Returns:
    None.
"""
def parse( inputFileName, inputPath = "../../Problem1/save/", saveFileName = "output",
           savePath = "../../Problem1/save/" ):
    print( "Parsing" )
    sh_pos = ['"./lexparser-POS.sh"', '"' + inputPath + inputFileName + '"',
              '> ', '"' + savePath + saveFileName + '-POS"']
    sh_pen = ['"./lexparser-PENN.sh"', '"' + inputPath + inputFileName + '"',
              '> ', '"' + savePath + saveFileName + '-PENN"']
    sh_pos = ' '.join( sh_pos )
    sh_pen = ' '.join( sh_pen )
    print( sh_pos )
    print( sh_pen )
    ret_pos = subprocess.Popen( sh_pos, shell = True,
                                  cwd = "../Tools/stanford-parser-full-2018-10-17/" )
    ret_pen = subprocess.Popen( sh_pen, shell = True,
                                  cwd = "../Tools/stanford-parser-full-2018-10-17/" )
    ret_pos.wait()
    ret_pen.wait()

In [6]:
"""Count on verb percentage in all files

Args:
    fileName: file need to be count on.
    encoding: the encoding of the file.

Returns:
    None.
"""
def verbPercentage( fileName, encoding = "UTF-8" ):
    verbs = {}
    with open( fileName, 'r', encoding = encoding ) as f:
        line = f.readline()
        total = 0
        cnt = 0
        while line:
            if line.strip():
                cnt += 1
            words = line.strip().split()
            for word in words:
                word_tag = word.split( '/' )
                if len( word_tag ) == 2 and word_tag[1][0] == "V":
                    if word_tag[1] not in verbs:
                        verbs[word_tag[1]] = 1
                    total += 1
            line = f.readline()
    print( total / cnt, list( verbs.keys() ) )

In [7]:
"""Count on total sentences based on PENN tree

Count the number of "ROOT" in files.

Args:
    fileName: file need to be count on.
    encoding: the encoding of the file.

Returns:
    None.
"""
def noOfSentenceParsed( fileName, encoding = "UTF-8" ):
    cnt = 0
    with open( fileName, 'r', encoding = encoding ) as f:
        line = f.readline()
        while line:
            if line.strip()[:5] == "(ROOT":
                cnt += 1
            line = f.readline()
    print( cnt )

In [8]:
"""Count on preposition percentage in each file

Args:
    filesName: a list of files need to be count on.
    encoding: the encoding of the file.

Returns:
    None.
"""
def noOfPP( filesName, encoding = "UTF-8" ):
    for fileName in filesName:
        contents = getContents( [fileName], encoding = encoding )
        fileName = fileName.split( '/' )[-1].split( '.' )[0]
        newFileName = fileName + "-less50"
        writeContents( contents, "./save/less50/", newFileName )
        print( "Parsing" )
        path = "../../Problem1/save/less50/"
        newPath = "../../Problem1/save/less50-DEPENDENCY/"
        sh_pen = ['"./lexparser-DEPENDENCY.sh"', '"' + path + newFileName + '"',
                  '> ', '"' + newPath + newFileName + '-DEPENDENCY"']
        print( ' '.join( sh_pen ) )
        ret_pen = subprocess.Popen( ' '.join( sh_pen ), shell = True,
                                      cwd = "../Tools/stanford-parser-full-2018-10-17/" )
        ret_pen.wait()

"""Count on top 3 prepositions in all files

Args:
    filesName: a list of files need to be count on.
    encoding: the encoding of the file.

Returns:
    None.
"""
def calPP( filesName, encoding = "UTF-8" ):
    top3 = {}
    for fileName in filesName:
        with open( fileName, 'r' ) as f:
            totalPP = 0
            line = f.readline()
            while line:
                line = line.strip()
                if line[:4] == "case":
                    totalPP += 1
                    word = line.split( ", " )[1].split( "-" )[0].lower()
                    if word not in top3:
                        top3[word] = 0
                    top3[word] += 1
                line = f.readline()
            print( fileName, totalPP )
    top3 = sorted( top3.items(), key = lambda x: x[1], reverse = True )[:3]
    print( top3 )

In [9]:
def main():
    """Run the whole pipeline: preprocessing, then Problems 1.1 to 1.3."""
    # Preprocessing: filter the corpus sentences, save them, and parse them.
    corpusFiles = getFilesName( "data/corpus/" )
    writeContents( getContents( corpusFiles ) )
    parse( "sentences.txt" )

    # Problem 1.1: verb statistics from the POS-tagged output.
    verbPercentage( "./save/output-POS" )

    # Problem 1.2: number of parsed sentences from the PENN trees.
    noOfSentenceParsed( "./save/output-PENN" )

    # Problem 1.3: dependency-parse every corpus file, then count the
    # prepositions found in the resulting files (which have no extension).
    noOfPP( corpusFiles )
    dependencyFiles = getFilesName( "save/less50-DEPENDENCY/", suffix = "" )
    calPP( dependencyFiles )

In [10]:
# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

Retrieving files name from folder data/corpus/...
Extracting contents...
Writing contents to ./save/sentences.txt...
Parsing
"./lexparser-POS.sh" "../../Problem1/save/sentences.txt" >  "../../Problem1/save/output-POS"
"./lexparser-PENN.sh" "../../Problem1/save/sentences.txt" >  "../../Problem1/save/output-PENN"
3.597727750894172 ['VBD', 'VBN', 'VBZ', 'VB', 'VBG', 'VBP']
14259
Extracting contents...
Writing contents to ./save/less50/0006-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/0006-less50" >  "../../Problem1/save/less50-DEPENDENCY/0006-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/0219-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/0219-less50" >  "../../Problem1/save/less50-DEPENDENCY/0219-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/0381-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/0381-less50" >  "../../Problem1/save/less50-DEPENDENCY/

Extracting contents...
Writing contents to ./save/less50/3421-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/3421-less50" >  "../../Problem1/save/less50-DEPENDENCY/3421-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/3723-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/3723-less50" >  "../../Problem1/save/less50-DEPENDENCY/3723-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/3830-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/3830-less50" >  "../../Problem1/save/less50-DEPENDENCY/3830-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/3884-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/3884-less50" >  "../../Problem1/save/less50-DEPENDENCY/3884-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/4195-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/4195

Extracting contents...
Writing contents to ./save/less50/8136-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/8136-less50" >  "../../Problem1/save/less50-DEPENDENCY/8136-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/8154-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/8154-less50" >  "../../Problem1/save/less50-DEPENDENCY/8154-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/8161-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/8161-less50" >  "../../Problem1/save/less50-DEPENDENCY/8161-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/8251-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/8251-less50" >  "../../Problem1/save/less50-DEPENDENCY/8251-less50-DEPENDENCY"
Extracting contents...
Writing contents to ./save/less50/8314-less50...
Parsing
"./lexparser-DEPENDENCY.sh" "../../Problem1/save/less50/8314

save/less50-DEPENDENCY/9184-less50-DEPENDENCY 388
save/less50-DEPENDENCY/9203-less50-DEPENDENCY 416
save/less50-DEPENDENCY/9366-less50-DEPENDENCY 338
save/less50-DEPENDENCY/9426-less50-DEPENDENCY 380
save/less50-DEPENDENCY/9508-less50-DEPENDENCY 410
save/less50-DEPENDENCY/9597-less50-DEPENDENCY 327
save/less50-DEPENDENCY/9629-less50-DEPENDENCY 315
save/less50-DEPENDENCY/9849-less50-DEPENDENCY 337
save/less50-DEPENDENCY/9880-less50-DEPENDENCY 335
save/less50-DEPENDENCY/9981-less50-DEPENDENCY 374
save/less50-DEPENDENCY/9994-less50-DEPENDENCY 396
[('of', 10262), ('in', 6870), ('to', 3212)]
