In [1]:
import os
import re
import nltk
import gensim
import readability
import numpy as np
import pandas as pd
import sklearn as sl
from nltk.parse import stanford
from scipy import spatial
from scipy.stats.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

In [2]:
os.environ['STANFORD_PARSER'] = "../Tools/stanford-parser-full-2018-10-17/stanford-parser.jar"
os.environ['STANFORD_MODELS'] = "../Tools/stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models.jar"
parser = stanford.StanfordParser( model_path = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" )

Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.
  This is separate from the ipykernel package so we can avoid doing imports until


In [3]:
data_train = pd.read_excel( "Data/Train_Data.xlsx", sheet_name = "Sheet1" )
data_test  = pd.read_excel( "Data/Test_Data.xlsx", sheet_name = "Sheet1" )
stop_words = set( nltk.corpus.stopwords.words( "english" ) )

In [4]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
                "Data/GoogleNews-vectors-negative300.bin", binary = True )

In [5]:
# Basic data processing functions
"""Clean data

1. Replace tab or new line characters with space
2. Lowercase words
3. Remove extra spaces

Args:
    line: a string contains original sentence(s) or content(s).
    
Returns:
    line: a string contains cleaned sentence(s) or content(s).
"""
def clean( line ):
    line = re.sub( "[\s▃]", " ", line )
    line = re.sub( "_", "", line )
    line = re.sub( "[^\w\s]", "", line )
    line = re.sub( "\s+", " ", line ).strip()
    line.lower()
    return line

"""Preprocess all data in summary level

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.

Returns:
    new_summaries: a list of string which contains cleaned summary.
"""
def preprocess( data ):
    print( "Preprocessing..." )
    summaries = data["Summary"]
    new_summaries = []
    for summary in summaries:
        new_summaries.append( clean( summary ) )
    return new_summaries

"""Preprocess all data in sentence level

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.

Returns:
    new_summaries: a list of cleaned summary. Its structure is:
    
                   [[sentence1], [sentence2], ...]
"""
def preprocess_sentence( data ):
    # Split summaries into sentences and clean them
    summaries = data["Summary"]
    new_summaries = []
    for summary in summaries:
        sentences = summary.split( '.' )
        new_sentences = []
        for sentence in sentences:
            sentence = sentence.strip()
            new_sentence = clean( sentence )
            if len( new_sentence ) == 0:
                continue
            new_sentences.append( new_sentence )
        new_summaries.append( new_sentences )
    return new_summaries

"""Standardize data

Args:
    data: a numpy array.

Returns:
    std_data: a numpy data contains stardaized data.
"""
def standardize( data ):
    if isinstance( data, list ):
        tdata = np.array( data )
    std_data = ( tdata - np.mean( tdata ) ) / np.std( tdata )
    if isinstance( data, list ):
        std_data = std_data.tolist()
    return std_data

In [6]:
# Utils for problem 4.1
"""Get max unigram of each sentence

Maximum repetition of unigrams: calculate the frequencies of all unigrams
(remove stop words), and use the maximum value as the result.

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    stop_words: a set of stop words.

Returns:
    unigrams: a list of max unigram of each sentence corresponding to
              original sentences.
"""
def get_max_unigram( data, stop_words ):
    print( "Geting maximum repetition of unigrams of each sentence..." )
    summaries = preprocess( data )
    unigrams = []
    for summary in summaries:
        unigram = {}
        words = summary.split()
        max_number = 0
        for word in words:
            if word in stop_words:
                continue
            if word not in unigram:
                unigram[word] = 0
            unigram[word] += 1
            max_number = max( max_number, unigram[word] )
        unigrams.append( max_number )
    unigrams = standardize( unigrams )
    return unigrams

"""Get max bigram of each sentence

Maximum repetition of bigrams: calculate the frequencies of all bigrams,
and use the maximum value as the result.

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.

Returns:
    bigrams: a list of max bigram of each sentence corresponding to
            original sentences.
"""
def get_max_bigram( data, cache = None ):
    print( "Geting maximum repetition of bigrams of each sentence..." )
    summaries = preprocess( data )
    bigrams = []
    for summary in summaries:
        bigram = {}
        words = summary.split()
        max_number = 0
        prev_word = ""
        for i in range( len( words ) - 1 ):
            two_words = ' '.join( words[i: i + 2] )
            if two_words not in bigram:
                bigram[two_words] = 0
            bigram[two_words] += 1
            max_number = max( max_number, bigram[two_words] )
        bigrams.append( max_number )
    bigrams = standardize( bigrams )
    return bigrams

"""Get max unigram of each sentence

Maximum sentence similarity: each sentence is represented as average of
word embeddings, then compute cosine similarity between pairwise sentences,
use the maximum similarity as the result.

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    word2vec: a word2vec model obtained from nltk.

Returns:
    sen_sim: a list of max sentence similarity of each sentence.
"""
def get_max_sentence_similarity( data, word2vec ):
    print( "Geting maximum sentence similarity of each sentence..." )
    
    summaries = preprocess_sentence( data )
    
    # Calculate the average word embedding for sentences
    word2vec_dict = word2vec.vocab.keys()
    avgvecs_units = []
    for summary in summaries:
        avgvecs_sentences = []
        for sentence in summary:
            words = sentence.split()
            # 300 dims in word2vec for each word-0vector
            wordvecs = []
            for word in words:
                if word in word2vec_dict:
                    wordvecs.append( word2vec[word] )
            if len( wordvecs ) == 0:
                continue
            wordvecs = np.array( wordvecs )
            avgvecs_sentences.append( np.mean( wordvecs, axis = 0 ) )
        avgvecs_units.append( avgvecs_sentences )
    
    # Calculate max sentence similarity
    print( "Calculating cosine similarity..." )
    sen_sims = []
    for avgvecs in avgvecs_units:
        max_cos_sim = -100
        for i in range( len( avgvecs ) ):
            for j in range( len( avgvecs ) ):
                if i == j:
                    continue
                cos_sim = 1 - spatial.distance.cosine( avgvecs[i], avgvecs[j] )
                max_cos_sim = max( max_cos_sim, cos_sim )
        sen_sims.append( max_cos_sim )
    sen_sims = standardize( sen_sims )
    return sen_sims

"""Get length for each sentence

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.

Returns:
    lengths: a list of length of each summary.
"""
def get_length( data ):
    print( "Getting length of each sentence..." )
    summaries = preprocess( data )
    lengths = []
    for summary in summaries:
        lengths.append( len( summary.split() ) )
    lengths = standardize( lengths )
    return lengths

"""Get ratio of stop words in each sentence

Calculate ratio of stop words in each sentences.

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    stop_words: a set of stop words.

Returns:
    ratio_of_stop_words: a list of ratio of stop words in each
                         sentence.
"""
def get_ratio_of_stop_words( data, stop_words ):
    print( "Getting ratio of stop words of each sentence..." )
    summaries = preprocess( data )
    ratio_of_stop_words = []
    for summary in summaries:
        words = summary.split()
        cnt = 0
        for word in words:
            if word in stop_words:
                cnt += 1
        ratio_of_stop_words.append( cnt / len( word ) )
    ratio_of_stop_words = standardize( ratio_of_stop_words )
    return ratio_of_stop_words

In [7]:
"""Train part of problem 4.1.1

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    stop_words: a set of stop words.
    word2vec: a word2vec model obtained from nltk.

Returns:
    model: a well-trained mlp model.
"""
def problem4_1_1_train( data, stop_words, word2vec ):
    print( "Problem 4.1.1 Train" )
    print( "Getting features..." )
    caches = {}
    max_unigram = get_max_unigram( data, stop_words )
    max_bigram  = get_max_bigram( data )
    max_sen_sim = get_max_sentence_similarity( data, word2vec )
    features = list( zip( max_unigram, max_bigram, max_sen_sim ) )
    labels = list( data["Non-Redundancy"] )
    print( "Building and training models..." )
    model = MLPRegressor( ( 5, ), activation = "tanh",
                          max_iter = 1000, learning_rate_init = 0.01 )
    model.fit( features, labels )
    return model

"""Test part of problem 4.1.1

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    model: a trained model.
    stop_words: a set of stop words.
    word2vec: a word2vec model obtained from nltk.

Returns:
    None.
"""
def problem4_1_1_test( data, model, stop_words, word2vec ):
    print( "Problem 4.1.1 Test" )
    print( "Getting features..." )
    max_unigram = get_max_unigram( data, stop_words )
    max_bigram  = get_max_bigram( data )
    max_sen_sim = get_max_sentence_similarity( data, word2vec )
    features = list( zip( max_unigram, max_bigram, max_sen_sim ) )
    labels = list( data["Non-Redundancy"] )
    print( "Predicting..." )
    preds = model.predict( features )
    print( "Measuring..." )
    mses = mean_squared_error( labels, preds )
    pcor = pearsonr( labels, preds )
    print( mses, pcor )

In [8]:
model_4_1_1 = problem4_1_1_train( data_train, stop_words, word2vec )
problem4_1_1_test( data_test, model_4_1_1, stop_words, word2vec )

Problem 4.1.1 Train
Getting features...
Geting maximum repetition of unigrams of each sentence...
Preprocessing...
Geting maximum repetition of bigrams of each sentence...
Preprocessing...
Geting maximum sentence similarity of each sentence...
Calculating cosine similarity...
Building and training models...
Problem 4.1.1 Test
Getting features...
Geting maximum repetition of unigrams of each sentence...
Preprocessing...
Geting maximum repetition of bigrams of each sentence...
Preprocessing...
Geting maximum sentence similarity of each sentence...
Calculating cosine similarity...
Predicting...
Measuring...
0.18254632671481427 (0.7406327889583803, 4.889763023965826e-36)


In [9]:
"""Train part of problem 4.1.2

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    stop_words: a set of stop words.
    word2vec: a word2vec model obtained from nltk.
    func: a string represents which function it would use.

Returns:
    model: a well-trained MLP model.
"""
def problem4_1_2_train( data, stop_words, word2vec, func ):
    print( "Problem 4.1.2 Train" )
    print( "Getting features..." )
    if func == "length":
        max_unigram = get_max_unigram( data, stop_words )
        max_bigram  = get_max_bigram( data )
        max_sen_sim = get_max_sentence_similarity( data, word2vec )
        length      = get_length( data )
        features = list( zip( max_unigram, max_bigram, max_sen_sim, length ) )
    else:
        max_unigram = get_max_unigram( data, stop_words )
        max_bigram  = get_max_bigram( data )
        max_sen_sim = get_max_sentence_similarity( data, word2vec )
        ratio       = get_ratio_of_stop_words( data, stop_words )
        features = list( zip( max_unigram, max_bigram, max_sen_sim, ratio ) )
    labels = list( data["Non-Redundancy"] )
    print( "Building and training models..." )
    model = MLPRegressor( ( 5, ), activation = "tanh",
                           max_iter = 1000, learning_rate_init = 0.01 )
    model.fit( features, labels )
    return model

"""Test part of problem 4.1.2

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    model: a trained model.
    stop_words: a set of stop words.
    word2vec: a word2vec model obtained from nltk.
    func: a string represents which function it would use.

Returns:
    None.
"""
def problem4_1_2_test( data, model, stop_words, word2vec, func ):
    print( "Problem 4.1.2 Test" )
    print( "Getting features..." )
    max_unigram = get_max_unigram( data, stop_words )
    max_bigram  = get_max_bigram( data )
    max_sen_sim = get_max_sentence_similarity( data, word2vec )
    if func == "length":
        length = get_length( data )
        features = list( zip( max_unigram, max_bigram, max_sen_sim, length ) )
    else:
        ratio = get_ratio_of_stop_words( data, stop_words )
        features = list( zip( max_unigram, max_bigram, max_sen_sim, ratio ) )
    labels = list( data["Non-Redundancy"] )
    print( "Predicting by model..." )
    preds = model.predict( features )
    print( "Measuring..." )
    mses = mean_squared_error( labels, preds )
    pcor = pearsonr( labels, preds )
    print( mses, pcor )

In [10]:
func = "length"
print( "Try to add feature " + func )
model_4_1_2_1 = problem4_1_2_train( data_train, stop_words, word2vec, func )
problem4_1_2_test( data_test, model_4_1_2_1, stop_words, word2vec, func )
func = "stop_words"
print( "Try to add feature " + func )
model_4_1_2_2 = problem4_1_2_train( data_train, stop_words, word2vec, func )
problem4_1_2_test( data_test, model_4_1_2_2, stop_words, word2vec, func )

Try to add feature length
Problem 4.1.2 Train
Getting features...
Geting maximum repetition of unigrams of each sentence...
Preprocessing...
Geting maximum repetition of bigrams of each sentence...
Preprocessing...
Geting maximum sentence similarity of each sentence...
Calculating cosine similarity...
Getting length of each sentence...
Preprocessing...
Building and training models...
Problem 4.1.2 Test
Getting features...
Geting maximum repetition of unigrams of each sentence...
Preprocessing...
Geting maximum repetition of bigrams of each sentence...
Preprocessing...
Geting maximum sentence similarity of each sentence...
Calculating cosine similarity...
Getting length of each sentence...
Preprocessing...
Predicting by model...
Measuring...
0.17576888222252435 (0.7552195055635517, 3.556978978490383e-38)
Try to add feature stop_words
Problem 4.1.2 Train
Getting features...
Geting maximum repetition of unigrams of each sentence...
Preprocessing...
Geting maximum repetition of bigrams of 

In [16]:
# Utils for problem 4.2
"""Get number of repetitive unigram

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.

Returns:
    cnts: a list contains stardardized count value of all sentences.
"""
def get_no_of_repetitive_unigram( data ):
    cnts = []
    summaries = preprocess( data )
    for summary in summaries:
        words = summary.split()
        unigram = {}
        cnt = 0
        for word in words:
            if word not in unigram:
                unigram[word] = 0
            if unigram[word]:
                cnt += 1
            unigram[word] += 1
        cnts.append( cnt )
    cnts = standardize( cnts )
    return cnts

"""Get number of repetitive bigram

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.

Returns:
    cnts: a list contains stardardized count value of all sentences.
"""
def get_no_of_repetitive_bigram( data ):
    cnts = []
    summaries = preprocess( data )
    for summary in summaries:
        words = summary.split()
        bigram = {}
        cnt = 0
        for i in range( len( words ) ):
            two_words = ' '.join( words[i:i + 2] )
            if two_words not in bigram:
                bigram[two_words] = 0
            if bigram[two_words]:
                cnt += 1
            bigram[two_words] += 1
        cnts.append( cnt )
    cnts = standardize( cnts )
    return cnts

"""Get easy read score

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.

Returns:
    scores: a list contains stardardized scores of all sentences.
"""
def get_easy_read_score( data ):
    scores = []
    summaries = preprocess_sentence( data )
    for sentences in summaries:
        min_score = 10000
        for sentence in sentences:
            sentence = sentence.strip()
            if len( sentence ) == 0:
                continue
            words = sentence.split()
            score = readability.getmeasures( words, lang = 'en' )
            score = score['readability grades']['FleschReadingEase']
            min_score = min( min_score, score )
        scores.append( min_score )
    scores = standardize( scores )
    return scores

"""Get parser height of all sentences

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    parser: a stanford parser for parsing english PENN tree.

Returns:
    heights: a list contains stardardized heights of all sentences.
"""
def get_parser_height( data, parser ):
    summaries = preprocess_sentence( data )
    heights = []
    cnt = 0
    for sentences in summaries:
        height = []
        for sentence in sentences:
            sentence = sentence.strip()
            if len( sentence ) == 0:
                continue
            parsed_sentence = parser.parse_sents( sentence )
            height.append( next( next( parsed_sentence ) ).height () )
            cnt += 1
            print( "Parsed sentences: " + str( cnt ), end = "\r" )
        heights.append( np.mean( height ) )
    heights = standardize( heights )
    print( "" )
    return heights

"""Get phrase propotion of all sentences

For each summary, calculate the ratio of all pos-tags starts with "IN",
"NN", or "VB".

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.

Returns:
    phrase_ratio: a list contains stardardized phrase ratio of all
                  sentences.
"""
def get_phrase_propotion( data ):
    summaries = preprocess_sentence( data )
    phrase_ratio = []
    total_cnt = 0
    for sentences in summaries:
        ratio = []
        cnt = 0
        cnt_tag = 0
        for sentence in sentences:
            sentence = sentence.strip()
            if len( sentence ) == 0:
                continue
            pos_tags = nltk.pos_tag( sentence )
            for pos_tag in pos_tags:
                cnt += 1
                if pos_tag[1].startswith( "IN" ) or \
                        pos_tag[1].startswith( "NN" ) or \
                        pos_tag[1].startswith( "VB" ):
                    cnt_tag += 1
            total_cnt += 1
            print( "Parsed sentence: " + str( total_cnt ), end = "\r" )
        phrase_ratio.append( cnt_tag / cnt )
    phrase_ratio = standardize( phrase_ratio )
    print( "" )
    return phrase_ratio

In [12]:
"""Train part of problem 4.2.1

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    stop_words: a set of stop words.

Returns:
    model: a well-trained MLP model.
"""
def problem4_2_1_train( data, stop_words ):
    print( "Problem 4.2.1 Train" )
    print( "Getting features..." )
    rep_uni  = get_no_of_repetitive_unigram( data )
    rep_bi   = get_no_of_repetitive_bigram( data )
    rd_score = get_easy_read_score( data )
    features = list( zip( rep_uni, rep_bi, rd_score ) )
    labels = list( data["Fluency"] )
    print( "Building and training models..." )
    model = MLPRegressor( ( 5, ), activation = "tanh",
                          max_iter = 1000, learning_rate_init = 0.01 )
    model.fit( features, labels )
    return model

"""Test part of problem 4.2.1

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    model: a trained model.
    stop_words: a set of stop words.

Returns:
    None.
"""
def problem4_2_1_test( data, model, stop_words ):
    print( "Problem 4.2.1 Test" )
    print( "Getting features..." )
    rep_uni  = get_no_of_repetitive_unigram( data )
    rep_bi   = get_no_of_repetitive_bigram( data )
    rd_score = get_easy_read_score( data )
    features = list( zip( rep_uni, rep_bi, rd_score ) )
    labels = list( data["Fluency"] )
    print( "Predicting..." )
    preds = model.predict( features )
    print( "Measuring..." )
    mses = mean_squared_error( labels, preds )
    pcor = pearsonr( labels, preds )
    print( mses, pcor )

In [13]:
model_4_2_1 = problem4_2_1_train( data_train, stop_words )
problem4_2_1_test( data_test, model_4_2_1, stop_words )

Problem 4.2.1 Train
Getting features...
Preprocessing...
Preprocessing...
Building and training models...
Problem 4.2.1 Test
Getting features...
Preprocessing...
Preprocessing...
Predicting...
Measuring...
0.23843681713639953 (0.3007707323664851, 1.508068668805849e-05)


In [14]:
"""Train part of problem 4.2.2

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    stop_words: a set of stop words.
    parser: a stanford parser for parsing english PENN tree.
    func: a string represents which function it would use.

Returns:
    model: a well-trained MLP model.
"""
def problem4_2_2_train( data, stop_words, parser, func ):
    print( "Problem 4.2.2 Train" )
    print( "Getting features..." )
    if func == "height":
        features = list( zip( get_no_of_repetitive_unigram( data ),
                              get_no_of_repetitive_bigram( data ),
                              get_easy_read_score( data ),
                              get_parser_height( data, parser ) ) )
    else:
        features = list( zip( get_no_of_repetitive_unigram( data ),
                              get_no_of_repetitive_bigram( data ),
                              get_easy_read_score( data ),
                              get_phrase_propotion( data ) ) )
    labels = list( data["Non-Redundancy"] )
    print( "Building and training models..." )
    model = MLPRegressor( ( 5, ), activation = "tanh",
                           max_iter = 1000, learning_rate_init = 0.01 )
    model.fit( features, labels )
    return model

"""Test part of problem 4.2.2

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    model: a trained model.
    stop_words: a set of stop words.
    parser: a stanford parser for parsing english PENN tree.
    func: a string represents which function it would use.

Returns:
    None.
"""
def problem4_2_2_test( data, model, stop_words, parser, func ):
    print( "Problem 4.2.2 Test" )
    print( "Getting features..." )
    if func == "height":
        features = list( zip( get_no_of_repetitive_unigram( data ),
                              get_no_of_repetitive_bigram( data ),
                              get_easy_read_score( data ),
                              get_parser_height( data, parser ) ) )
    else:
        features = list( zip( get_no_of_repetitive_unigram( data ),
                              get_no_of_repetitive_bigram( data ),
                              get_easy_read_score( data ),
                              get_phrase_propotion( data ) ) )
    labels = list( data["Non-Redundancy"] )
    print( "Predicting by model..." )
    preds = model.predict( features )
    print( "Measuring..." )
    mses = mean_squared_error( labels, preds )
    pcor = pearsonr( labels, preds )
    print( mses, pcor )

In [15]:
func = "height"
print( "Try to add feature " + func )
model_4_2_2_1 = problem4_2_2_train( data_train, stop_words, parser, func )
problem4_2_2_test( data_test, model_4_2_2_1, stop_words, parser, func )
func = "phrase ratio"
print( "Try to add feature " + func )
model_4_2_2_2 = problem4_2_2_train( data_train, stop_words, parser, func )
problem4_2_2_test( data_test, model_4_2_2_2, stop_words, parser, func )

Try to add feature height
Problem 4.2.2 Train
Getting features...
Preprocessing...
Preprocessing...
Parsed sentences: 2969
Building and training models...
Problem 4.2.2 Test
Getting features...
Preprocessing...
Preprocessing...
Parsed sentences: 753
Predicting by model...
Measuring...
0.21611216423667343 (0.6934429160060858, 5.280128013577299e-30)
Try to add feature phrase ratio
Problem 4.2.2 Train
Getting features...
Preprocessing...
Preprocessing...
Parsed sentence: 2969
Building and training models...
Problem 4.2.2 Test
Getting features...
Preprocessing...
Preprocessing...
Parsed sentence: 753
Predicting by model...
Measuring...
0.21223546613859498 (0.6965887743225773, 2.2752212536565094e-30)
