In [89]:
import re
import nltk
import gensim
import numpy as np
import pandas as pd
import sklearn as sl
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats.stats import pearsonr

In [3]:
data_train = pd.read_excel( "Data/Train_Data.xlsx", sheet_name = "Sheet1" )
data_test  = pd.read_excel( "Data/Test_Data.xlsx", sheet_name = "Sheet1" )
stop_words = set( nltk.corpus.stopwords.words( "english" ) )

In [4]:
word2vec = gensim.models.KeyedVectors.load_word2vec_format(
                "Data/GoogleNews-vectors-negative300.bin", binary = True )

In [269]:
"""Preprocess all data

1. Replace tab or new line characters with space
2. Lowercase words
3. Remove extra spaces

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.

Returns:
    None.
"""
def preprocessing( data ):
    print( "Preprocessing..." )
    summaries = data["Summary"]
    new_summaries = []
    for summary in summaries:
        summary = re.sub( "[\s▃]", " ", summary )
        summary = re.sub( "_", "", summary )
        summary = re.sub( "[^\w\s]", "", summary )
        summary = re.sub( "\s+", " ", summary ).strip()
        summary.lower()
        new_summaries.append( summary )
    data["Summary"] = new_summaries

In [270]:
"""Get max unigram of each sentence

Maximum repetition of unigrams: calculate the frequencies of all unigrams
(remove stop words), and use the maximum value as the result.

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    stop_words: a set of stop words.

Returns:
    unigrams: a list of max unigram of each sentence corresponding to
              original sentences.
"""
def get_max_unigram( data, stop_words ):
    print( "Geting maximum repetition of unigrams of each sentence..." )
    summaries = data["Summary"]
    unigrams = []
    for summary in summaries:
        unigram = {}
        words = summary.split()
        max_number = 0
        for word in words:
            if word in stop_words:
                continue
            if word not in unigram:
                unigram[word] = 0
            unigram[word] += 1
            max_number = max( max_number, unigram[word] )
        unigrams.append( max_number )
    unigrams = np.array( unigrams )
    unigrams = ( unigrams - np.mean( unigrams ) ) / np.std( unigrams )
    unigrams = unigrams.tolist()
    return unigrams

"""Get max bigram of each sentence

Maximum repetition of bigrams: calculate the frequencies of all bigrams,
and use the maximum value as the result.

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    stop_words: a set of stop words.

Returns:
    bigrams: a list of max bigram of each sentence corresponding to
            original sentences.
"""
def get_max_bigram( data, stop_words ):
    print( "Geting maximum repetition of bigrams of each sentence..." )
    summaries = data["Summary"]
    bigrams = []
    for summary in summaries:
        bigram = {}
        words = summary.split()
        max_number = 0
        prev_word = ""
        for word in words:
            if word in stop_words:
                continue
            if prev_word == "":
                prev_word = word
                continue
            two_words = ' '.join( [prev_word, word] )
            if two_words not in bigram:
                bigram[two_words] = 0
            bigram[two_words] += 1
            max_number = max( max_number, bigram[two_words] )
            prev_word = word
        bigrams.append( max_number )
    bigrams = np.array( bigrams )
    bigrams = ( bigrams - np.mean( bigrams ) ) / np.std( bigrams )
    bigrams = bigrams.tolist()
    return bigrams

"""Get max unigram of each sentence

Maximum sentence similarity: each sentence is represented as average of
word embeddings, then compute cosine similarity between pairwise sentences,
use the maximum similarity as the result.

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    stop_words: a set of stop words.
    word2vec: a word2vec model obtained from nltk.

Returns:
    sen_sim: a list of max sentence similarity of each sentence.
"""
def get_max_sentence_similarity( data, stop_words, word2vec ):
    print( "Geting maximum sentence similarity of each sentence..." )
    summaries = data["Summary"]
    word2vec_dict = word2vec.vocab.keys()
    sen_sim = []
    avgvecs = []
    for summary in summaries:
        words = summary.split()
        # 300 dims in word2vec for each word-0vector
        wordvecs = []
        for word in words:
            if word not in word2vec_dict:
                continue
            wordvecs.append( word2vec[word] )
        wordvecs = np.array( wordvecs )
        avgvecs.append( np.mean( wordvecs, axis = 0 ) )
    print( "Calculating cosine similarity..." )
    for i in range( len( avgvecs ) ):
        max_cos_sim = -100
        for j in range( len( avgvecs ) ):
            if i == j:
                continue
            cos_sim = 1 - spatial.distance.cosine( avgvecs[i], avgvecs[j] )
            max_cos_sim = max( max_cos_sim, cos_sim )
        sen_sim.append( max_cos_sim )
    return sen_sim

"""Get length for each sentence

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.

Returns:
    lengths: a list of length of each summary.
"""
def get_length( data ):
    print( "Getting length of each sentence..." )
    summaries = data["Summary"]
    lengths = []
    for summary in summaries:
        lengths.append( len( summary ) )
    return lengths

"""Get ratio of stop words in each sentence

Calculate ratio of stop words in each sentences.

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    stop_words: a set of stop words.

Returns:
    ratio_of_stop_words: a list of ratio of stop words in each
                         sentence.
"""
def get_ratio_of_stop_words( data, stop_words ):
    print( "Getting ratio of stop words of each sentence..." )
    summaries = data["Summary"]
    ratio_of_stop_words = []
    for summary in summaries:
        words = summary.split()
        cnt = 0
        for word in words:
            if word in stop_words:
                cnt += 1
        ratio_of_stop_words.append( cnt / len( word ) )
    return ratio_of_stop_words

In [276]:
"""Train part of problem 4.1.1

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    stop_words: a set of stop words.
    word2vec: a word2vec model obtained from nltk.

Returns:
    None.
"""
def problem4_1_1_train( data, stop_words, word2vec ):
    print( "Problem 4.1.1 Train" )
    print( "Getting features..." )
    features = list( zip( get_max_unigram( data, stop_words ),
                          get_max_bigram( data, stop_words ),
                          get_max_sentence_similarity( data, stop_words,
                                                       word2vec ) ) )
    labels = list( data["Non-Redundancy"] )
    print( "Building and training models..." )
    model = MLPRegressor( ( 10, 3 ), activation = "tanh",
                          max_iter = 1000, learning_rate_init = 0.01 )
    model.fit( features, labels )
    return model

"""Test part of problem 4.1.1

Args:
    data: a dataframe contains Summary text, Non-Redundancy score,
          and Fluency score.
    model: a trained model.
    stop_words: a set of stop words.
    word2vec: a word2vec model obtained from nltk.

Returns:
    None.
"""
def problem4_1_1_test( data, model, stop_words, word2vec ):
    print( "Problem 4.1.1 Test" )
    print( "Getting features..." )
    features = list( zip( get_max_unigram( data, stop_words ),
                          get_max_bigram( data, stop_words ),
                          get_max_sentence_similarity( data, stop_words,
                                                       word2vec ) ) )
    labels = list( data["Non-Redundancy"] )
    print( "Predicting..." )
    preds = model.predict( features )
    print( "Measuring..." )
    mses = mean_squared_error( labels, preds )
    pcor = pearsonr( labels, preds )
    print( mses, pcor )

In [277]:
preprocessing( data_train )
model_4_1_1 = problem4_1_1_train( data_train, stop_words, word2vec )
preprocessing( data_test )
problem4_1_1_test( data_test, model_4_1_1, stop_words, word2vec )

Preprocessing...
Problem 4.1.1 Train
Getting features...
Geting maximum repetition of unigrams of each sentence...
Geting maximum repetition of bigrams of each sentence...
Geting maximum sentence similarity of each sentence...
Calculating cosine similarity...
Building and training models...
Preprocessing...
Problem 4.1.1 Test
Getting features...
Geting maximum repetition of unigrams of each sentence...
Geting maximum repetition of bigrams of each sentence...
Geting maximum sentence similarity of each sentence...
Calculating cosine similarity...
Predicting...
Measuring...
0.23029351476520904 (0.6834279551731149, 7.177369025815646e-29)


In [278]:
def problem4_1_2_train( data, stop_words, word2vec, func ):
    print( "Problem 4.1.2 Train" )
    print( "Getting features..." )
    if func == "length":
        features = list( zip( get_max_unigram( data, stop_words ),
                              get_max_bigram( data, stop_words ),
                              get_max_sentence_similarity( data, stop_words,
                                                           word2vec ),
                              get_length( data ) ) )
    else:
        features = list( zip( get_max_unigram( data, stop_words ),
                              get_max_bigram( data, stop_words ),
                              get_max_sentence_similarity( data, stop_words,
                                                           word2vec ),
                              get_ratio_of_stop_words( data, stop_words ) ) )
    labels = list( data["Non-Redundancy"] )
    print( "Building and training models..." )
    model = MLPRegressor( ( 10, 3 ), activation = "tanh",
                           max_iter = 1000, learning_rate_init = 0.01 )
    model.fit( features, labels )
    return model

def problem4_1_2_test( data, model, stop_words, word2vec, func ):
    print( "Problem 4.1.2 Test" )
    print( "Getting features..." )
    if func == "length":
        features = list( zip( get_max_unigram( data, stop_words ),
                              get_max_bigram( data, stop_words ),
                              get_max_sentence_similarity( data, stop_words,
                                                           word2vec ),
                              get_length( data ) ) )
    else:
        features = list( zip( get_max_unigram( data, stop_words ),
                              get_max_bigram( data, stop_words ),
                              get_max_sentence_similarity( data, stop_words,
                                                           word2vec ),
                              get_ratio_of_stop_words( data, stop_words ) ) )
    labels = list( data["Non-Redundancy"] )
    print( "Predicting by model..." )
    preds = model.predict( features )
    print( "Measuring..." )
    mses = mean_squared_error( labels, preds )
    pcor = pearsonr( labels, preds )
    print( mses, pcor )

In [279]:
func = "length"
print( "Try to add feature " + func )
preprocessing( data_train )
model_4_1_2_1 = problem4_1_2_train( data_train, stop_words, word2vec, func )
preprocessing( data_test )
problem4_1_2_test( data_test, model_4_1_2_1, stop_words, word2vec, func )
func = "stop_words"
print( "Try to add feature " + func )
preprocessing( data_train )
model_4_1_2_2 = problem4_1_2_train( data_train, stop_words, word2vec, func )
preprocessing( data_test )
problem4_1_2_test( data_test, model_4_1_2_2, stop_words, word2vec, func )

Try to add feature length
Preprocessing...
Problem 4.1.2 Train
Getting features...
Geting maximum repetition of unigrams of each sentence...
Geting maximum repetition of bigrams of each sentence...
Geting maximum sentence similarity of each sentence...
Calculating cosine similarity...
Getting length of each sentence...
Building and training models...
Preprocessing...
Problem 4.1.2 Test
Getting features...
Geting maximum repetition of unigrams of each sentence...
Geting maximum repetition of bigrams of each sentence...
Geting maximum sentence similarity of each sentence...
Calculating cosine similarity...
Getting length of each sentence...
Predicting by model...
Measuring...
0.40044965179218933 (0.057959911971320874, 0.4149436630978265)
Try to add feature stop_words
Preprocessing...
Problem 4.1.2 Train
Getting features...
Geting maximum repetition of unigrams of each sentence...
Geting maximum repetition of bigrams of each sentence...
Geting maximum sentence similarity of each sentence.