In [0]:
import os
import sys
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from google.colab import drive
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from keras.layers import Add, LSTM, Bidirectional, Input, Embedding, Dense, Concatenate, TimeDistributed, Activation, Permute, Flatten, Multiply
from keras.models import Sequential,Model
import numpy as np


In [43]:
drive.mount('/content/gdrive/')

root_path = '/content/gdrive/My Drive/Colab Notebooks/nn-homework2'
sys.path.append(root_path)

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [0]:
class DataFetcher(object):

    def __init__(self, data_preproccessor, max_data_count):
        self.data_preproccessor = data_preproccessor
        self.max_data_count = max_data_count
        return super().__init__()

    def fetch(self, filename, categories):
        text = self._read_file(filename)
        text_array = self._text_to_array(text)
        data = self._split_text_array_to_categories(text_array, categories)
        return data

    def _read_file(self, file_name):
        with open(file_name,'r', encoding = "ISO-8859-1") as readin:
            text = readin.read()
        return text

    def _text_to_array(self, text):
        return list(filter(lambda x: x != "", text.split("\n")))

    def _split_text_array_to_categories(self, text_array, categories):
        data = {}
        for category in categories:
            data[category] = []
        data_count = 0
        for i in range(len(text_array)):
            if (i + 1) % 8 == 0 and i != 0 and data_count < self.max_data_count:
                for j in range(len(categories)):
                    valid_clean_data = self.data_preproccessor.clean(text_array[i - j], categories[j])
                    data[categories[j]].append(valid_clean_data)
                data_count += 1
        return data
      

In [0]:
class DataPreproccessor(object):

    contraction_mapping = {
        "ain't":"is not","aren't":"are not","can't":"cannot","'cause":"because","could've":"could have","couldn't":"could not","didn't":"did not","doesn't":"does not","don't":"do not","hadn't":"had not","hasn't":"has not","haven't":"have not","he'd":"he would","he'll":"he will","he's":"he is","how'd":"how did","how'd'y":"how do you","how'll":"how will","how's":"how is", "I'd":"I would","I'd've":"I would have","I'll":"I will","I'll've":"I will have","I'm":"I am","I've":"I have","i'd":"i would", "i'd've":"i would have","i'll":"i will", "i'll've":"i will have","i'm":"i am","i've":"i have","isn't":"is not","it'd":"it would", "it'd've":"it would have","it'll":"it will","it'll've":"it will have","it's":"it is","let's":"let us","ma'am":"madam", "mayn't":"may not","might've":"might have","mightn't":"might not","mightn't've":"might not have","must've":"must have", "mustn't":"must not","mustn't've":"must not have","needn't":"need not","needn't've":"need not have","o'clock":"of the clock", "oughtn't":"ought not","oughtn't've":"ought not have","shan't":"shall not","sha'n't":"shall not","shan't've":"shall not have", "she'd":"she would","she'd've":"she would have","she'll":"she will","she'll've":"she will have","she's":"she is", "should've":"should have","shouldn't":"should not","shouldn't've":"should not have","so've":"so have","so's":"so as", "this's":"this is","that'd":"that would","that'd've":"that would have","that's":"that is","there'd":"there would", "there'd've":"there would have","there's":"there is","here's":"here is","they'd":"they would","they'd've":"they would have", "they'll":"they will","they'll've":"they will have","they're":"they are","they've":"they have","to've":"to have", "wasn't":"was not","we'd":"we would","we'd've":"we would have","we'll":"we will","we'll've":"we will have","we're":"we are", "we've":"we have","weren't":"were not","what'll":"what will","what'll've":"what will have","what're":"what are", "what's":"what is","what've":"what have","when's":"when is","when've":"when have","where'd":"where did","where's":"where is", "where've":"where have","who'll":"who will","who'll've":"who will have","who's":"who is","who've":"who have", "why's":"why is","why've":"why have","will've":"will have","won't":"will not","won't've":"will not have", "would've":"would have","wouldn't":"would not","wouldn't've":"would not have","y'all":"you all", "y'all'd":"you all would","y'all'd've":"you all would have","y'all're":"you all are","y'all've":"you all have", "you'd":"you would","you'd've":"you would have","you'll":"you will","you'll've":"you will have", "you're":"you are","you've":"you have"
    }


    def __init__(self):
        nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.clean_functions = [
            self._clean_from_html,
            self._clean_from_contraction_mapping,
            self._clean_from_punctuation,
            self._clean_numbers,
            self._clean_less_characters,
            self._remove_stop_words,
            self._stem_words,
        ]
        return super().__init__()

    def clean(self, data, category):
        data = self._remove_category_label(data, category).lower()
        for func in self.clean_functions:
            data = func(data)
        return data
    
    def _remove_category_label(self, data, category):
        return data.replace("review/%s: "%category, "")

    def _clean_from_html(self, data):
        return re.sub(re.compile('<.*?>'),'',data)

    def _clean_from_contraction_mapping(self, data):
        return self._do_on_word(data, map, lambda word: self.contraction_mapping.get(word, word))

    def _clean_from_punctuation(self, data):
        return re.sub(r'[?|!|$|#|\'|"|:|,|(|)|.|\|/]',r'',data)

    def _do_on_word(self, data, action, func):
        return " ".join(list(action(func, data.split(" "))))

    def _clean_numbers(self, data):
        return self._do_on_word(data, filter, lambda word: word.isalpha())

    def _clean_less_characters(self, data):
        return self._do_on_word(data, filter, lambda word: len(word) > 2)

    def _remove_stop_words(self, data):
        return self._do_on_word(data, filter, lambda word: word not in self.stop_words)

    def _stem_words(self, data):
        return self._do_on_word(data, map, lambda word: self.stemmer.stem(word))


In [0]:
class WordEmbeder(object):
    
    def __init__(self, dataset, epochs):
        self.epochs = epochs
        nltk.download('punkt')
        self._make_model(dataset)
        
        return super().__init__()

    def _get_all_words(self, dataset):
        all_sentences = []
        for category_collection in list(dataset.values()):
            all_sentences += category_collection
        all_words = [sent.split(" ") for sent in all_sentences]
        return all_words

    def _make_model(self, dataset):
        corpus = self._get_all_words(dataset)
        model = Word2Vec(corpus, min_count=1)
        model.train(corpus, total_examples=len(corpus),epochs=self.epochs)
        self.model = model

    def replace_word_with_vector(self, dataset):
        dataset_vec = {}
        for category in list(dataset.keys()):
            dataset_vec[category] = []
            for sentence in dataset[category]:
                dataset_vec[category].append([])
                for word in sentence.split(" "):
                    dataset_vec[category][-1].append(self.model.wv.word_vec(word))
        return dataset_vec
    
    def replace_vector_with_word(self, sentence):
      answer = []
      for word in sentence:
        if(word.all() != 0):
          new_word = self.model.wv.most_similar([word], topn=1)[0][0]
          answer.append(new_word)
      return ' '.join(answer)

In [0]:
class DataPreparer(object):

    def __init__(self, max_words, train_test_ratio):
        self.train_test_ratio = train_test_ratio
        self.max_words = max_words
        return super().__init__()

    def _pad_sequence(self, dataset):
        for category in list(dataset.keys()):
            dataset[category] = pad_sequences(dataset[category], maxlen=self.max_words[category], dtype=np.float32)
        return dataset

    def _split_data(self, dataset):
        return train_test_split(
            dataset[categories[0]],
            dataset[categories[1]],
            test_size=self.train_test_ratio)

    def prepare(self, dataset):
      return self._split_data(self._pad_sequence(dataset))

In [0]:
class LstmModel(object):
  
  lstm_size = 128
  dropout = .2
  batch_size = 32
  
  def __init__(self, has_attention=False):
    self.has_attention = has_attention
    return super().__init__()
  
  
  def make_model(
      self,
      has_summary,
      word_len, 
      max_words_summary,
      x_train,
      y_train,
      x_test,
      y_test,
      to_predict,
      epochs
  ):
    encoder_inputs = Input(x_train[0].shape)
    
    encoder_lstm = Bidirectional(LSTM(
        units = self.lstm_size,
        dropout = self.dropout,
        activation="tanh",
        return_state=True,
    ))

    encoder_outputs, h1, c1, h2, c2 = encoder_lstm(encoder_inputs)

    decoder_inputs = Input(shape=(None, y_train[0].shape[1]))

    decoder_lstm = LSTM(
        units = self.lstm_size,
        dropout = self.dropout,
        activation="softmax",
        return_sequences=True,
        return_state=True,
    )

    decoder_initial_state = [Add()([h1, h2]), Add()([c1, c2])]

    decoder_outputs, h1, c1 = decoder_lstm(
        decoder_inputs,
        decoder_initial_state
    )
    
    if(self.has_attention):
      attention = TimeDistributed(Dense(1, activation='tanh'))
      decoder_outputs = attention(decoder_outputs)

    dense = Dense(
        y_train[0].shape[1],
        activation='linear',
    )

    outputs = dense(decoder_outputs)

    model = Model([encoder_inputs, decoder_inputs], outputs)

    if(has_summary):
      model.summary()
      
    model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
    
    model.fit(
        x=[x_train,y_train],
        y=y_train,
        batch_size=self.batch_size,
        epochs=epochs,
        validation_data=([x_test,y_test], y_test)
    )
    
    scores = model.evaluate([x_test,y_test],y_test, verbose=0)
  
    encoder_model = Model(
        encoder_inputs,
        decoder_initial_state
    )

    decoder_state = [
        Input((self.lstm_size,)),
        Input((self.lstm_size,)),
    ]

    decoder_out, h1, c1 = decoder_lstm(
        decoder_inputs,
        initial_state = decoder_state
    )

    if(self.has_attention):
      attention = TimeDistributed(Dense(1, activation='tanh'))
      decoder_out = attention(decoder_out)

    
    output = dense(decoder_out)

    decoder_model = Model(
        [decoder_inputs] + decoder_state, [output, h1, c1]
    )


    h1, c1 = encoder_model.predict(x_train)
    out = np.zeros((len(x_train),1, word_len))
    output = []
    for i in range(max_words_summary):
        out, h1, c1 = decoder_model.predict([out, h1, c1])
        output.append(out)

    prediction = np.concatenate(output, axis=1)
    return prediction

In [0]:
# parameters
filename = 'finefoods.txt'
categories = ["text", "summary"]
max_words = {'text':200, "summary": 10}
train_test_ratio = .2
word2vec_train_epochs = 10
model_train_epochs = 5
word_len = 100
max_data_count = 10000

In [58]:
# read & clean
data_preproccessor = DataPreproccessor()
data_fetcher = DataFetcher(data_preproccessor, max_data_count)
dataset = data_fetcher.fetch(os.path.join(root_path, filename), categories)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
# word2vec embedding
word_embedding_model = WordEmbeder(dataset, word2vec_train_epochs)
dataset_vecorized = word_embedding_model.replace_word_with_vector(dataset)
vocabulary = word_embedding_model.model.wv.syn0


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


W0719 12:57:42.030803 139621052536704 base_any2vec.py:1182] Effective 'alpha' higher than previous training cycles
  This is separate from the ipykernel package so we can avoid doing imports until


In [0]:
# pad & split
data_preparer = DataPreparer(max_words,train_test_ratio)
x_train, x_test, y_train, y_test = data_preparer.prepare(dataset_vecorized)

In [61]:
# lstm model
lstm_model = LstmModel(has_attention=True)
prediction = lstm_model.make_model(
    has_summary = True,
    word_len = word_len, 
    max_words_summary = max_words['summary'],
    x_train = x_train,
    y_train = y_train,
    x_test = x_test,
    y_test = y_test,
    to_predict = y_test,
    epochs = model_train_epochs
)


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            (None, 200, 100)     0                                            
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) [(None, 256), (None, 234496      input_7[0][0]                    
__________________________________________________________________________________________________
input_8 (InputLayer)            (None, None, 100)    0                                            
__________________________________________________________________________________________________
add_7 (Add)                     (None, 128)          0           bidirectional_4[0][1]            
                                                                 bidirectional_4[0][3]            
__________

In [62]:
# check the prediction
print("actual text: " + word_embedding_model.replace_vector_with_word(x_test[0]))
print("actual summary: " + word_embedding_model.replace_vector_with_word(y_test[0]))
print("predicted summary: " + word_embedding_model.replace_vector_with_word(prediction[0]))


actual text: anyon see these made avoid treat china problem late see countri origin
actual summary: china
predicted summary: alot alot alot alot alot alot alot alot alot alot


  if np.issubdtype(vec.dtype, np.int):
