In [3]:
import pandas as pd
import numpy as np
import re
import warnings
import unidecode
from keras.preprocessing.sequence import pad_sequences # for padding
from keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed
from keras.models import Model, load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint # to prevent overfitting and save the best model

import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option("display.max_colwidth", 200)
warnings.filterwarnings("ignore")

In [2]:
!pip install unidecode

Collecting unidecode
  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
Installing collected packages: unidecode
Successfully installed unidecode-1.1.1


In [4]:
# function to read the text file
def read_text(filename):
    """Read an entire UTF-8 text file and return its contents as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The full decoded text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the contents are not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(filename, mode='rt', encoding='utf-8') as file:
        return file.read()

In [5]:
# split a text into sentences
def to_lines(text):
    """Split raw text into rows of tab-separated fields.

    Leading and trailing whitespace of the whole text is stripped first;
    every remaining newline-delimited line becomes one list of fields.

    Args:
        text: The raw text, one record per line with fields separated by tabs.

    Returns:
        A list with one entry per line, each entry being the list of that
        line's tab-separated fields.
    """
    return [line.split('\t') for line in text.strip().split('\n')]

In [6]:
# read the raw contents of the data file into one string
txt = read_text("spa.txt")

# convert the text into a list of rows, each row holding the tab-separated fields of one line
sp_eng = to_lines(txt)

In [7]:
len(sp_eng)

124325

In [8]:
type(sp_eng)

list

In [10]:
sp_eng[0:1]

[['Go.',
  'Ve.',
  'CC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #4986655 (cueyayotl)']]

In [11]:
# converting list into a dataframe

# convert into an array so that whole columns can be sliced out
sp_eng = np.array(sp_eng)

# column 0 holds the English sentences, column 1 the Spanish sentences
eng_l = list(sp_eng[:, 0])
sp_l = list(sp_eng[:, 1])

# one row per sentence pair
data = pd.DataFrame({'spa': sp_l, 'eng': eng_l})

In [12]:
data.tail()

Unnamed: 0,spa,eng
124320,"Hay cuatro causas principales de muertes relacionadas con el alcohol. Lesión por un accidente automovilístico o violencia es una. Enfermedades como cirrosis del hígado, cáncer, enfermedades del co...","There are four main causes of alcohol-related death. Injury from car accidents or violence is one. Diseases like cirrhosis of the liver, cancer, heart and blood system diseases are the others."
124321,"Hay madres y padres que se quedan despiertos después de que sus hijos se hayan dormido y se preguntan cómo conseguir pagar la hipoteca o las facturas del médico, o cómo ahorrar el suficiente diner...","There are mothers and fathers who will lie awake after the children fall asleep and wonder how they'll make the mortgage, or pay their doctor's bills, or save enough for their child's college educ..."
124322,Una huella de carbono es la cantidad de contaminación de dióxido de carbono que producimos como producto de nuestras actividades. Algunas personas intentan reducir su huella de carbono porque está...,A carbon footprint is the amount of carbon dioxide pollution that we produce as a result of our activities. Some people try to reduce their carbon footprint because they are concerned about climat...
124323,"Como suele haber varias páginas web sobre cualquier tema, normalmente sólo le doy al botón de retroceso cuando entro en una página web que tiene anuncios en ventanas emergentes. Simplemente voy a ...","Since there are usually multiple websites on any given topic, I usually just click the back button when I arrive on any webpage that has pop-up advertising. I just go to the next page found by Goo..."
124324,"Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra ...","If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until t..."


In [13]:
# drop rows whose Spanish sentence duplicates that of another row; `data` itself is modified in place
data.drop_duplicates(subset=['spa'],inplace=True) 

## Text Preprocessing

In [14]:
# function to preprocess the text
def cleaner(text):
    """Normalise a sentence to lowercase ASCII words separated by single spaces.

    Steps: lowercase, transliterate to ASCII with unidecode, delete
    apostrophes, turn every remaining non-letter into a space, then
    collapse runs of whitespace.

    Args:
        text: The raw sentence.

    Returns:
        The cleaned sentence; an empty string if no letters remain.
    """
    ascii_lower = unidecode.unidecode(text.lower())
    # apostrophes are deleted (not spaced) so contractions stay one token
    without_apostrophes = ascii_lower.replace("'", '')
    letters_only = re.sub("[^a-zA-Z]", " ", without_apostrophes)
    # split() discards all surrounding/repeated whitespace, so the joined
    # result never needs an extra strip
    return " ".join(letters_only.split())

In [15]:
# preprocess english text
cleaned_eng = [cleaner(sentence) for sentence in data['eng']]

# preprocess Spanish text
cleaned_spa = [cleaner(sentence) for sentence in data['spa']]

# attach the cleaned sentences as new columns next to the raw ones
data['cleaned_eng'] = cleaned_eng
data['cleaned_spa'] = cleaned_spa

In [16]:
data.tail()

Unnamed: 0,spa,eng,cleaned_eng,cleaned_spa
124320,"Hay cuatro causas principales de muertes relacionadas con el alcohol. Lesión por un accidente automovilístico o violencia es una. Enfermedades como cirrosis del hígado, cáncer, enfermedades del co...","There are four main causes of alcohol-related death. Injury from car accidents or violence is one. Diseases like cirrhosis of the liver, cancer, heart and blood system diseases are the others.",there are four main causes of alcohol related death injury from car accidents or violence is one diseases like cirrhosis of the liver cancer heart and blood system diseases are the others,hay cuatro causas principales de muertes relacionadas con el alcohol lesion por un accidente automovilistico o violencia es una enfermedades como cirrosis del higado cancer enfermedades del corazo...
124321,"Hay madres y padres que se quedan despiertos después de que sus hijos se hayan dormido y se preguntan cómo conseguir pagar la hipoteca o las facturas del médico, o cómo ahorrar el suficiente diner...","There are mothers and fathers who will lie awake after the children fall asleep and wonder how they'll make the mortgage, or pay their doctor's bills, or save enough for their child's college educ...",there are mothers and fathers who will lie awake after the children fall asleep and wonder how theyll make the mortgage or pay their doctors bills or save enough for their childs college education,hay madres y padres que se quedan despiertos despues de que sus hijos se hayan dormido y se preguntan como conseguir pagar la hipoteca o las facturas del medico o como ahorrar el suficiente dinero...
124322,Una huella de carbono es la cantidad de contaminación de dióxido de carbono que producimos como producto de nuestras actividades. Algunas personas intentan reducir su huella de carbono porque está...,A carbon footprint is the amount of carbon dioxide pollution that we produce as a result of our activities. Some people try to reduce their carbon footprint because they are concerned about climat...,a carbon footprint is the amount of carbon dioxide pollution that we produce as a result of our activities some people try to reduce their carbon footprint because they are concerned about climate...,una huella de carbono es la cantidad de contaminacion de dioxido de carbono que producimos como producto de nuestras actividades algunas personas intentan reducir su huella de carbono porque estan...
124323,"Como suele haber varias páginas web sobre cualquier tema, normalmente sólo le doy al botón de retroceso cuando entro en una página web que tiene anuncios en ventanas emergentes. Simplemente voy a ...","Since there are usually multiple websites on any given topic, I usually just click the back button when I arrive on any webpage that has pop-up advertising. I just go to the next page found by Goo...",since there are usually multiple websites on any given topic i usually just click the back button when i arrive on any webpage that has pop up advertising i just go to the next page found by googl...,como suele haber varias paginas web sobre cualquier tema normalmente solo le doy al boton de retroceso cuando entro en una pagina web que tiene anuncios en ventanas emergentes simplemente voy a la...
124324,"Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra ...","If you want to sound like a native speaker, you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until t...",if you want to sound like a native speaker you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until th...,si quieres sonar como un hablante nativo debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra v...


## Split Data into Training and Validation Sets

In [17]:
# train-test split
from sklearn.model_selection import train_test_split

# cleaned Spanish is the model input (x), cleaned English the target (y);
# 20% of the pairs are held out, shuffled with a fixed seed
x_tr, x_val, y_tr, y_val = train_test_split(
    data['cleaned_spa'],
    data['cleaned_eng'],
    test_size=0.2,
    random_state=0,
    shuffle=True,
)

Saving validation data for later use

In [18]:
# reset index so the validation rows are labelled 0..n-1
x_val = x_val.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

# storing validation data into new variables for later use.
# Explicit copies are taken: a plain `x_val_original = x_val` only creates a
# second name for the same object, so any later in-place change to x_val / y_val
# would silently change the "saved" data as well.
x_val_original = x_val.copy()
y_val_original = y_val.copy()

## Encode Text Sequences

In [19]:
def build_vocab(texts):
    """Count how often each whitespace-separated token occurs in `texts`.

    Args:
        texts: A pandas Series of strings (it must support `.apply`).

    Returns:
        A dict mapping each token to its number of occurrences, with keys
        in order of first appearance.
    """
    vocab = {}
    for tokens in texts.apply(lambda x: x.split()).values:
        for token in tokens:
            # unseen tokens start from zero
            vocab[token] = vocab.get(token, 0) + 1
    return vocab

# word -> occurrence count over the Spanish training sentences
source_word_freq = build_vocab(x_tr)

In [20]:
source_word_freq

{'ojala': 136,
 'lo': 5474,
 'hubiera': 151,
 'sabido': 33,
 'yo': 2255,
 'hago': 68,
 'que': 21160,
 'puedo': 1423,
 'por': 5766,
 'mostrarme': 14,
 'atento': 13,
 'tom': 15859,
 'a': 16435,
 'menudo': 205,
 'va': 656,
 'solo': 1201,
 'de': 18522,
 'compras': 58,
 'se': 7147,
 'canso': 8,
 'tener': 311,
 'pagar': 115,
 'siempre': 733,
 'salia': 9,
 'con': 3934,
 'mary': 3394,
 'tanto': 280,
 'como': 2317,
 'trabajan': 18,
 'modelos': 3,
 'demasiado': 599,
 'tarde': 635,
 'ella': 3450,
 'le': 3610,
 'da': 170,
 'mismo': 427,
 'vivir': 221,
 'en': 10134,
 'una': 5086,
 'ciudad': 294,
 'o': 548,
 'el': 17813,
 'campo': 60,
 'es': 9686,
 'imposible': 75,
 'funcione': 8,
 'las': 2940,
 'chicas': 45,
 'estan': 793,
 'jugando': 89,
 'al': 2623,
 'voley': 3,
 'playa': 93,
 'tomas': 831,
 'mintio': 22,
 'los': 4015,
 'demas': 55,
 'no': 16388,
 'quiero': 1687,
 'hablar': 696,
 'libro': 684,
 'esta': 5740,
 'aqui': 1595,
 'sabes': 364,
 'frances': 599,
 'esto': 1355,
 'ayuda': 350,
 'mucho': 10

Find the proportion of tokens that occur fewer times than a threshold value

In [21]:
# set threshold value for rare words
thresh = 2

# occurrence counts of the words seen fewer than `thresh` times
rare_counts = [count for count in source_word_freq.values() if count < thresh]

cnt = len(rare_counts)                      # number of distinct rare words
tot_cnt = len(source_word_freq)             # number of distinct words overall
freq = sum(rare_counts)                     # token occurrences belonging to rare words
tot_freq = sum(source_word_freq.values())   # token occurrences overall

print("% of rare words in vocabulary:",(cnt/tot_cnt)*100)
print("Total Coverage of rare words:",(freq/tot_freq)*100)

% of rare words in vocabulary: 42.57632425850222
Total Coverage of rare words: 1.6915094518921925


A clear explanation of why this threshold was chosen is provided in the video, and notes are available.

Rare words account for only 1.69% of all word occurrences in the training text, so we can safely replace them with the 'unknown' token.

Build a word-index pair dictionary

In [22]:
# words that occur at least `thresh` times, in their dictionary order
frequent_source_words = [
    word for word, count in source_word_freq.items() if count >= thresh
]

# assign index, starting from 2 (0 and 1 are taken by the special tokens below)
source_word_index = {
    word: idx for idx, word in enumerate(frequent_source_words, start=2)
}
cnt = len(frequent_source_words) + 2  # next unused index

# assign index to "padding" and "unknown" tokens
source_word_index['<pad>'] = 0
source_word_index['<unk>'] = 1

The integers 0 and 1 are assigned to the padding and unknown tokens, respectively.

Now we can use this word-index dictionary to represent the spanish sentences in the training data as sequences of integers. 

### Create integer sequences from the Spanish sentences

In [23]:
source_vocab = list(source_word_index)  # spanish vocabulary

# sentences to integer sequences (Spanish sentences - training data)
# Each word is looked up directly in the word-index dictionary; anything missing
# from it maps to '<unk>'. Testing membership against the `source_vocab` list
# would scan the whole vocabulary for every single token. No separate frequency
# check is needed because the dictionary only contains words that met `thresh`.
source_unk_id = source_word_index['<unk>']
source_seq_tr = [
    [source_word_index.get(word, source_unk_id) for word in sentence.split()]
    for sentence in x_tr
]

In [24]:
source_seq_tr[:10]

[[2, 3, 4, 5, 6],
 [7, 3, 8, 9, 10, 11, 12],
 [13, 14, 15, 16, 17, 18, 19],
 [13, 20, 21, 18, 22, 8, 23, 24, 8, 25, 26, 27],
 [28, 13, 29, 27, 30, 18, 31],
 [32, 33],
 [14, 34, 35, 36, 3, 37, 38, 39, 40, 41, 42, 39, 43, 44],
 [45, 46, 8, 47],
 [48, 49, 50, 51, 52, 53, 54],
 [55, 35, 56, 14, 57, 58]]

Now, Spanish sentences have been converted into sequence of numbers. These sequences can be used to train our model.

Similarly, we create sequences for the Spanish sentences in the validation dataset.

In [25]:
# sentences to integer sequences (Spanish sentences - validation data)
# Each word is looked up directly in the word-index dictionary; anything missing
# from it (including words never seen in training) maps to '<unk>'. This avoids
# scanning a vocabulary list for every token, and no separate frequency check is
# needed because the dictionary only contains words that met `thresh`.
source_unk_id = source_word_index['<unk>']
source_seq_val = [
    [source_word_index.get(word, source_unk_id) for word in sentence.split()]
    for sentence in x_val
]

In [26]:
# word -> occurrence count over the English training sentences
target_word_freq = build_vocab(y_tr)

In [27]:
thresh = 2  # set threshold count

# occurrence counts of the English words seen fewer than `thresh` times
rare_counts = [count for count in target_word_freq.values() if count < thresh]

cnt = len(rare_counts)                      # number of distinct rare words
tot_cnt = len(target_word_freq)             # number of distinct words overall
freq = sum(rare_counts)                     # token occurrences belonging to rare words
tot_freq = sum(target_word_freq.values())   # token occurrences overall

print("Vocabulary of rare words:",(cnt/tot_cnt)*100)
print("Total Coverage of rare words:",(freq/tot_freq)*100)

Vocabulary of rare words: 33.82568957075241
Total Coverage of rare words: 0.6924074061700265


Build a dictionary that assigns index to every word in the vocabulary by removing the rare words

In [28]:
# words that occur at least `thresh` times, in their dictionary order
frequent_target_words = [
    word for word, count in target_word_freq.items() if count >= thresh
]

# start assigning index from 4: 0 and 1 go to the special tokens below, while
# 2 and 3 are left unassigned in this cell (presumably reserved for other
# special tokens - TODO confirm against the later cells)
target_word_index = {
    word: idx for idx, word in enumerate(frequent_target_words, start=4)
}
cnt = len(frequent_target_words) + 4  # next unused index

# Assign index to "padding" and "unknown" token
target_word_index['<pad>'] = 0
target_word_index['<unk>'] = 1

26.00

After completing language translation, we have the text summarization concept, which is nicely explained using the same deep learning architecture.