In [0]:
import tensorflow as tf

import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn.preprocessing import LabelEncoder

In [2]:
#PRINT VERSION!!
tf.__version__

'2.2.0'

In [0]:
filepath = ('https://media.githubusercontent.com/media/PacktPublishing/Advanced-NLP-Projects-with-TensorFlow-2.0/master/section_1_notebooks/train_languages.csv')

In [4]:
train_df = pd.read_csv(filepath)#here we have the dataset we extracted
train_df.head()

Unnamed: 0,sentence,language
0,"Jean Beauverie (Fontaines-sur-Saône, 18 febbra...",italian
1,Il pinguino saltarocce (Eudyptes chrysocome (F...,italian
2,Maison Ikkoku - Cara dolce Kyoko (めぞん一刻 Mezon ...,italian
3,La mia città è un singolo della cantante itali...,italian
4,L'Armata Rossa dei Lavoratori e dei Contadini ...,italian


In [5]:
len(train_df) #we print the length, not a big one but sufficient

3633

In [0]:
Y = train_df['language']
encoder = LabelEncoder()
encoder.fit(Y)
Y = encoder.transform(Y)
Y = tf.keras.utils.to_categorical(
    Y,
    num_classes=4                       #equals to the number of languages
    
)

In [0]:
train_df['sentence_lower'] = train_df["sentence"].str.lower()
train_df['sentence_no_punctuation'] = train_df['sentence_lower'].str.replace('[^\w\s]','')
train_df['sentence_no_punctuation'] = train_df["sentence_no_punctuation"].fillna("fillna")

In [8]:
print(train_df['sentence_no_punctuation'])

0       jean beauverie fontainessursaône 18 febbraio 1...
1       il pinguino saltarocce eudyptes chrysocome for...
2       maison ikkoku  cara dolce kyoko めぞん一刻 mezon ik...
3       la mia città è un singolo della cantante itali...
4       larmata rossa dei lavoratori e dei contadini i...
                              ...                        
3628    el premio internacional de novela emilio alarc...
3629    la mujer más fea del mundo es una película esp...
3630    bacuag también conocido como  bacnag es un mun...
3631    violent femmes es una banda de rock alternativ...
3632    james guthrie grenock escocia10 de junio de 18...
Name: sentence_no_punctuation, Length: 3633, dtype: object


In [0]:
max_features=5000                #we set maximum number of words to 5000
maxlen=400                       #we set maximum sequence length to 400

In [0]:
tok = tf.keras.preprocessing.text.Tokenizer(num_words=max_features)                 #again tokenizer step

In [0]:
tok.fit_on_texts(list(train_df['sentence_no_punctuation'])) #fit to cleaned text

In [12]:
print(len(tok.word_index))
vocab_size = len(tok.word_index) + 1 
#this represents the number of words that we tokenize different from max_features but necessary for
#the definition of the dimension of the embedding space

51630


In [0]:
train_df = tok.texts_to_sequences(list(train_df['sentence_no_punctuation']))                   #this is how we create sequences
train_df = tf.keras.preprocessing.sequence.pad_sequences(train_df, maxlen=maxlen)              #let's execute pad step

In [0]:
from sklearn.model_selection import train_test_split                                           #divide into train and test set

In [0]:
X_train, X_test, y_train, y_test = train_test_split(train_df, Y, test_size=0.1, random_state=123)

In [0]:
embedding_dim = 50                                                                              #this is the final dimension of the embedding space.

In [0]:
model = tf.keras.models.Sequential([
  tf.keras.layers.Embedding(input_dim=vocab_size,            #embedding input
                           output_dim=embedding_dim,         #embedding output
                           input_length=maxlen),             #maximum length of an input sequence
  tf.keras.layers.Flatten(),                                 #flatten layer

  tf.keras.layers.Dense(4, activation=tf.nn.softmax)         #ouput layer a Dense layer with 4 probabilities
  #we also define our final activation function which is the softmax function typical for multiclass
  #classifiction problems

])

In [0]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',               #we recommend this loss function you
              metrics=['accuracy'])

In [19]:
model.summary()                                              #here we show the architecture

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 50)           2581550   
_________________________________________________________________
flatten (Flatten)            (None, 20000)             0         
_________________________________________________________________
dense (Dense)                (None, 4)                 80004     
Total params: 2,661,554
Trainable params: 2,661,554
Non-trainable params: 0
_________________________________________________________________


In [20]:
model.fit(np.array(X_train), np.array(y_train), epochs=10)    #let's fit the model

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7efd072886a0>

In [21]:
model.evaluate(np.array(X_test), np.array(y_test))



[0.011290659196674824, 0.9972527623176575]

In [0]:
from sklearn.metrics import confusion_matrix #we import this package from sklearn and output it
predictions = model.predict(X_test) #here we make predictions
cm = confusion_matrix(predictions.argmax(axis=1), y_test.argmax(axis=1))#we generate the confusion matrix

In [23]:
cm #well this is really perfect!

array([[ 65,   0,   0,   0],
       [  0, 101,   0,   0],
       [  1,   0,  96,   0],
       [  0,   0,   0, 101]])

In [24]:
#these are the codes for each language in order to evaluate properly
print('english', encoder.transform(['english']))
print('french', encoder.transform(['french']))
print('italian', encoder.transform(['italian']))
print('spanish', encoder.transform(['spanish']))

english [0]
french [1]
italian [2]
spanish [3]


In [0]:
#new_text = ["tensorflow is a great tool you can find a lot of tutorials from packt"]
#new_text = ["tensorflow est un excellent outil vous pouvez trouver beaucoup de tutoriels de packt"]
#new_text = ["tensorflow è un ottimo strumento puoi trovare molti tutorial di packt"]
new_text = ["tensorflow es una gran herramienta puedes encontrar muchos tutoriales de packt"]

In [0]:
test_text = tok.texts_to_sequences(new_text) #this is how we create sequences
test_text = tf.keras.preprocessing.sequence.pad_sequences(test_text, maxlen=maxlen) #let's execute pad step

In [27]:
np.set_printoptions(suppress=True)
predictions = model.predict(test_text)
print(predictions.argmax())
print(predictions) #spanish you can get confused with italian which makes sense since they are more similar languages

3
[[0.01087021 0.02892438 0.05692278 0.90328264]]


In [0]:
#!pip install wikipedia

In [0]:
import wikipedia

In [31]:
#language codes
#english: en
#italian: it
#french: fr
#spanish: es
new_wiki_text = []
wikipedia.set_lang('es')
for i in range(0, 5):
    print(i)
    random = wikipedia.random(1)
       
    try:
        new_wiki_text.append([wikipedia.page(random).summary])
    except wikipedia.exceptions.DisambiguationError as e:
        random = wikipedia.random(1)

0
1
2
3
4


In [32]:
new_wiki_text = pd.DataFrame(new_wiki_text)
new_wiki_text.columns = ['sentence']
new_wiki_text

Unnamed: 0,sentence
0,Penthesilea (título original en alemán; en esp...
1,El rascón de Wallace (Habroptila wallacii)[2]​...
2,El aeródromo Sitry (en inglés: Sitry Skiway) f...
3,Tocar el cielo es una película que se estrenó ...
4,Atrévete a soñar es la primera banda sonora de...


In [0]:
new_wiki_text['sentence_lower'] = new_wiki_text["sentence"].str.lower()
new_wiki_text['sentence_no_punctuation'] = new_wiki_text['sentence_lower'].str.replace('[^\w\s]','')
new_wiki_text['sentence_no_punctuation'] = new_wiki_text["sentence_no_punctuation"].fillna("fillna")

In [0]:
np.set_printoptions(suppress=True)
test_wiki_text = tok.texts_to_sequences(list(new_wiki_text['sentence_no_punctuation'] )) #this is how we create sequences
test_wiki_text = tf.keras.preprocessing.sequence.pad_sequences(test_wiki_text, maxlen=maxlen) #let's execute pad step

In [35]:
predictions = model.predict(test_wiki_text)
print(predictions)

[[0.00000005 0.00000296 0.0000001  0.9999969 ]
 [0.00000008 0.00000344 0.00000005 0.9999964 ]
 [0.00000033 0.00000094 0.00000127 0.9999975 ]
 [0.00000001 0.00000013 0.00000004 0.99999976]
 [0.00000731 0.00016868 0.00001349 0.9998105 ]]


In [36]:
print('english', encoder.transform(['english']))
print('french', encoder.transform(['french']))
print('italian', encoder.transform(['italian']))
print('spanish', encoder.transform(['spanish']))

english [0]
french [1]
italian [2]
spanish [3]
