In [1]:
#!pip install --ignore-installed tensorflow-gpu --user

In [6]:
# Libraries; there is no need to import them again
import pandas as pd

from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers, Sequential

# Keras ships with TensorFlow 2.0; to use it go through tf.keras, everything else works as before

# Turn on memory growth for every GPU that TensorFlow can see.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    # With no GPU present the list is empty and this loop does nothing.
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    # Report the problem and carry on instead of stopping the notebook.
    print(e)


# Execute the shared Util notebook stored under ../0_Data.
# NOTE(review): presumably it defines helpers for later cells; none are
# referenced in the visible code, so confirm this is still required.
%run ../0_Data/Util.ipynb

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\germa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\germa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\germa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Settings used by the tokenizer, the padding step and the embedding layer.

# Size of each word vector (Embedding output_dim).
embed_size = 300

# Vocabulary cap: Tokenizer num_words and Embedding input_dim.
max_features = 50000

# Sequence length given to pad_sequences and to Embedding input_length.
maxlen = 200

In [5]:
# --- Load data ---------------------------------------------------------------
# Both CSVs are indexed by their 'id' column; 'text' holds the raw input and
# 'target' (train only) holds the label.
df_train = pd.read_csv('../dataset/train.csv', index_col='id', encoding='utf8')
df_test = pd.read_csv('../dataset/test.csv', index_col='id', encoding='utf8')

x_train_original = df_train['text']
x_test_original = df_test['text']
train_y = df_train['target'].values

# --- Reproducibility ---------------------------------------------------------
# Fix TensorFlow's global seed before any layer is created so that weight
# initialisation (and other TF-level randomness) is repeatable between runs.
# GPU kernels may still introduce small run-to-run differences.
SEED = 42
tf.random.set_seed(SEED)

# --- Tokenize and pad --------------------------------------------------------
# The vocabulary is learned from the training texts only and capped at
# max_features entries.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(x_train_original))
train_X = tokenizer.texts_to_sequences(x_train_original)

# Bring every sequence to exactly maxlen token ids.
train_X = pad_sequences(train_X, maxlen=maxlen)

# --- Model -------------------------------------------------------------------
# Embedding -> Flatten -> Dense(10, relu) -> Dense(1, sigmoid): a binary
# classifier that outputs one probability per input text.
model = Sequential()
model.add(layers.Embedding(input_dim=max_features,
                           output_dim=embed_size,
                           input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

# --- Train -------------------------------------------------------------------
model.fit(train_X, train_y,
          epochs=6,
          batch_size=512)

# --- Evaluate ----------------------------------------------------------------
# NOTE: this accuracy is measured on the same data the model was trained on,
# so it is not an estimate of performance on unseen text.
loss, accuracy = model.evaluate(train_X, train_y, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 200, 300)          15000000  
_________________________________________________________________
flatten_1 (Flatten)          (None, 60000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 10)                600010    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 15,600,021
Trainable params: 15,600,021
Non-trainable params: 0
_________________________________________________________________
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Training Accuracy: 0.5703


In [None]:
# Encode the test texts with the tokenizer that was fitted on the training set.
test_X = tokenizer.texts_to_sequences(x_test_original)

# Pad the sequences to the same length used for training.
test_X = pad_sequences(test_X, maxlen=maxlen)

# Sequential.predict_classes is deprecated and was removed in TensorFlow 2.6.
# For a single sigmoid output the documented replacement is to threshold the
# predicted probability at 0.5; the result keeps the same (n_samples, 1) int32
# form that predict_classes produced.
y_test = (model.predict(test_X, batch_size=1024, verbose=1) > 0.5).astype("int32")

In [None]:
# Class predictions for the training set. Sequential.predict_classes is
# deprecated and was removed in TensorFlow 2.6; thresholding the sigmoid
# probability at 0.5 is the documented equivalent for a binary classifier.
pred_y = (model.predict(train_X, batch_size=1024, verbose=1) > 0.5).astype("int32")

In [None]:
# Build sklearn's classification report comparing the training labels with
# the model's predictions on that same training data, then print it.
report_text = classification_report(train_y, pred_y)
print("Report: \n{}".format(report_text))

In [None]:
# Display the predicted training-set classes as the cell output.
pred_y

In [None]:
# Load the sample submission file; it is used as the template for the output.
submission = pd.read_csv('../dataset/sample_submission.csv')

In [None]:
# Number of test predictions.
# NOTE(review): should match the row count of `submission` — confirm.
len(y_test)

In [None]:
# Write the predicted test classes into the 'target' column.
# NOTE(review): assumes sample_submission.csv rows are in the same order as
# test.csv — confirm.
submission['target'] = y_test

In [None]:
# Display the filled-in submission frame as the cell output.
submission

In [None]:
# Save the submission to tensor_embeddings.csv in the working directory,
# leaving out the DataFrame index.
submission.to_csv("tensor_embeddings.csv", index=False)