In [None]:
!pip3 install tensorflow_text>=2.0.0rc0

In [None]:
import numpy as np
import pandas as pd
import re
import tqdm.notebook as tq
import string

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text

from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB

In [None]:
# Colab-specific helper module; `files` is used again at the end of the
# notebook to download the submission file.
from google.colab import files
# Upload the data files through Colab. The recorded run below shows
# test.csv and train.csv being saved.
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [None]:
# Read the two uploaded CSV files into DataFrames.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
def clean(text):
    """Normalise a raw tweet before it is embedded.

    Steps, in order:
      1. drop links (anything starting with ``http`` up to the next whitespace);
      2. replace every character that is not a letter, apostrophe, period,
         comma, digit or whitespace with a space;
      3. replace ASCII digits with a space;
      4. collapse whitespace runs (tabs and newlines included) to one space
         and strip the ends.

    Args:
        text: the raw tweet as a ``str``.

    Returns:
        The cleaned string.
    """
    # Links must go first: once ':' and '/' have been replaced by spaces the
    # URL is split into pieces and `http\S+` would only match the word "http".
    text = re.sub(r"http\S+", " ", text)
    text = re.sub(r"[^a-zA-Z\'\.\,\d\s]", " ", text)  # special characters
    text = re.sub(r"[0-9]", " ", text)  # numbers
    # Whitespace is normalised last because every step above inserts spaces.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [None]:
# Clean the `text` column of both frames in place with `clean`.
for frame in (train, test):
    frame["text"] = frame["text"].apply(clean)

In [None]:
# Load the Universal Sentence Encoder (multilingual, large, version 3) from
# TensorFlow Hub. NOTE(review): `tensorflow_text` is imported above but never
# referenced directly; presumably it registers ops this model needs - confirm.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

In [None]:
# Sanity check: encode one string. The recorded output below is a float32
# tensor of shape (1, 512).
embed('hello world')

<tf.Tensor: shape=(1, 512), dtype=float32, numpy=
array([[ 0.05679859,  0.00039966, -0.04721997, -0.00237391, -0.00133101,
        -0.07340868,  0.03015386, -0.01155163, -0.04641062, -0.02860397,
         0.00596971, -0.06006122,  0.0050675 ,  0.02867028,  0.00816475,
         0.00257079,  0.01661796,  0.05151922,  0.06586424, -0.0104136 ,
         0.02593341,  0.00465988,  0.07797637,  0.01581948,  0.04321634,
        -0.00211051,  0.08996548,  0.00092777, -0.03003569,  0.00322175,
        -0.00294578, -0.03005555, -0.03486117, -0.07770569,  0.03615946,
        -0.054484  , -0.0265274 ,  0.00016399,  0.02643614, -0.06270367,
         0.02216633,  0.00972736, -0.03037143,  0.06702992,  0.05930759,
         0.04259753,  0.03830501,  0.08515137,  0.01702011,  0.00709119,
         0.01693203, -0.00574082, -0.02684974, -0.02125344, -0.00611558,
        -0.05458139, -0.06069318,  0.03294551, -0.01941219, -0.02750246,
         0.03343957, -0.09708545, -0.05304239,  0.00502583,  0.04201486,
 

In [None]:
def embed_texts(texts, batch_size=64):
    """Encode texts with the module-level `embed` sentence encoder.

    The texts are sent to the encoder in batches rather than one call per
    tweet, which removes most of the per-call overhead.

    Args:
        texts: iterable of strings to encode.
        batch_size: number of texts per encoder call.

    Returns:
        A 2-D numpy array with one embedding row per input text, in input
        order. For empty input an empty 1-D array is returned.
    """
    texts = list(texts)
    chunks = []
    # The progress bar advances once per batch.
    for start in tq.tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]
        # The encoder returns one row per sentence in the batch.
        chunks.append(embed(batch).numpy())
    if not chunks:
        return np.array([])
    return np.concatenate(chunks, axis=0)


X_train = embed_texts(train.text.values)
y_train = train.target.values

X_test = embed_texts(test.text.values)

HBox(children=(FloatProgress(value=0.0, max=7613.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3263.0), HTML(value='')))




In [None]:
# Hold out 10% of the labelled data for validation. `random_state` is fixed so
# the partition, and every metric computed on it, is the same on each run.
train_arrays, test_arrays, train_targets, test_targets = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# SVM con GridSearch

In [None]:
def select_parameters(X, y, parameters=None):
    """Grid-search an SVC over `parameters` with cross-validation.

    Args:
        X: feature matrix, one row per sample.
        y: target labels aligned with `X`.
        parameters: parameter grid passed to `GridSearchCV`. Defaults to an
            RBF kernel with C in {0.5, 1, 2} and gamma in {1, 2, 3}.

    Returns:
        The fitted `GridSearchCV` object.
    """
    if parameters is None:
        parameters = {'C': [0.5, 1, 2],
                      'gamma': [1, 2, 3],
                      'kernel': ['rbf']}
    grid_search = GridSearchCV(SVC(), parameters)
    grid_search.fit(X, y)
    return grid_search


# Fit on the training partition only. Fitting on the full X_train would
# include the hold-out rows (test_arrays) and make the hold-out accuracy
# computed later meaningless.
model = select_parameters(train_arrays, train_targets)

In [None]:
# Show the parameter combination selected by the grid search.
model.best_params_

{'C': 0.5, 'gamma': 1, 'kernel': 'rbf'}

In [None]:
# Accuracy of the selected model on the full embedded labelled set (X_train / y_train).
accuracy_score(y_train,model.predict(X_train))

0.8795481413371864

In [None]:
# Accuracy on each partition produced by train_test_split, shown as a
# (train partition, hold-out partition) tuple.
train_split_accuracy = accuracy_score(train_targets, model.predict(train_arrays))
holdout_split_accuracy = accuracy_score(test_targets, model.predict(test_arrays))
train_split_accuracy, holdout_split_accuracy

(0.8643993577579915, 0.8162729658792651)

# Red Neuronal

In [None]:
# Hyperparameters for the neural-network section.
vocab_size = 50000  # vocabulary size
# vocab_size = len(word_index)+1
embedding_dim = 64  # width of each learned embedding vector
max_length = 512  # equals the width of the sentence embedding shown in the sanity-check output
# NOTE(review): padding_type and oov_tok are not referenced by any cell visible
# in this notebook; presumably left over from a tokenizer-based pipeline.
padding_type='post'
oov_tok = "<OOV>"

In [None]:
# The network is trained on fixed-size float sentence embeddings
# (train_arrays), not on integer token-id sequences. An Embedding layer is
# therefore wrong here: it casts its input to integer indices, and the
# embedding components shown in the sanity-check output are small fractions
# that all truncate to index 0. The network then sees a constant input and
# cannot learn. A dense classifier is applied to the embedding vector
# directly instead; the original dropout and dense head are kept.
input_dim = train_arrays.shape[1]

model = tf.keras.Sequential([
    tf.keras.Input(shape=(input_dim,)),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)  # single logit; the loss below uses from_logits=True
])


model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 512, 64)           3200000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 508, 5)            1605      
_________________________________________________________________
global_average_pooling1d_1 ( (None, 5)                 0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 5)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 100)               600       
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                

In [None]:
num_epochs = 3  # more epochs means a longer training time
batch_size = 25  # a larger batch size means a shorter training time
# Train on the training partition and report validation metrics on the
# hold-out partition after each epoch.
history = model.fit(
    train_arrays,
    train_targets,
    validation_data=(test_arrays, test_targets),
    batch_size=batch_size,
    epochs=num_epochs,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Evaluate the network on both partitions and print the accuracy of each.
for report_template, split_features, split_targets in (
    ("Training Accuracy: {:.4f}", train_arrays, train_targets),
    ("Testing Accuracy:  {:.4f}", test_arrays, test_targets),
):
    loss, accuracy = model.evaluate(split_features, split_targets, verbose=False)
    print(report_template.format(accuracy))

Training Accuracy: 0.5672
Testing Accuracy:  0.5984


# Submit Kaggle

In [None]:
# When the notebook is run top to bottom, `model` is the Keras network at this
# point, and its predict() returns raw logits of shape (n, 1). Those are
# thresholded at 0 (sigmoid 0.5) so the file holds 0/1 class labels rather
# than floats. If `model` is the SVM instead, its 0/1 labels pass through
# the same threshold unchanged.
raw_predictions = np.asarray(model.predict(X_test)).reshape(-1)
predict = pd.DataFrame({'target': (raw_predictions > 0).astype(int)})
# Assign by position (.values) so the result does not depend on test's index.
test['target'] = predict['target'].values
results = test[['id', 'target']]
results.to_csv('SVM-USE.csv', index=False)

In [None]:
# Download the submission CSV written by the previous cell via the Colab helper.
files.download('SVM-USE.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>