In [27]:
# 75.06 - 1C202 - TP2 
# Competencia de Machine learning
#
# https://www.kaggle.com/c/nlp-getting-started/overview
#
# Fuentes:
# https://realpython.com/python-keras-text-classification/
# https://towardsdatascience.com/data-augmentation-in-nlp-2801a34dfc28

# Importacion de librerias necesarias
import re, string, random, datetime
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Split y K-Fold 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Importacion de Keras
# https://keras.io
from keras.models import Model, Sequential, save_model, load_model
from keras.layers import Embedding, Conv1D, Dropout, Input, GlobalMaxPooling1D, Dense, concatenate, Activation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Importación para MLP.
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input

from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import MinMaxScaler
import glob
#import cv2
import os


#Configuracion general
plt.style.use('ggplot')
warnings.filterwarnings('ignore')

In [28]:
def procesar_categoricas_numericas(df, train, test):
    # Definimos las columnas continuas.
    continuous = ["length", "totalwords"]
    # Normalizamos las columnas a un rango de [0, 1]
    cs = MinMaxScaler()
    trainContinuous = cs.fit_transform(train[continuous])
    testContinuous = cs.transform(test[continuous])
    # Aplicamos un one-hot encoding de las columans para dejarlas en un rango [0, 1])
    zipBinarizer = LabelBinarizer().fit(df["keyword"])
    trainKeywordCategorical = zipBinarizer.transform(train["keyword"])
    testKeywordCategorical = zipBinarizer.transform(test["keyword"])
    # Concatenamos las columnas categóricas y las numéricas a usar.
    trainX = np.hstack([trainKeywordCategorical, trainContinuous])
    testX = np.hstack([testKeywordCategorical, testContinuous])
    # retornamos nuestro set de datos de entrenamiento y prueba.
    return (trainX, testX)

In [29]:
def create_mlp(dim, regress=False):
    # Defeninimos la red perceptron.
    model = Sequential()
    model.add(Dense(8, input_dim=dim, activation="relu"))
    model.add(Dense(4, activation="relu"))
    if regress:
        model.add(Dense(1, activation="linear"))
    # retornamos el modelo.
    return model

In [30]:
# Generado train: 'data/processed/train.2020.08.03T16.57.04.289197.csv' - (7613) registros.
# Generado test:  'data/processed/test.2020.08.03T16.57.04.289197.csv' - (3263) registros.

In [31]:
# Path de entrada    
train_path = 'data/processed/train.2020.08.03T16.57.04.289197.csv' #Cambiar por el archivo pre-procesado a usar!
test_path = 'data/processed/test.2020.08.03T16.57.04.289197.csv' #Cambiar por el archivo pre-procesado a usar!

# Configuracion
maxlen = 100

# Carga de los archivos del set de datos
# Set de train: carga
df_twitter_train = pd.read_csv(train_path, sep=',')
# Set de test: carga
df_twitter_test = pd.read_csv(test_path, sep=',')

# Print de los shapes
print('Shape train: ' + str(df_twitter_train.shape))
print('Shape test: ' + str(df_twitter_test.shape))

Shape train: (7613, 7)
Shape test: (3263, 6)


In [32]:
df_twitter_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          7613 non-null   int64 
 1   keyword     7613 non-null   object
 2   location    7613 non-null   object
 3   text        7613 non-null   object
 4   target      7613 non-null   int64 
 5   length      7613 non-null   int64 
 6   totalwords  7613 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 416.5+ KB


In [33]:
# Nos quedamos con X e y para entrenar
tweets = df_twitter_train['text'].values
y = df_twitter_train['target'].values

# Separamos X para el set de test
tweets_predict = df_twitter_test['text'].values

# Tokenizamos los textos
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweets)

# Convertimos a secuencias
X = tokenizer.texts_to_sequences(tweets)
X_predict = tokenizer.texts_to_sequences(tweets_predict)

# Paddeamos a maxlen
X = pad_sequences(X, padding='post', maxlen=maxlen)
X_predict = pad_sequences(X_predict, padding='post', maxlen=maxlen)

In [34]:
# Path de salida para el submission
submission_path = 'data/submits/submission.' + datetime.datetime.now().isoformat() + '.csv'
submission_path = submission_path.replace('-','.').replace(':','.')

# Cargamos el modelo con el mejor puntaje
filepath = 'models.backup.81/' + 'TP2.Keras.Conv1D.Iter.2' + '.h5'
loaded_model = load_model(filepath, custom_objects=None, compile=True)

# Mostramos el detalle del modelo
loaded_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 300)     4953900     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 99, 100)      60100       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 98, 100)      90100       embedding[0][0]                  
____________________________________________________________________________________________

In [35]:
(trainAttrX, testAttrX) = train_test_split(df_twitter_train, test_size=0.25, random_state=42)
(trainAttrX, testAttrX) = procesar_categoricas_numericas(df_twitter_train,trainAttrX, testAttrX)

In [36]:
mlp = create_mlp(trainAttrX.shape[1], regress=False)

In [37]:
# create the input to our final set of layers as the *output* of boththe MLP and CNN
combinedInput = concatenate([mlp.output, loaded_model.output])

# our final FC layer head will have two dense layers, the final one
# being our regression head
x = Dense(4, activation="relu")(combinedInput)
x = Dense(1, activation="linear")(x)

In [38]:
# our final model will accept categorical/numerical data on the MLP
# input and images on the CNN input, outputting a single value (the
# predicted price of the house)
model = Model(inputs=[mlp.input, loaded_model.input], outputs=x)

ValueError: The name "concatenate" is used 2 times in the model. All layer names should be unique.

In [4]:
# Prediccion en base al modelo entrenado
y_predict = (loaded_model.predict(X_predict) > 0.5).astype('int32')

# Generacion del dataframe que generara el submit para la competencia
kaggle_submission = pd.DataFrame(df_twitter_test,columns = ['id'])
kaggle_submission['target'] = y_predict

kaggle_submission.to_csv(submission_path, index=False)

# Imprimimos un resumen de la operacion
print('Operación finalizada!\n')
print('Generado submit: \'' + submission_path + '\' - (' + str(len(kaggle_submission['target'].index)) + ') registros.')

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 100)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 300)     4953900     input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 99, 100)      60100       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 98, 100)      90100       embedding[0][0]                  
____________________________________________________________________________________________