In [37]:
import warnings
# Importacion de librerias necesarias
import re, string, random, datetime
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
import tensorflow as tf

# Importacion de librerias necesarias
import re, string, random, datetime
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import f1_score

# Split y K-Fold 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# Importacion de Keras
# https://keras.io
from keras.models import Model, Sequential, save_model, load_model
from keras import layers
from keras.layers import Embedding, Conv1D, Dropout, Input, GlobalMaxPooling1D, Dense, concatenate, Activation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [38]:
def f_remove_noise(text):
    """Normalize a raw tweet into lowercase ASCII text.

    The steps run in this order: lowercase, drop square-bracketed spans,
    drop URLs, drop HTML-like tags, drop ASCII punctuation, drop newlines,
    drop every token that contains a digit, drop non-ASCII characters.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Matches are deleted (not replaced by a space),
        so runs of spaces can remain and words separated only by a newline
        end up joined together.
    """
    # All patterns are raw strings: in a normal literal, sequences such as
    # \[ \S \w \d are invalid string escapes that Python warns about.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

In [39]:
from spellchecker import SpellChecker

In [40]:
# Single shared checker instance, reused by every correct_spellings() call.
spell = SpellChecker()
def correct_spellings(text):
    """Replace words the spell checker does not know with its best correction.

    Args:
        text: Whitespace-separated words.

    Returns:
        The words re-joined with single spaces. A word is left untouched when
        the checker knows it or when the checker offers no correction.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = []
    for word in words:
        if word in misspelled_words:
            # correction() may return None when it has no candidate; joining
            # None would raise TypeError, so fall back to the original word.
            corrected_text.append(spell.correction(word) or word)
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)
        
def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # A translation table that maps a code point to None drops that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)

# Reference : https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(text):
    """Delete every run of characters that falls in the code-point ranges below.

    Note that the last range spans U+24C2 to U+1F251, so it removes far more
    than pictographs (any character in that interval is dropped).
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # flags (iOS)
        u"\U00002702-\U000027B0",
        u"\U000024C2-\U0001F251",
    )
    # One character class containing all ranges, matched greedily.
    matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)

In [41]:
# Input paths of the train and test CSV files (relative to the working directory).
train_path = 'data/train.csv'
test_path = 'data/test.csv'

In [42]:
# Read the train and test splits of the dataset from their CSV files.
df_twitter_train = pd.read_csv(train_path, sep=',')
df_twitter_test = pd.read_csv(test_path, sep=',')

# Report the (rows, columns) shape of each frame.
print(f'Shape train: {df_twitter_train.shape}')
print(f'Shape test: {df_twitter_test.shape}')

Shape train: (7613, 5)
Shape test: (3263, 4)


In [None]:
# Run the same cleaning pipeline over the 'text' column of both frames,
# overwriting the column in place. Step order matters: noise removal first,
# then spelling correction, then punctuation and emoji stripping.
for tweets in (df_twitter_train, df_twitter_test):
    tweets['text'] = (
        tweets['text']
        .apply(f_remove_noise)
        .apply(correct_spellings)
        .apply(remove_punct)
        .apply(remove_emoji)
    )

In [None]:
# Per-keyword statistics over the training set: how many tweets carry the
# keyword ('Count') and the mean of 'target' for them ('Disaster Probability').
# Aggregations are given by name: passing NumPy callables such as np.mean to
# groupby.agg is deprecated in recent pandas.
keyword_stats = (
    df_twitter_train
    .groupby('keyword')
    .agg({'text': 'size', 'target': 'mean'})
    .rename(columns={'text': 'Count', 'target': 'Disaster Probability'})
)

# Keywords whose training tweets are all labelled 1, respectively all labelled 0.
keywords_disaster = keyword_stats.loc[keyword_stats['Disaster Probability'] == 1]
keywords_no_disaster = keyword_stats.loc[keyword_stats['Disaster Probability'] == 0]

# Display the ten keywords with the highest mean target.
keyword_stats.sort_values('Disaster Probability', ascending=False).head(10)

In [None]:
# Hold out 10% of the cleaned training tweets for validation; the fixed seed
# makes the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_twitter_train['text'].values,
    df_twitter_train["target"].values,
    test_size=0.10,
    random_state=123,
)

In [None]:
# TF-IDF features over word 1- to 3-grams. Stop words are not removed.
tfidf_options = dict(
    min_df=2,                 # drop terms found in fewer than 2 documents
    use_idf=True,             # weight term frequency by inverse document frequency
    norm='l2',                # L2-normalize each document vector
    smooth_idf=True,          # add one to document frequencies (avoids division by zero)
    ngram_range=(1, 3),
    analyzer='word',
    strip_accents='unicode',
    decode_error='replace',
)
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary is learned from the training texts only; the validation
# texts are projected onto that same vocabulary.
x_train = vectorizer.fit_transform(train_texts)
x_val = vectorizer.transform(val_texts)

In [None]:
# Keep at most 10,000 features, ranked by f_classif against the training
# labels. The selector is fitted on the training matrix only.
top_k = min(10000, x_train.shape[1])
selector = SelectKBest(f_classif, k=top_k).fit(x_train, train_labels)

# Reduce both matrices to the selected columns and cast them to float32.
x_train = selector.transform(x_train).astype('float32')
x_val = selector.transform(x_val).astype('float32')

In [None]:
# Model hyperparameters.
learning_rate = 1e-4
epochs = 1000
batch_size = 128
# Named num_layers (not `layers`) so it does not overwrite the `layers`
# module that the import cell brings in from keras.
num_layers = 2
units = 64
dropout_rate = 0.2

# Build the MLP: input dropout, (num_layers - 1) hidden Dense+Dropout pairs,
# and one sigmoid unit for the binary target. Everything comes from the
# public tf.keras namespace so the model container and its layers belong to
# the same Keras implementation (the import cell binds Dense/Dropout from two
# different packages).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dropout(rate=dropout_rate, input_shape=x_train.shape[1:]))

for _ in range(num_layers - 1):
    model.add(tf.keras.layers.Dense(units=units, activation='relu'))
    model.add(tf.keras.layers.Dropout(rate=dropout_rate))

model.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))

In [None]:
loss = 'binary_crossentropy'

# Early stopping on validation loss: a fold stops training once the loss has
# not improved for 5 consecutive epochs.
callbacks = [tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)]

# K-fold cross-validation settings.
# Path prefix under which each fold's model is saved.
model_path = 'models/TP2.MLP.Iter.'
num_folds = 20

# Per-fold results.
# Train
acc_per_fold_train = []
loss_per_fold_train = []
# Validation
acc_per_fold_validation = []
loss_per_fold_validation = []

# Shuffled K-fold splitter; the seed makes the fold assignment repeatable.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=123)

# Dense copy of the training matrix so it can be indexed by fold.
X = x_train.toarray()
y = train_labels


# One training run per fold.
fold_no = 1
print('Total de folds: ', num_folds)
for train, validation in kfold.split(X, y):

    # Progress banner.
    print('------------------------------------------------------------------------')
    print(f'Entrenando, fold {fold_no}...')

    # Every fold starts from a fresh copy of the architecture with newly
    # initialized weights and its own optimizer. Training one shared model
    # across folds would carry weights over, so later folds would be
    # validated on rows the model had already been trained on.
    fold_model = tf.keras.models.clone_model(model)
    fold_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=loss,
        metrics=['acc'],
    )

    # Train and validate the fold's model.
    history = fold_model.fit(X[train], y[train],
            epochs=epochs,
            callbacks=callbacks,
            validation_data = (X[validation], y[validation]),
            verbose=0,  # Silent: no per-epoch logging.
            batch_size=batch_size)

    # Report the metrics of the last epoch that ran.
    history = history.history
    print('Validation accuracy: {acc}, loss: {loss}'.format(acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

    # Metrics on the fold's training rows.
    scores_train = fold_model.evaluate(X[train], y[train], verbose=0)
    print(f'Puntaje de entrenamiento para el fold {fold_no}: {fold_model.metrics_names[0]} de {scores_train[0]}; {fold_model.metrics_names[1]} de {scores_train[1]*100}%')
    acc_per_fold_train.append(scores_train[1] * 100)
    loss_per_fold_train.append(scores_train[0])

    # Metrics on the fold's validation rows.
    scores_validation = fold_model.evaluate(X[validation], y[validation], verbose=0)
    print(f'Puntaje de validación para el fold {fold_no}: {fold_model.metrics_names[0]} de {scores_validation[0]}; {fold_model.metrics_names[1]} de {scores_validation[1]*100}%')
    acc_per_fold_validation.append(scores_validation[1] * 100)
    loss_per_fold_validation.append(scores_validation[0])

    # Save this fold's model; the '.h5' suffix selects the HDF5 format.
    fold_model.save(model_path + str(fold_no) + '.h5')

    # Next fold.
    fold_no = fold_no + 1

# Averages over all folds.
print('\n')
print('Puntaje promedio de entrenamiendo, para todos los folds:')
print(f'> Accuracy: {np.mean(acc_per_fold_train)} (+- {np.std(acc_per_fold_train)})')
print(f'> Loss: {np.mean(loss_per_fold_train)}')
print('\n')
print('Puntaje promedio de validación, para todos los folds:')
print(f'> Accuracy: {np.mean(acc_per_fold_validation)} (+- {np.std(acc_per_fold_validation)})')
print(f'> Loss: {np.mean(loss_per_fold_validation)}')
print('------------------------------------------------------------------------')

In [None]:
# Vectorize the whole cleaned training set with the already-fitted TF-IDF
# vectorizer and feature selector.
y = df_twitter_train["target"].values
x_all = vectorizer.transform(df_twitter_train['text'].values)
x_all = selector.transform(x_all)

# Load the saved fold model with the highest validation accuracy (the first
# one on ties). Fold files are numbered from 1, list positions from 0.
best_fold = acc_per_fold_validation.index(max(acc_per_fold_validation)) + 1
filepath = model_path + str(best_fold) + '.h5'
loaded_model = load_model(filepath, custom_objects=None, compile=True)

# Sequential.predict_classes no longer exists in current TensorFlow; for a
# single sigmoid output the class is the probability thresholded at 0.5.
y_predict = (loaded_model.predict(x_all.toarray()) > 0.5).astype('int32').ravel()

# NOTE: x_all covers every training row, including rows the model was fitted
# on, so this F1 is not a held-out estimate.
score = f1_score(y, y_predict, average='weighted')
print("*"*50+"\n MLP Model f1_score: {:.5f}\n".format(score)+"*"*50)

In [None]:
# NOTE(review): every other path in this notebook is under 'data/', this one
# is under '../data/' — confirm which location is intended.
original_sample_submission = pd.read_csv('../data/sample_submission.csv')

# Vectorize the cleaned test tweets with the fitted vectorizer and selector.
test_all = vectorizer.transform(df_twitter_test['text'].values)
test_all = selector.transform(test_all)

# Sequential.predict_classes no longer exists in current TensorFlow; for a
# single sigmoid output the class is the probability thresholded at 0.5.
y_predict = (loaded_model.predict(test_all.toarray()) > 0.5).astype('int32').ravel()

# Keyword overrides: force the prediction for test tweets whose keyword was
# labelled all-1 (or all-0) in the training set. Boolean masks select rows by
# position, so this does not depend on the frame's index labels.
has_disaster_keyword = df_twitter_test['keyword'].isin(list(keywords_disaster.index)).to_numpy()
has_no_disaster_keyword = df_twitter_test['keyword'].isin(list(keywords_no_disaster.index)).to_numpy()
y_predict[has_disaster_keyword] = 1
y_predict[has_no_disaster_keyword] = 0

# Output path of the submission: timestamped, with '-' and ':' turned into
# '.' so the file name has no characters that are awkward on some systems.
submission_path = 'data/submits/submission.MLP.' + datetime.datetime.now().isoformat() + '.csv'
submission_path = submission_path.replace('-','.').replace(':','.')

# Fill the sample submission's 'target' column, write it out, and preview it.
original_sample_submission["target"] = y_predict
original_sample_submission.to_csv(submission_path, index=False)
original_sample_submission.head()

In [None]:
# Show the path of the submission file.
print(submission_path)

In [None]:
# Show the path of the saved model file.
print(filepath)

In [None]:
# CNN = pd.read_csv('data/submits/submission.2020.08.03T21.22.36.100631.csv')
# XGBOOST = pd.read_csv('data/submits/submission_XGB_12.csv')
# MLP = pd.read_csv('data/submits/submission_MLP_08.csv')

In [None]:
# ENSAMBLE CON: 0.8109102053325161
# ediccion =(CNN["target"]+XGBOOST["target"]+MLP["target"])/3
# y_pred_ENS = np.where(ediccion>0.5, 1, 0)

In [None]:
# ENSAMBLE CON: 0.8026356114005516
# ediccion =(CNN["target"]+MLP["target"])/2
# y_pred_ENS = np.where(ediccion>0.5, 1, 0)

In [None]:
# ENSAMBLE CON: 0.7716825007661661
# ediccion =(XGBOOST["target"]+MLP["target"])/2
# y_pred_ENS = np.where(ediccion>0.5, 1, 0)

In [None]:
# ENSAMBLE CON: 0.7805700275819798
# ediccion =(CNN["target"]+XGBOOST["target"])/2
# y_pred_ENS = np.where(ediccion>0.5, 1, 0)