In [22]:
import warnings
import pandas as pd
import numpy as np

from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras.layers import Dropout
import tensorflow as tf

# Import the required libraries
import re, string, random, datetime
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.metrics import f1_score

In [23]:
# Limpia los datos, mediante el uso de expresiones regulares
def f_remove_noise(text):
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'[^\x00-\x7F]+','', text)
    return text

In [24]:
# Input paths of the train and test CSV files
train_path, test_path = 'data/train.csv', 'data/test.csv'

In [25]:
# Load both dataset splits from the configured CSV paths
# (',' is already the default separator of pd.read_csv).
df_twitter_train, df_twitter_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

# Report the shape of each split, one per line
print(
    'Shape train: ' + str(df_twitter_train.shape),
    'Shape test: ' + str(df_twitter_test.shape),
    sep='\n',
)

Shape train: (7613, 5)
Shape test: (3263, 4)


In [26]:
# Clean the tweet text of both splits; the cleaned text overwrites the
# 'text' column, so later cells only ever see the cleaned version.
df_twitter_train['text'] = df_twitter_train['text'].map(f_remove_noise)
df_twitter_test['text'] = df_twitter_test['text'].map(f_remove_noise)

In [27]:
# Per-keyword statistics over the training set: the number of tweets that
# carry the keyword ('Count') and the mean of `target` for those tweets
# ('Disaster Probability').
# Aggregations are given by name ('size', 'mean') because passing NumPy
# callables such as np.size / np.mean to `agg` is deprecated in recent pandas.
keyword_stats = (
    df_twitter_train
    .groupby('keyword')
    .agg({'text': 'size', 'target': 'mean'})
    .rename(columns={'text': 'Count', 'target': 'Disaster Probability'})
)

# Keywords whose training tweets all share one label (mean exactly 1 or exactly 0);
# they are used later to override model predictions on the test set.
keywords_disaster = keyword_stats.loc[keyword_stats['Disaster Probability'] == 1]
keywords_no_disaster = keyword_stats.loc[keyword_stats['Disaster Probability'] == 0]

# Display the ten keywords with the highest mean target
keyword_stats.sort_values('Disaster Probability', ascending=False).head(10)

Unnamed: 0_level_0,Count,Disaster Probability
keyword,Unnamed: 1_level_1,Unnamed: 2_level_1
debris,37,1.0
wreckage,39,1.0
derailment,39,1.0
outbreak,40,0.975
oil%20spill,38,0.973684
typhoon,38,0.973684
suicide%20bombing,33,0.969697
suicide%20bomber,31,0.967742
bombing,29,0.931034
rescuers,35,0.914286


In [28]:
#keywords_disaster

In [29]:
# Hold out 15% of the labelled tweets for validation; the fixed random_state
# makes the split repeatable.
tweet_texts = df_twitter_train['text'].values
tweet_labels = df_twitter_train["target"].values
train_texts, val_texts, train_labels, val_labels = train_test_split(
    tweet_texts,
    tweet_labels,
    test_size=0.15,
    random_state=123,
)

In [30]:
# TF-IDF settings. Entries that are commented out are disabled experiments
# and therefore stay at the scikit-learn defaults.
tfidf_options = dict(
    min_df=2,                  # ignore terms found in fewer than 2 documents
    # stop_words='english',    # (disabled) remove stop words
    # lowercase=True,          # (disabled) convert everything to lower case
    use_idf=True,              # weight terms by inverse document frequency
    norm=u'l2',                # L2-normalise each document vector
    smooth_idf=True,           # smooth the idf weights (prevents divide-by-zero)
    ngram_range=(1, 3),        # unigrams, bigrams and trigrams
    # dtype='int32',           # (disabled)
    analyzer='word',
    strip_accents='unicode',
    decode_error='replace',
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Learn vocabulary and idf weights on the training texts only, then reuse
# them unchanged for the validation texts.
x_train = vectorizer.fit_transform(train_texts)
x_val = vectorizer.transform(val_texts)

In [31]:
# Keep the best-scoring features under f_classif: at most 10000, or all of
# them when the vocabulary is smaller than that.
n_selected = min(10000, x_train.shape[1])
selector = SelectKBest(f_classif, k=n_selected)
selector.fit(x_train, train_labels)

# Reduce both matrices to the selected columns and cast them to float32
x_train = selector.transform(x_train).astype('float32')
x_val = selector.transform(x_val).astype('float32')

In [32]:
# Model hyper-parameters
learning_rate = 1e-4
epochs = 1000
batch_size = 128
layers = 2
units = 64
dropout_rate = 0.2

# MLP architecture: dropout applied to the input features, then (layers - 1)
# hidden Dense(relu) + Dropout stages, then a single sigmoid output unit.
mlp_layers = [Dropout(rate=dropout_rate, input_shape=x_train.shape[1:])]
for _hidden_stage in range(layers - 1):
    mlp_layers.append(Dense(units=units, activation='relu'))
    mlp_layers.append(Dropout(rate=dropout_rate))
mlp_layers.append(Dense(units=1, activation='sigmoid'))

model = models.Sequential()
for mlp_layer in mlp_layers:
    model.add(mlp_layer)

In [33]:
# Binary cross-entropy pairs with the single sigmoid output unit of the model.
loss = 'binary_crossentropy'
# `learning_rate` is the supported keyword; the legacy `lr` alias is
# deprecated and is rejected by newer Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss=loss, metrics=['acc'])

# Early stopping on validation loss: training stops once `val_loss` has not
# improved for 5 consecutive epochs (patience=5).
callbacks = [tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5)]

# Train and validate the model. The sparse TF-IDF matrices are densified
# with .toarray() before being handed to Keras.
history = model.fit(
        x_train.toarray(),
        train_labels,
        epochs=epochs,
        callbacks=callbacks,
        validation_data=(x_val.toarray(), val_labels),
        verbose=0,  # Silent: no per-epoch progress output.
        batch_size=batch_size)

# Print the validation metrics of the last epoch that ran (the epoch at which
# early stopping fired, which is not necessarily the best one).
history = history.history
print('Validation accuracy: {acc}, loss: {loss}'.format(
        acc=history['val_acc'][-1], loss=history['val_loss'][-1]))

Validation accuracy: 0.8099824786186218, loss: 0.44409993290901184


In [34]:
# Weighted F1 of the trained MLP over the whole labelled set. This set
# includes the rows the model was fitted on, so it is not a held-out estimate.
# NOTE(review): the recorded 0.41 is far below the ~0.81 validation accuracy
# and looks like what a constant prediction would score — confirm on a fresh run.
y = df_twitter_train["target"].values

# Same feature pipeline as training: TF-IDF followed by feature selection
x_all = vectorizer.transform(df_twitter_train['text'].values)
x_all = selector.transform(x_all)

# `Sequential.predict_classes` was removed from Keras; thresholding the sigmoid
# output at 0.5 is its equivalent for a single-unit binary classifier and
# yields the same (n, 1) int32 array.
y_predict = (model.predict(x_all.toarray(), verbose=0) > 0.5).astype('int32')

score = f1_score(y, y_predict.ravel(), average='weighted')
print("*"*50+"\n MLP Model f1_score: {:.5f}\n".format(score)+"*"*50)

**************************************************
 MLP Model f1_score: 0.41429
**************************************************


In [35]:
# Build the MLP submission for the test set.
# The test CSV is read through `test_path`, the same location used by the
# other cells, instead of a second hard-coded '../data/' path.
original_test = pd.read_csv(test_path)
# NOTE(review): this is the only remaining '../data/' path in the notebook —
# confirm where sample_submission.csv actually lives.
original_sample_submission = pd.read_csv('../data/sample_submission.csv')

# The vectorizer was fitted on text cleaned with f_remove_noise, so the test
# tweets must receive the same cleaning before being vectorised; feeding raw
# text here would make inference features differ from training features.
test_texts = original_test['text'].apply(f_remove_noise)
test_all = vectorizer.transform(test_texts.values)
test_all = selector.transform(test_all)

# `Sequential.predict_classes` was removed from Keras; thresholding the sigmoid
# output at 0.5 is its equivalent and yields the same (n, 1) int32 array.
y_predict = (model.predict(test_all.toarray(), verbose=0) > 0.5).astype('int32')

# Keyword overrides: rows whose keyword always (or never) marked a disaster in
# the training set get that label regardless of the model output. Boolean
# masks are positional, so this does not depend on the frame's index labels.
always_disaster = original_test['keyword'].isin(keywords_disaster.index).values
never_disaster = original_test['keyword'].isin(keywords_no_disaster.index).values
y_predict[always_disaster] = 1
y_predict[never_disaster] = 0

# Flatten the (n, 1) predictions into a single column and write the submission
original_sample_submission["target"] = y_predict.ravel()
original_sample_submission.to_csv("data/submits/submission_MLP_08.csv", index=False)
original_sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [36]:
# Load the saved submission file of each model (CNN, XGBoost, MLP)
CNN, XGBOOST, MLP = (
    pd.read_csv(submission_csv)
    for submission_csv in (
        'data/submits/submission.2020.08.03T21.22.36.100631.csv',
        'data/submits/submission_XGB_12.csv',
        'data/submits/submission_MLP_08.csv',
    )
)

In [37]:
# Ensemble CNN + XGBoost + MLP, recorded result: 0.8109102053325161
# Average the three `target` columns and predict 1 where the mean exceeds 0.5.
ediccion = sum(
    model_submission["target"] for model_submission in (CNN, XGBOOST, MLP)
) / 3
above_half = ediccion > 0.5
y_pred_ENS = np.where(above_half, 1, 0)

In [38]:
# Ensemble CNN + MLP, recorded result: 0.8026356114005516
# ediccion =(CNN["target"]+MLP["target"])/2
# y_pred_ENS = np.where(ediccion>0.5, 1, 0)

In [39]:
# Ensemble XGBoost + MLP, recorded result: 0.7716825007661661
# ediccion =(XGBOOST["target"]+MLP["target"])/2
# y_pred_ENS = np.where(ediccion>0.5, 1, 0)

In [40]:
# Ensemble CNN + XGBoost, recorded result: 0.7805700275819798
# ediccion =(CNN["target"]+XGBOOST["target"])/2
# y_pred_ENS = np.where(ediccion>0.5, 1, 0)

In [41]:
# Assemble the ensemble submission: the test ids plus the ensemble predictions
original_test = pd.read_csv('data/test.csv')
kaggle_submission = original_test.reindex(columns=['id'])
kaggle_submission["target"] = y_pred_ENS

# Display how many rows were predicted as each class
kaggle_submission["target"].value_counts()

0    2440
1     823
Name: target, dtype: int64

In [42]:
# Write the ensemble submission (id, target) without the index column
kaggle_submission.to_csv(
    path_or_buf="data/submits/submission_ENSAMBLE_CNN_MLP_XGB_08.csv",
    index=False,
)