In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model,Sequential
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, Bidirectional
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
import tensorflow as tf
%matplotlib inline

In [2]:
tweets = pd.read_csv('train.csv',delimiter=',',encoding='latin-1')
tweets.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
tweets.drop(['id', 'keyword', 'location'],axis=1,inplace=True)
tweets.duplicated(subset = 'text', keep = False)
tweets.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
import string
tweets['text'] = tweets['text'].apply(lambda x: x.translate({ord(i): ' ' for i in string.punctuation}))
tweets.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this earthquake M...,1
1,Forest fire near La Ronge Sask Canada,1
2,All residents asked to shelter in place are ...,1
3,13 000 people receive wildfires evacuation or...,1
4,Just got sent this photo from Ruby Alaska as ...,1


In [5]:
tweets['text'] = tweets['text'].apply(lambda x: x.lower())

In [6]:
import io
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\franc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def remove_stopword(text):
    new_text = []
    for e in text:
        if e not in stopwords and e.isalpha():
            new_text.append(e)
    text = new_text
    return " ".join(new_text)

In [8]:
tweets['text'] = tweets['text'].str.split()
tweets['text'] = tweets['text'].apply(remove_stopword)
tweets.head()

Unnamed: 0,text,target
0,deeds reason earthquake may allah forgive us,1
1,forest fire near la ronge sask canada,1
2,residents asked shelter place notified officer...,1
3,people receive wildfires evacuation orders cal...,1
4,got sent photo ruby alaska smoke wildfires pou...,1


In [9]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer('english')
def stemm(text):
    text = [stemmer.stem(word) for word in text.split()]
    return " ".join(text)
tweets['text'] = tweets['text'].apply(stemm)
tweets.head()

Unnamed: 0,text,target
0,deed reason earthquak may allah forgiv us,1
1,forest fire near la rong sask canada,1
2,resid ask shelter place notifi offic evacu she...,1
3,peopl receiv wildfir evacu order california,1
4,got sent photo rubi alaska smoke wildfir pour ...,1


In [35]:
X_train = tweets.text
Y_train = tweets.target
le = LabelEncoder()
Y_train = le.fit_transform(Y_train)
Y_train = Y_train.reshape(-1,1)

In [37]:
#Comentar para generar submit
X_train,X_test,Y_train,Y_test = train_test_split(X_train,Y_train,test_size=0.25)

In [38]:
max_words = 10000
max_len = 150
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [39]:
def RNN():
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(max_words,50,input_length=max_len)(inputs)
    layer = LSTM(64)(layer)
    layer = Dense(10,name='out_layer_1')(layer)
    layer = Dropout(0.1)(layer)
    layer = Dense(1,name='out_layer_2')(layer)
    layer = Activation('sigmoid')(layer)
    model = Model(inputs=inputs,outputs=layer)
    return model

In [46]:
model = RNN()
model.compile(loss='binary_crossentropy',optimizer="adam",metrics=['accuracy'])
model.fit(sequences_matrix,Y_train,batch_size=65,epochs=2,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss',patience=2)])

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1873a997208>

In [47]:
#Comentar para generar submit
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)
accr = model.evaluate(test_sequences_matrix,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.455
  Accuracy: 0.802


LSTM - Test.csv

In [26]:
tweets_test = pd.read_csv('test.csv',delimiter=',',encoding='latin-1')
tweets_test.drop(['keyword', 'location'],axis=1,inplace=True)
tweets_test.duplicated(subset = 'text', keep = False)
tweets_test['text'] = tweets_test['text'].str.split()
tweets_test['text'] = tweets_test['text'].apply(remove_stopword)
tweets_test['text'] = tweets_test['text'].apply(stemm)
tweets_test.head()

Unnamed: 0,id,text
0,0,just happen terribl car crash
1,2,heard differ stay safe
2,3,forest fire spot gees flee across i cannot save
3,9,apocalyps
4,11,typhoon soudelor kill china taiwan


In [27]:
X_test = tweets_test.text

In [28]:
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [32]:
submission = pd.DataFrame()
submission1['id'] = tweets_test['id']
submission1['prob'] = model.predict(test_sequences_matrix)
submission1['target'] = submission1['prob'].apply(lambda x: 0 if x < .5 else 1)
del submission1["prob"]
submission1.head(10)

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [34]:
submission1.to_csv("submit_prueba_7.csv", index=False)