In [43]:
import pandas as pd

# Read the training tweets (with a `target` label) and the test tweets
# (no label) from CSV files in the working directory.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [44]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [45]:
test_data.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [46]:
# Keep only the tweet text (plus the label in the training frame); the id,
# keyword and location columns are not used by the models below.
# Reassigning instead of dropping in place, together with errors="ignore",
# makes this cell safe to re-run: an in-place drop raises KeyError on a second
# execution because the columns are already gone.
train_data = train_data.drop(columns=["id", "keyword", "location"], errors="ignore")
test_data = test_data.drop(columns=["id", "keyword", "location"], errors="ignore")

In [47]:
train_data.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [48]:
test_data.head()

Unnamed: 0,text
0,Just happened a terrible car crash
1,"Heard about #earthquake is different cities, s..."
2,"there is a forest fire at spot pond, geese are..."
3,Apocalypse lighting. #Spokane #wildfires
4,Typhoon Soudelor kills 28 in China and Taiwan


In [49]:
train_data.shape

(7613, 2)

In [50]:
test_data.shape

(3263, 1)

In [51]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [52]:
import nltk
# Fetch the NLTK resources used below: the stop-word lists read through
# `stopwords.words(...)` and the WordNet data behind `WordNetLemmatizer`.
# Packages that are already present are reported as up-to-date.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [53]:
# Shared WordNet lemmatizer instance used by the preprocessing cells.
lemm = WordNetLemmatizer()
# Stored as a frozenset: remove_stopwords() does a membership test for every
# word of every tweet, which is a linear scan on the list returned by
# stopwords.words() but a constant-time lookup on a set.
stop_words = frozenset(stopwords.words("english"))

In [54]:
def remove_stopwords(text, words_to_drop=None):
    """Return ``text`` with stop words removed.

    The text is split on any run of whitespace (spaces, tabs, newlines), so a
    stop word that directly follows a line break is dropped as well; splitting
    on a single space left such words glued to their neighbour and unfiltered.
    The surviving words are re-joined with single spaces.

    Args:
        text: Input string. Matching is case-sensitive, so lowercase the text
            first when the stop-word collection is lowercase.
        words_to_drop: Optional collection of words to filter out. Defaults to
            the notebook-level ``stop_words``.

    Returns:
        The filtered text as a single space-separated string.
    """
    if words_to_drop is None:
        words_to_drop = stop_words
    return " ".join(word for word in text.split() if word not in words_to_drop)

def clean_text(text):
    """Strip Twitter-specific noise from a tweet.

    Removes a leading retweet marker ("RT" followed by whitespace), every
    http/https URL, and the ``#`` character (the hashtag word itself is kept).

    The retweet marker is matched case-insensitively because this notebook
    lowercases the text before calling this function, so a case-sensitive
    ``^RT`` would never match.

    Args:
        text: The tweet text.

    Returns:
        The text with the marker, URLs and ``#`` characters removed. Whitespace
        that surrounded a removed URL is left in place.
    """
    # Only a marker at the very start of the text is removed.
    text = re.sub(r'^RT\s+', '', text, flags=re.IGNORECASE)
    # A URL runs up to the next whitespace character.
    text = re.sub(r'https?://\S+', '', text)
    return text.replace('#', '')

In [55]:
def _normalize_tweet(raw_text):
    """Lowercase a tweet, lemmatize each word, drop stop words, strip URL/hashtag noise."""
    lowered = raw_text.lower()
    # WordNetLemmatizer.lemmatize() works on a single word; calling it on the
    # whole tweet string effectively returned the tweet unchanged, so apply it
    # word by word instead.
    lemmatized = " ".join(lemm.lemmatize(word) for word in lowered.split())
    return clean_text(remove_stopwords(lemmatized))


# One pass over the raw text; the original `text` column is left untouched.
train_data['new_text'] = train_data['text'].apply(_normalize_tweet)

In [56]:
# Model input is the cleaned tweet text; the label is the `target` column.
X, y = train_data['new_text'], train_data['target']

In [57]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [58]:
# Fit the vocabulary on the training text only; later texts are encoded with
# this same fitted tokenizer.
token = Tokenizer(
    oov_token="<OOV>",  # words outside the vocabulary map to this token
    num_words=10000,    # cap on the word indices used when encoding
)
token.fit_on_texts(X)
word_index = token.word_index

# Encode each tweet as a list of word indices, then pad/truncate at the end so
# every row has exactly 50 entries.
training_seq = token.texts_to_sequences(X)
train_padded = pad_sequences(
    training_seq,
    maxlen=50,
    padding="post",
    truncating="post",
)

In [59]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation (fixed seed for a repeatable split).
# Note: X_test / y_test are this validation split, not the rows of `test_data`.
X_train, X_test, y_train, y_test = train_test_split(
    train_padded,
    y,
    random_state=42,
    test_size=0.2,
)

In [60]:
# Stacked bidirectional-LSTM classifier over the padded word-index sequences.
# NOTE(review): the Embedding layer is created without mask_zero, so padded
# positions are fed through the LSTMs like ordinary tokens — confirm this is intended.
model = keras.models.Sequential([
    keras.layers.Embedding(10000, 128),  # 10000 matches Tokenizer(num_words=10000)
    keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(1, activation="sigmoid"),  # single sigmoid output unit
])

In [61]:
# RMSprop optimizer with binary cross-entropy loss; accuracy is tracked as a metric.
model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [37]:
# NOTE(review): this cell's saved execution count (37) is lower than that of the
# cells that build and compile `model` (60, 61), and the saved predictions
# further down are all ~0.49 — presumably `model` was re-created after this fit
# ran, so those predictions come from untrained weights. Re-run the notebook
# top to bottom (Restart & Run All) before trusting the predictions.
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=2,
    validation_data=(X_test, y_test),
)

Epoch 1/2
Epoch 2/2


In [62]:
# The tokenizer vocabulary was fitted on cleaned training text (lowercased,
# stop words removed, URLs and '#' stripped). Encoding the raw test tweets
# would feed the model tokens it was never trained on (e.g. URL fragments),
# so clean the test text before tokenizing it.
test_text = test_data['text'].apply(
    lambda tweet: clean_text(
        remove_stopwords(
            " ".join(lemm.lemmatize(word) for word in tweet.lower().split())
        )
    )
)
# Encode with the tokenizer fitted on the training text (never re-fit on test)
# and pad/truncate to the same length of 50 used for training.
test_seq = token.texts_to_sequences(test_text)
test_pad = pad_sequences(test_seq, padding="post", truncating="post", maxlen=50)

In [63]:
# Score every padded test tweet; the model's final layer is a single sigmoid
# unit, so `pred` holds one value between 0 and 1 per row.
pred = model.predict(test_pad)



In [64]:
test_data.shape

(3263, 1)

In [65]:
len(pred)

3263

In [66]:
pred

array([[0.49425974],
       [0.4938429 ],
       [0.4962381 ],
       ...,
       [0.4939304 ],
       [0.4944204 ],
       [0.4942906 ]], dtype=float32)

In [67]:
# Threshold the sigmoid scores at 0.5 to get hard 0/1 labels.
target = (pred > 0.5).astype(int)
# `pred` (and therefore `target`) is a 2-D array with one column; flatten it
# explicitly for the column assignment instead of relying on pandas to accept
# a 2-D array as a single column.
test_data['target'] = target.ravel()

In [68]:
test_data.shape

(3263, 2)

## GRU

In [69]:
# Same architecture as the LSTM model above, with GRU cells in place of LSTM cells.
# NOTE(review): the Embedding layer is created without mask_zero, so padded
# positions are fed through the GRUs like ordinary tokens — confirm this is intended.
gru_model = keras.models.Sequential([
    keras.layers.Embedding(10000, 128),
    keras.layers.Bidirectional(keras.layers.GRU(64, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.GRU(64)),
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dropout(0.4),
    keras.layers.Dense(1, activation="sigmoid"),  # single sigmoid output unit
])

In [70]:
# Same training setup as the LSTM model: RMSprop, binary cross-entropy, 2 epochs,
# validated on the held-out split.
gru_model.compile(
    optimizer="rmsprop",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
# NOTE(review): this rebinds `history`, replacing the value returned by the
# earlier model.fit(...) call.
history = gru_model.fit(
    x=X_train,
    y=y_train,
    epochs=2,
    validation_data=(X_test, y_test),
)

Epoch 1/2
Epoch 2/2
