In [1]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
tf.__version__

'2.5.0'

In [3]:
train = pd.read_csv('train.csv')

In [4]:
train.shape

(7613, 5)

In [5]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
X = train.iloc[:,:-1]
X.head()

Unnamed: 0,id,keyword,location,text
0,1,,,Our Deeds are the Reason of this #earthquake M...
1,4,,,Forest fire near La Ronge Sask. Canada
2,5,,,All residents asked to 'shelter in place' are ...
3,6,,,"13,000 people receive #wildfires evacuation or..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...


In [7]:
X = X.drop(['keyword','location'],axis=1)

In [8]:
X.shape

(7613, 2)

In [9]:
y = train['target']

In [10]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [11]:
X.isnull().values.any()

False

In [12]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from keras.layers import Dropout

In [13]:
tweets = X.copy()

In [14]:
tweets['text'][1]

'Forest fire near La Ronge Sask. Canada'

In [15]:
tweets.reset_index(inplace=True)

<h3>Text Cleaning and preprocessing </h3>

In [16]:
import nltk
from nltk.corpus import stopwords

In [17]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def text_preprocess(text):
    text0 = re.sub('[^a-zA-Z]', ' ',text)
    text0 = re.sub(r"\s+[a-zA-Z]\s+", ' ', text0)
    text0 = re.sub(r'\s+', ' ', text0)
    text0 = text0.lower()
    text0 = text0.split()
    text0 = [ps.stem(word) for word in text0 if not word in stopwords.words('english')]
    text0 = ' '.join(text0)
    return text0
    

    
    
        

In [18]:
corpus=[]
for i in range(len(tweets)):
    review = text_preprocess(tweets['text'][i])
    corpus.append(review)

In [19]:
corpus

['deed reason earthquak may allah forgiv us',
 'forest fire near la rong sask canada',
 'resid ask shelter place notifi offic evacu shelter place order expect',
 'peopl receiv wildfir evacu order california',
 'got sent photo rubi alaska smoke wildfir pour school',
 'rockyfir updat california hwi close direct due lake counti fire cafir wildfir',
 'flood disast heavi rain caus flash flood street manit colorado spring area',
 'top hill see fire wood',
 'emerg evacu happen build across street',
 'afraid tornado come area',
 'three peopl die heat wave far',
 'haha south tampa get flood hah wait second live south tampa gonna gonna fvck flood',
 'rain flood florida tampabay tampa day lost count',
 'flood bago myanmar arriv bago',
 'damag school bu multi car crash break',
 'man',
 'love fruit',
 'summer love',
 'car fast',
 'goooooooaaaaaal',
 'ridicul',
 'london cool',
 'love ski',
 'wonder day',
 'looooool',
 'way eat shit',
 'nyc last week',
 'love girlfriend',
 'cooool',
 'like pasta',
 '

In [20]:
voc_size = 5000

In [21]:
onehot_repr = [one_hot(words,voc_size) for words in corpus]
onehot_repr

[[75, 2381, 3355, 2639, 2951, 3600, 1254],
 [1666, 210, 4726, 3410, 1817, 4396, 4873],
 [1717, 2590, 2656, 3289, 2868, 1023, 2770, 2656, 3289, 3286, 112],
 [1072, 1602, 3114, 2770, 3286, 1380],
 [2932, 3052, 581, 3170, 1350, 409, 3114, 2143, 4887],
 [4314, 1572, 1380, 1818, 479, 2356, 72, 4532, 4565, 210, 3895, 3114],
 [1520, 2964, 1527, 1053, 4146, 1415, 1520, 451, 8, 2037, 2990, 1487],
 [2579, 2382, 4871, 210, 3986],
 [4110, 2770, 3068, 1030, 1817, 451],
 [524, 1764, 3295, 1487],
 [2775, 1072, 231, 3728, 4269, 3052],
 [4423,
  1595,
  367,
  518,
  1520,
  4930,
  1462,
  1154,
  3523,
  1595,
  367,
  2947,
  2947,
  381,
  1520],
 [1053, 1520, 4564, 1029, 367, 4795, 2240, 380],
 [1520, 4538, 2202, 4219, 4538],
 [3338, 4887, 1230, 440, 3804, 2439, 3599],
 [783],
 [2651, 3758],
 [2730, 2651],
 [3804, 3720],
 [1467],
 [3585],
 [2584, 3244],
 [2651, 1316],
 [802, 4795],
 [418],
 [4483, 728, 4763],
 [4753, 1486, 3000],
 [2651, 3398],
 [3227],
 [4922, 3776],
 [126],
 [624, 1858, 4869, 20

<h3>Embedding</h3>

In [22]:
sent_length = 20
embedded_docs= pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[   0    0    0 ... 2951 3600 1254]
 [   0    0    0 ... 1817 4396 4873]
 [   0    0    0 ... 3289 3286  112]
 ...
 [   0    0    0 ... 3395 4273 4287]
 [   0    0    0 ... 2048 4479 1218]
 [   0    0    0 ... 3395  735 3568]]


In [23]:
embedded_docs[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,   75, 2381, 3355, 2639, 2951, 3600, 1254])

In [24]:
#Creating model
embedding_dim = 100
model = Sequential()
model.add(Embedding(voc_size,embedding_dim,input_length=sent_length))
model.add(Dropout(0.3))
model.add(LSTM(100))
model.add(Dense(1,activation='relu'))
model.add(Dropout(0.3))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 100)           500000    
_________________________________________________________________
module_wrapper (ModuleWrappe (None, 20, 100)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
_________________________________________________________________
module_wrapper_1 (ModuleWrap (None, 1)                 0         
Total params: 580,501
Trainable params: 580,501
Non-trainable params: 0
_________________________________________________________________
None


In [25]:
len(embedded_docs),y.shape

(7613, (7613,))

In [26]:
X_final = np.array(embedded_docs)
y_final=np.array(y)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=42)

In [28]:
model.fit(X_train,y_train,epochs=25,batch_size=200)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<tensorflow.python.keras.callbacks.History at 0x1cbb2a6d8b0>

In [29]:
y_pred=model.predict_classes(X_test)

In [30]:
y_pred

In [31]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

In [32]:
test = pd.read_csv('test.csv')

In [33]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [34]:
test =  test.drop(['keyword','location'],axis=1)

In [35]:
test.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [36]:
corpus=[]
for i in range(len(test)):
    review = text_preprocess(test['text'][i])
    corpus.append(review)
    
onehot_repr = [one_hot(words,voc_size) for words in corpus]

embedded_docs= pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
X_1 = np.array(embedded_docs)
y_pred = model.predict_classes(X_1)
y_pred



array([[0],
       [1],
       [1],
       ...,
       [0],
       [1],
       [0]])

In [37]:
test['target'] = y_pred

In [38]:
test.head()

Unnamed: 0,id,text,target
0,0,Just happened a terrible car crash,0
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
3,9,Apocalypse lighting. #Spokane #wildfires,1
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1


In [39]:
results_pred = test.drop('text',axis=1)
results_pred.reset_index(drop=True)

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,0
3261,10874,1


In [40]:
results_pred.to_csv('results_predv1.csv')

<h4>acheived an accuracy of 73.3 % as compared to 54% by using Tfifd and Multinomial NB classifier</h4>