In [None]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt

In [None]:
data = pd.read_csv("train.csv")

In [None]:
data.head()

In [None]:
data.tail(60)

In [None]:
data[data['keyword'].notnull() & data['location'].notnull()]

In [None]:
data.target.value_counts()

# SMALL ANALYSIS

In [None]:
print("Total % of Negative and Positives are: ")
plt.pie(data['target'].value_counts(),
        labels=['Disaster','Non-Disaster'],
        colors=['r','b'],
       autopct='%.3f%%',
       startangle=180)
plt.show()

In [None]:
keywords = [] #getting all keywords 
keyword_count = [] #getting all keywords count
def count_keywords(x):
    if x not in keywords:
        keywords.append(x)
        keyword_count.append(len(data[data['keyword']==x]))
data.keyword.apply(count_keywords)

In [None]:
plt.figure(figsize=(30,10))
plt.rcParams.update({'font.size':22})
plt.hist(keywords[:30],max(keyword_count[:30]))
plt.xticks(rotation='vertical',color='w')
plt.plot()

### PLOTTING ALL KEYWORDS

In [None]:
plt.figure(figsize=(100,10))
plt.rcParams.update({'font.size':32})
data['keyword'].hist(color='r')
plt.xticks(rotation='vertical',color='w')
plt.plot()

### Removing All NaN Columns

In [None]:
without_nan = data[data['keyword'].notnull() & data['location'].notnull()]

In [None]:
without_nan

In [None]:
lens = without_nan.text.str.len()

In [None]:
print("MEAN is {}".format(np.mean(lens)),
      "Median is {}".format(np.median(lens)),
      "Max length tweet is {}".format(max(lens)),sep='\n')

### Histogram of the lengths of all texts 

In [None]:
plt.figure(figsize=(30,10))
lens.hist()

### Deep Learning AND  Machine Learning Part

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

In [None]:
def RemoveEmojis(x):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'',x)

In [None]:
removed = without_nan.text.apply(RemoveEmojis)

In [None]:
without_nan.text = removed

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split

#### Removing Stop Words

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
stop_words #all stop words

#### Lemmatization with stop Words Removal

In [47]:
lemmer = WordNetLemmatizer()

In [48]:

def RemovingStopWords(text):
    x = []
    tokens = word_tokenize(text)
    for token in tokens:
        lemmered_token = lemmer.lemmatize(token)
        x.append(lemmered_token)
    removed_words = [i for i in x if i not in stop_words]
    return TreebankWordDetokenizer().detokenize(removed_words)

In [50]:
stoped_words = without_nan.text.apply(RemovingStopWords)

In [51]:
stoped_words

31      @ bbcmtd Wholesale Markets ablaze http: //t.co...
32      We always try bring heavy . #metal #RT http: /...
33      # AFRICANBAZE: Breaking news: Nigeria flag set...
34                                     Crying! Set ablaze
35      On plus side LOOK AT THE SKY LAST NIGHT IT WAS...
                              ...                        
7575     On bright side I wrecked http: //t.co/uEa0txRHYs
7577    @ widda16...He's gone . You relax . I thought ...
7579    Three day work've pretty much wrecked hahaha s...
7580    # FX #forex #trading Cramer: Iger's 3 word wre...
7581    @ engineshed Great atmosphere British Lion gig...
Name: text, Length: 5080, dtype: object

In [52]:
without_nan.text=stoped_words

In [53]:
without_nan

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@ bbcmtd Wholesale Markets ablaze http: //t.co...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try bring heavy . #metal #RT http: /...,0
33,50,ablaze,AFRICA,# AFRICANBAZE: Breaking news: Nigeria flag set...,1
34,52,ablaze,"Philadelphia, PA",Crying! Set ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0
...,...,...,...,...,...
7575,10826,wrecked,TN,On bright side I wrecked http: //t.co/uEa0txRHYs,0
7577,10829,wrecked,#NewcastleuponTyne #UK,@ widda16...He's gone . You relax . I thought ...,0
7579,10831,wrecked,"Vancouver, Canada",Three day work've pretty much wrecked hahaha s...,0
7580,10832,wrecked,London,# FX #forex #trading Cramer: Iger's 3 word wre...,0


In [54]:
input_data = without_nan.text #input Data

In [55]:
labels = without_nan.target #input Labels

# Applying on Recurrent Neural Network

In [56]:
tokens = Tokenizer(num_words=10000)
tokens.fit_on_texts(input_data)

In [57]:
padded_text = tokens.texts_to_sequences(input_data) #Transforming each text to a sequence of integers

In [58]:
padded_text

62,
  124,
  153,
  3,
  2,
  1,
  6963],
 [235, 2791, 939, 462, 100, 217, 191, 3, 2, 1, 6964],
 [235, 2791, 939, 462, 100, 217, 191, 3, 2, 1, 6965],
 [235, 2791, 939, 462, 100, 217, 191, 3, 2, 1, 6966],
 [6967,
  6968,
  6969,
  219,
  6970,
  510,
  6971,
  10,
  6972,
  6973,
  10,
  6974,
  104,
  6975,
  6976,
  6977,
  191],
 [100, 191, 3821, 3822, 2275, 133, 81, 6978, 710, 3, 2, 1, 6979],
 [235, 934, 939, 462, 100, 217, 191, 1415, 3, 2, 1, 6980],
 [143, 6, 7, 417, 217, 191, 100, 3, 2, 1, 6981],
 [4, 97, 191, 514, 34, 3823, 3, 2, 1, 6982],
 [235, 1195, 934, 939, 462, 217, 191, 100, 2276, 414, 1088, 263, 6983],
 [235,
  934,
  939,
  462,
  100,
  217,
  191,
  3824,
  6984,
  100,
  1195,
  235,
  3825,
  6985,
  3,
  2,
  1,
  6986],
 [235, 934, 939, 462, 100, 217, 191, 6987, 3, 2, 1, 6988],
 [8,
  6989,
  112,
  1023,
  1680,
  6990,
  6991,
  15,
  6992,
  6993,
  3,
  2,
  1,
  6994,
  15,
  2777,
  475],
 [3826, 566, 3827, 429, 1635, 106, 479, 72, 2124, 206, 429, 6995, 6996]

In [59]:
sent = without_nan.text

In [60]:
r = []
for index,i in enumerate(sent):
    #print(i)
    x = word_tokenize(i)
    r.append(len(x))
    
print("The max length of the sentence is {}".format(max(r)))
max_len = max(r)

The max length of the sentence is 70


In [61]:
padded_text = pad_sequences(padded_text,maxlen=max_len)

In [62]:
padded_text

array([[   0,    0,    0, ...,    2,    1, 5193],
       [   0,    0,    0, ...,    2,    1, 5194],
       [   0,    0,    0, ...,    2,    1, 5196],
       ...,
       [   0,    0,    0, ..., 5126,   54,   19],
       [   0,    0,    0, ...,    3,    2,    1],
       [   0,    0,    0, ...,    3,    2,    1]])

In [63]:
padded_text

array([[   0,    0,    0, ...,    2,    1, 5193],
       [   0,    0,    0, ...,    2,    1, 5194],
       [   0,    0,    0, ...,    2,    1, 5196],
       ...,
       [   0,    0,    0, ..., 5126,   54,   19],
       [   0,    0,    0, ...,    3,    2,    1],
       [   0,    0,    0, ...,    3,    2,    1]])

### Label Encoding

In [64]:
from keras.utils import to_categorical

In [65]:
encoded_label = to_categorical(labels)

In [66]:
encoded_label

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [67]:
xtrain,xtest,ytrain,ytest = train_test_split(padded_text,encoded_label,test_size=0.2,random_state=8)

In [68]:
xtrain.shape,xtest.shape

((4064, 70), (1016, 70))

In [69]:
ytrain.shape,ytest.shape

((4064, 2), (1016, 2))

# Creating the BaseLine

In [71]:
from keras.layers import SimpleRNN,Embedding,Dense,Dropout,Flatten
from keras.models import Sequential


In [90]:
def BaseLine():
    model = Sequential()
    model.add(Embedding(10000,64,input_length=70))
    model.add(Flatten())
    model.add(Dense(2,activation='sigmoid'))
    model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['acc'])
    return model

In [91]:
xmodel = BaseLine()

In [92]:
xmodel.fit(xtrain,ytrain,epochs=30,batch_size=128)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.callbacks.History at 0x152b84eed88>

In [93]:
xmodel.evaluate(xtest,ytest)



[0.6956437767490627, 0.7598425149917603]