## BaggingClassifier + TF-IDF 

In [1]:
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
stop_words = stopwords.words('english')

In [2]:
training_data = pd.read_csv('train.csv')

#### Combining the 3 columns ( keyword + location + text ) - filling the NAN by a blank or ''

In [3]:
training_data['text'] = training_data['keyword'].fillna('') + training_data['location'].fillna('')  \
                        + training_data['text'].fillna('')
training_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
training_data = training_data.drop(columns=['id','keyword','location'],axis=1)
training_data.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


#### Data Cleaning step involves tokenizing, removing the stopwords and removing non-alphanumeric tokens

In [5]:
def clean_data(text):
    tokens = text.split()
    no_stopwords = [x for x in tokens if x not in stop_words]
    no_nonalphanum = [x.lower() for x in no_stopwords if x.isalnum()]
    return ' '.join(no_nonalphanum)

In [6]:
training_data['text'] = training_data['text'].apply(clean_data)
print(training_data.shape)
training_data.head()

(7613, 2)


Unnamed: 0,text,target
0,our deeds reason may allah forgive us,1
1,forest fire near la ronge canada,1
2,all residents asked notified no evacuation she...,1
3,people receive evacuation orders california,1
4,just got sent photo ruby smoke pours school,1


In [7]:
test_data = pd.read_csv('test.csv')
test_id = test_data['id']
test_data['text'] = test_data['keyword'].fillna('') + test_data['location'].fillna('')  \
                        + test_data['text'].fillna('')
test_data = test_data.drop(columns=['id','keyword','location'],axis=1)
test_data['text'] = test_data['text'].apply(clean_data)
print(test_data.shape)
test_data.head()

(3263, 1)


Unnamed: 0,text
0,just happened terrible car crash
1,heard different stay safe
2,forest fire spot geese fleeing across i cannot...
3,apocalypse
4,typhoon soudelor kills 28 china taiwan


In [8]:
import numpy as np
import tensorflow_hub as hub
import tensorflow as tf

#### download pre-trained word embeddings model

In [9]:
embedding = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"

In [10]:
hub_layer = hub.KerasLayer(embedding, input_shape=[], 
                           dtype=tf.string, trainable=True)

#### creating neural network of 4 layers, having a word embeddings layer

In [11]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 20)                400020    
_________________________________________________________________
dense (Dense)                (None, 16)                336       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 400,373
Trainable params: 400,373
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [13]:
x_train,x_test,y_train,y_test = train_test_split(training_data.text,training_data.target)

In [14]:
history = model.fit(x_train,y_train,epochs=50)

Train on 5709 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [15]:
results = model.evaluate(x_test,y_test, verbose=2)

1904/1904 - 0s - loss: 1.8421 - accuracy: 0.7206


In [16]:
predictions = model.predict_classes(x_test)
accuracy_score(y_test,predictions)

0.7205882352941176

In [17]:
history = model.fit(training_data.text,training_data.target,epochs=50)

Train on 7613 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [18]:
test_data = test_data['text']
predictions = model.predict_classes(test_data)

In [19]:
predictions = predictions.reshape(-1)
results = pd.DataFrame({
                        'id':test_id,
                        'target':predictions
                        })
results.to_csv('submissions.csv',index=False)

#### Accuracy :---
#####                - Local Accuracy : 73.52 (over split training data)
#####                - Online Accuracy : 72.59 (After fitting over all training data)