In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder


import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Labelled training tweets, read straight from the project's GitHub repository.
TRAIN_CSV_URL = 'https://raw.githubusercontent.com/pj423/Disaster-Tweets/main/train.csv'
train_df = pd.read_csv(TRAIN_CSV_URL)
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Per-id targets for the test set; passed as y_true when scoring predictions later.
# NOTE(review): presumably the ground-truth labels — confirm the file's provenance.
TEST_LABELS_CSV_URL = 'https://raw.githubusercontent.com/pj423/Disaster-Tweets/main/Test%20Predictions.csv'
correct_predictions = pd.read_csv(TEST_LABELS_CSV_URL)
correct_predictions.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [None]:
# Unlabelled test tweets (same columns as the training file, minus `target`).
TEST_CSV_URL = 'https://raw.githubusercontent.com/pj423/Disaster-Tweets/main/test.csv'
test_df = pd.read_csv(TEST_CSV_URL)
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

In [None]:
# Download the BERT tokenization helper so that `import tokenization` resolves
# from the working directory.
# NOTE(review): the URL points at the moving `master` branch; pin a commit or
# tag so the notebook keeps working if the file is moved or changed upstream.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
!pip3 install sentencepiece

import tokenization

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.1 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [None]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw texts into fixed-length BERT input arrays.

    Each text is tokenised with ``tokenizer.tokenize``, truncated so that the
    two special tokens still fit, wrapped as ``[CLS] ... [SEP]`` and
    right-padded with id 0 up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row, special tokens included.
            Must be at least 2 so that ``[CLS]`` and ``[SEP]`` fit.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number of texts, max_len)``. ``masks`` is 1 on real tokens
        and 0 on padding; ``segment_ids`` is all zeros (single-segment input).
        An empty ``texts`` yields arrays of shape ``(0, max_len)``.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    if max_len < 2:
        # With max_len < 2 the slice below would use a negative bound and the
        # rows would come out with inconsistent lengths.
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        token_ids = list(tokenizer.convert_tokens_to_ids(input_sequence))

        all_tokens.append(token_ids + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _to_matrix(rows):
        # Explicit dtype and shape keep empty input two-dimensional and integer.
        return np.array(rows, dtype=int).reshape(len(rows), max_len)

    return _to_matrix(all_tokens), _to_matrix(all_masks), _to_matrix(all_segments)

In [None]:
def build_model(bert_layer, max_len=512, learning_rate=2e-6):
    """Build and compile a binary classifier on top of a BERT hub layer.

    Args:
        bert_layer: Callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
            whose second element is the per-token sequence output.
        max_len: Length of the token sequences fed to the model; must match
            the ``max_len`` used when encoding the inputs.
        learning_rate: Step size for the Adam optimizer. Defaults to the
            previously hard-coded 2e-6.

    Returns:
        A compiled ``Model`` with three int32 inputs of shape ``(max_len,)``
        and one sigmoid output, trained with binary cross-entropy and
        reporting accuracy.
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # The first returned tensor is not used; only the per-token output is.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Position 0 holds the [CLS] token prepended by bert_encode.
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `lr` is the deprecated spelling of this argument; `learning_rate` is the
    # supported keyword and avoids the deprecation warning.
    model.compile(Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [None]:
%%time
# TF-Hub handle of the uncased English BERT; per the handle name it has
# 24 layers, hidden size 1024 and 16 attention heads.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
# trainable=True so the layer's weights are updated when the classifier is fitted.
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 40.5 s, sys: 8.95 s, total: 49.5 s
Wall time: 52.4 s


In [None]:
# Build the tokenizer from the vocabulary file and lower-casing flag stored
# inside the loaded hub module, so tokenisation agrees with the model.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Encode both splits into (token ids, mask, segment ids) triples of length 160;
# the same length has to be passed to build_model.
train_input = bert_encode(train_df["text"].values, tokenizer, max_len=160)
test_input = bert_encode(test_df["text"].values, tokenizer, max_len=160)
train_labels = train_df["target"].values

In [None]:
# Attach the sigmoid classification head to the BERT layer; 160 matches the
# sequence length used when encoding the inputs.
model = build_model(bert_layer, max_len=160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 160)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 160)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 160)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 1024),       335141889   ['input_word_ids[0][0]',         
                                 (None, 160, 1024)]               'input_mask[0][0]',         

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Fine-tune for 3 epochs with batches of 4, holding out 20% of the rows for validation.
# NOTE(review): no random seed is set in any visible cell, so results will
# vary between runs — consider seeding TensorFlow and NumPy.
# NOTE(review): Keras takes validation_split from the last rows before
# shuffling; if train.csv is ordered, the hold-out may be unrepresentative —
# TODO confirm.
train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=3,
    batch_size=4
)

# Persist the fine-tuned model to the working directory.
model.save('model.h5')

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [None]:
# Sigmoid scores for the encoded test tweets; rounded to 0/1 labels when the
# submission is filled in.
test_pred = model.predict(test_input)

In [None]:
## filling submission.csv
submission = pd.read_csv("https://raw.githubusercontent.com/pj423/Disaster-Tweets/main/sample_submission.csv")

# Guard against a row-count mismatch between the template and the predictions.
assert len(submission) == len(test_pred), "prediction count does not match submission rows"

# The model has a single output unit, so predictions arrive as an (n, 1) array;
# flatten explicitly instead of relying on pandas to accept a 2-D value for one
# column, then round the sigmoid scores to 0/1 labels.
submission['target'] = test_pred.ravel().round().astype(int)
submission.to_csv('submission.csv', index=False)

In [None]:
import sklearn.metrics

# Macro-averaged F1 of the submitted labels, with correct_predictions as y_true.
# The two columns are compared row by row, so both frames must list the test
# ids in the same order.
sklearn.metrics.f1_score(correct_predictions["target"], submission["target"], average='macro')

0.8274263269609122

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,  TfidfVectorizer, HashingVectorizer

# NOTE(review): Xtrain, Xtest and Y were not defined in any visible cell, so
# this section raised NameError on a fresh kernel. They are bound here to the
# raw tweet text and the labels. If the recorded scores were produced from
# cleaned text (the nltk imports at the top hint at that), insert that
# preprocessing here — TODO confirm.
Xtrain = train_df['text']
Xtest = test_df['text']
Y = train_df['target']

vectorizer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the training tweets only, and
# vectorise them in the same pass instead of fitting and then transforming again.
x_train = vectorizer.fit_transform(Xtrain)
x_test = vectorizer.transform(Xtest)

# get_feature_names() was removed in scikit-learn 1.2; the replacement returns
# an ndarray of terms rather than a list.
keyword = vectorizer.get_feature_names_out()



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import ComplementNB

# Sweep the smoothing parameter of Complement Naive Bayes across eight orders
# of magnitude using GridSearchCV's default cross-validation.
param_grid = {
    'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
}
nb = ComplementNB()
clf = GridSearchCV(estimator=nb, param_grid=param_grid, verbose=10)
clf.fit(x_train, Y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5; 1/8] START alpha=1e-05.................................................
[CV 1/5; 1/8] END ..................alpha=1e-05;, score=0.659 total time=   0.0s
[CV 2/5; 1/8] START alpha=1e-05.................................................
[CV 2/5; 1/8] END ..................alpha=1e-05;, score=0.558 total time=   0.0s
[CV 3/5; 1/8] START alpha=1e-05.................................................
[CV 3/5; 1/8] END ..................alpha=1e-05;, score=0.594 total time=   0.0s
[CV 4/5; 1/8] START alpha=1e-05.................................................
[CV 4/5; 1/8] END ..................alpha=1e-05;, score=0.642 total time=   0.0s
[CV 5/5; 1/8] START alpha=1e-05.................................................
[CV 5/5; 1/8] END ..................alpha=1e-05;, score=0.620 total time=   0.0s
[CV 1/5; 2/8] START alpha=0.0001................................................
[CV 1/5; 2/8] END .................alpha=0.0001;,

GridSearchCV(estimator=ComplementNB(),
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10,
                                   100]},
             verbose=10)

In [None]:
# Show the search as a ranked table instead of dumping the raw cv_results_
# dict of arrays, which is long and hard to scan.
(
    pd.DataFrame(clf.cv_results_)
    [['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
    .head(10)
)

{'mean_fit_time': array([0.00517864, 0.00490866, 0.00607986, 0.0062387 , 0.00624065,
        0.00655684, 0.00618439, 0.00627298]),
 'mean_score_time': array([0.00096292, 0.00092211, 0.00116239, 0.00092988, 0.00093679,
        0.00095096, 0.00093193, 0.00090752]),
 'mean_test_score': array([0.61434819, 0.61723792, 0.61986535, 0.62498811, 0.63208188,
        0.66899439, 0.68646656, 0.67976709]),
 'param_alpha': masked_array(data=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 1e-05},
  {'alpha': 0.0001},
  {'alpha': 0.001},
  {'alpha': 0.01},
  {'alpha': 0.1},
  {'alpha': 1},
  {'alpha': 10},
  {'alpha': 100}],
 'rank_test_score': array([8, 7, 6, 5, 4, 3, 1, 2], dtype=int32),
 'split0_test_score': array([0.65856861, 0.66119501, 0.66382141, 0.67038739, 0.66382141,
        0.67367039, 0.67038739, 0.65331582]),
 'split1_test_score': array([0.5

In [None]:
# Mean cross-validated score of the best candidate.
# `clf` is rebound by every grid-search cell, so this reflects whichever
# search ran last (here: the ComplementNB search, if cells ran in order).
clf.best_score_

0.6864665578950183

In [None]:
# Hyper-parameters of the best-scoring candidate from the most recent search.
# These are copied by hand into the ensemble definition further down.
clf.best_params_

{'alpha': 10}

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import BernoulliNB

# Same smoothing sweep as for ComplementNB, this time with Bernoulli Naive Bayes.
# BernoulliNB binarises its input at 0.0 by default, so the TF-IDF weights act
# as word presence/absence flags here.
param_grid = {
    'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
}
nb = BernoulliNB()
clf = GridSearchCV(estimator=nb, param_grid=param_grid, verbose=10)
clf.fit(x_train, Y)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5; 1/8] START alpha=1e-05.................................................
[CV 1/5; 1/8] END ..................alpha=1e-05;, score=0.700 total time=   0.0s
[CV 2/5; 1/8] START alpha=1e-05.................................................
[CV 2/5; 1/8] END ..................alpha=1e-05;, score=0.613 total time=   0.0s
[CV 3/5; 1/8] START alpha=1e-05.................................................
[CV 3/5; 1/8] END ..................alpha=1e-05;, score=0.640 total time=   0.0s
[CV 4/5; 1/8] START alpha=1e-05.................................................
[CV 4/5; 1/8] END ..................alpha=1e-05;, score=0.679 total time=   0.0s
[CV 5/5; 1/8] START alpha=1e-05.................................................
[CV 5/5; 1/8] END ..................alpha=1e-05;, score=0.675 total time=   0.0s
[CV 1/5; 2/8] START alpha=0.0001................................................
[CV 1/5; 2/8] END .................alpha=0.0001;,

GridSearchCV(estimator=BernoulliNB(),
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10,
                                   100]},
             verbose=10)

In [None]:
# Ranked summary of the search; the raw cv_results_ dict is too long to read
# and gets truncated in the saved output.
(
    pd.DataFrame(clf.cv_results_)
    [['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
    .head(10)
)

{'mean_fit_time': array([0.0072144 , 0.0070652 , 0.00836539, 0.00714288, 0.00765791,
        0.00683041, 0.00677466, 0.00750136]),
 'mean_score_time': array([0.00308619, 0.0030014 , 0.0038712 , 0.00287156, 0.0028707 ,
        0.00297899, 0.00283594, 0.00353665]),
 'mean_test_score': array([0.66124152, 0.66334332, 0.66872847, 0.67779083, 0.69434152,
        0.73467049, 0.5703402 , 0.5703402 ]),
 'param_alpha': masked_array(data=[1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'alpha': 1e-05},
  {'alpha': 0.0001},
  {'alpha': 0.001},
  {'alpha': 0.01},
  {'alpha': 0.1},
  {'alpha': 1},
  {'alpha': 10},
  {'alpha': 100}],
 'rank_test_score': array([6, 5, 4, 3, 2, 1, 7, 7], dtype=int32),
 'split0_test_score': array([0.69993434, 0.69862114, 0.70453053, 0.71569271, 0.72948129,
        0.72816809, 0.56992777, 0.56992777]),
 'split1_test_score': array([0.6

In [None]:
# Best mean cross-validated score of the most recently fitted search
# (the BernoulliNB one when the notebook is run top to bottom).
clf.best_score_

0.7346704883421353

In [None]:
# Winning alpha for the most recently fitted search.
clf.best_params_

{'alpha': 1}

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Elastic-net logistic regression: search the inverse regularisation strength C
# jointly with the L1/L2 mixing ratio (0 = pure L2, 1 = pure L1).
# NOTE(review): the recorded run selected C=1, the upper edge of this grid —
# consider extending the range.
param_grid = {
    'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1],
    'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1],
}
lr = LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10000)
clf = GridSearchCV(estimator=lr, param_grid=param_grid, verbose=8)
clf.fit(x_train, Y)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV 1/5] END ...............C=1e-05, l1_ratio=0;, score=0.570 total time=   4.2s
[CV 2/5] END ...............C=1e-05, l1_ratio=0;, score=0.571 total time=   4.2s
[CV 3/5] END ...............C=1e-05, l1_ratio=0;, score=0.571 total time=   4.9s
[CV 4/5] END ...............C=1e-05, l1_ratio=0;, score=0.570 total time=   4.6s
[CV 5/5] END ...............C=1e-05, l1_ratio=0;, score=0.570 total time=   3.8s
[CV 1/5] END .............C=1e-05, l1_ratio=0.2;, score=0.570 total time=   0.0s
[CV 2/5] END .............C=1e-05, l1_ratio=0.2;, score=0.571 total time=   0.0s
[CV 3/5] END .............C=1e-05, l1_ratio=0.2;, score=0.571 total time=   0.0s
[CV 4/5] END .............C=1e-05, l1_ratio=0.2;, score=0.570 total time=   0.0s
[CV 5/5] END .............C=1e-05, l1_ratio=0.2;, score=0.570 total time=   0.0s
[CV 1/5] END .............C=1e-05, l1_ratio=0.4;, score=0.570 total time=   0.0s
[CV 2/5] END .............C=1e-05, l1_ratio=0.4

GridSearchCV(estimator=LogisticRegression(max_iter=10000, penalty='elasticnet',
                                          solver='saga'),
             param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                         'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1]},
             verbose=8)

In [None]:
# Top candidates as a ranked table; with 36 parameter combinations the raw
# cv_results_ dict is unreadable.
(
    pd.DataFrame(clf.cv_results_)
    [['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
    .head(10)
)

{'mean_fit_time': array([4.33831973, 0.03424616, 0.03567758, 0.03633251, 0.03265805,
        0.00617938, 0.43893971, 0.05525279, 0.02367101, 0.02308707,
        0.01694655, 0.00626683, 0.16498985, 0.03520236, 0.02527881,
        0.01218023, 0.01199293, 0.01062803, 0.06663151, 0.07907577,
        0.02753706, 0.02548952, 0.02452688, 0.02351961, 0.05721574,
        0.29757228, 0.14894733, 0.09265552, 0.07607145, 0.06478853,
        0.06270533, 2.69016237, 0.91057253, 0.60573001, 0.72980862,
        0.79112759]),
 'mean_score_time': array([0.00085006, 0.00065002, 0.00068202, 0.00073719, 0.0006875 ,
        0.00055418, 0.00082407, 0.0007081 , 0.00057721, 0.00063725,
        0.00059443, 0.00057654, 0.00081296, 0.00064654, 0.00061574,
        0.00053616, 0.00053525, 0.00052004, 0.00067282, 0.00070577,
        0.00060897, 0.0005971 , 0.00079021, 0.00064621, 0.00076513,
        0.00082326, 0.00079675, 0.00096035, 0.00082679, 0.00081115,
        0.00080619, 0.00085959, 0.00084605, 0.00080686, 0.

In [None]:
# Best mean cross-validated score of the most recently fitted search
# (the logistic-regression one when cells are run in order).
clf.best_score_

0.6880417910911361

In [None]:
# Winning C / l1_ratio pair for the most recently fitted search.
clf.best_params_

{'C': 1, 'l1_ratio': 0}

In [None]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

# gamma only influences the RBF kernel, so the linear kernel gets its own
# sub-grid. Crossing it with gamma refits the same linear model once per gamma
# value (the recorded linear scores were identical across gammas).
param_grid = [
    {'kernel': ['rbf'],
     'C': [0.1, 1, 10, 100],
     'gamma': [0.1, 0.01, 0.001, 0.0001]},
    {'kernel': ['linear'],
     'C': [0.1, 1, 10, 100]},
]
# Named `svc` so the estimator does not shadow the imported `svm` module.
svc = svm.SVC()
clf = GridSearchCV(svc, param_grid, verbose = 5)
clf.fit(x_train, Y)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.570 total time=   7.9s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.571 total time=   7.8s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.571 total time=   7.7s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.570 total time=   7.7s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.570 total time=   7.9s
[CV 1/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.657 total time=   6.8s
[CV 2/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.652 total time=   6.9s
[CV 3/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.628 total time=   6.9s
[CV 4/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.641 total time=   7.0s
[CV 5/5] END ...C=0.1, gamma=0.1, kernel=linear;, score=0.729 total time=   7.0s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.570 total time=   7.7s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 10, 100],
                         'gamma': [0.1, 0.01, 0.001, 0.0001],
                         'kernel': ['rbf', 'linear']},
             verbose=5)

In [None]:
# Ranked summary of the SVC search instead of the raw cv_results_ dict.
(
    pd.DataFrame(clf.cv_results_)
    [['params', 'mean_test_score', 'std_test_score', 'rank_test_score']]
    .sort_values('rank_test_score')
    .head(10)
)

{'mean_fit_time': array([6.40493445, 5.6417491 , 6.34643669, 5.68657789, 6.30245152,
        5.67310791, 6.31046958, 5.73208518, 5.97121248, 4.93859138,
        6.54622622, 4.93134656, 6.45169287, 4.95906544, 6.38460007,
        4.97153788, 6.63813639, 7.21869564, 5.96417341, 7.19644732,
        6.58192391, 7.22077875, 6.53332729, 7.20373111, 7.76366792,
        7.34228544, 7.04257636, 7.31776237, 5.96022887, 7.31590872,
        6.64047542, 7.2982048 ]),
 'mean_score_time': array([1.3870944 , 1.26652832, 1.39330344, 1.26482987, 1.38143954,
        1.27042389, 1.37322521, 1.28495569, 1.2689034 , 1.03741856,
        1.41938171, 1.0427968 , 1.40571504, 1.04972863, 1.39564595,
        1.05020504, 1.16680465, 1.05336127, 1.27257361, 1.05243344,
        1.44319634, 1.05370626, 1.42938995, 1.05231147, 1.1901134 ,
        1.04553905, 1.12876396, 1.03786802, 1.28021193, 1.0411612 ,
        1.44850602, 1.04008055]),
 'mean_test_score': array([0.5703402 , 0.66124359, 0.5703402 , 0.66124359, 0.570

In [None]:
# Estimator configured with the best-scoring parameters from the most recent
# search (GridSearchCV refits it on the full training data by default).
clf.best_estimator_

SVC(C=100, gamma=0.001)

In [None]:
# Best mean cross-validated score of the most recently fitted search
# (the SVC one when cells are run in order).
clf.best_score_

0.6827877063303547

In [None]:
# Winning parameter combination for the most recently fitted search.
# NOTE(review): the recorded run selected C=100, the upper edge of the grid —
# consider extending the range.
clf.best_params_

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier

# Base estimators for the voting ensemble, configured with the best
# hyper-parameters reported by the grid searches earlier in the notebook.
# NOTE(review): BernoulliNB had the highest recorded CV score (0.735 at
# alpha=1) yet is not part of the ensemble — intentional?
model1 = LogisticRegression(C = 1, l1_ratio = 0, max_iter = 10000, penalty = 'elasticnet', solver = 'saga')
model2 = ComplementNB(alpha = 10)
model3 = SVC(C = 100, gamma = 0.001, kernel = 'rbf')
# model4 = RandomForestClassifier(n_estimators=500)

In [None]:
# Majority-vote ensemble: with voting='hard' each estimator contributes one
# class vote per sample and the most frequent label wins.
final_model = VotingClassifier(estimators=[('LR', model1), ('NB', model2), ('SVC',model3)],  voting='hard')
final_model.fit(x_train, Y)

VotingClassifier(estimators=[('LR',
                              LogisticRegression(C=1, l1_ratio=0,
                                                 max_iter=10000,
                                                 penalty='elasticnet',
                                                 solver='saga')),
                             ('NB', ComplementNB(alpha=10)),
                             ('SVC', SVC(C=100, gamma=0.001))])

In [None]:
# Hard class labels from the voting ensemble for the TF-IDF test matrix.
pred = final_model.predict(x_test)

## filling submission.csv
# NOTE: this rebinds `submission`, replacing the frame built from the BERT
# predictions earlier in the notebook.
submission = pd.read_csv("https://raw.githubusercontent.com/pj423/Disaster-Tweets/main/sample_submission.csv")

submission["target"] = pred
# submission.to_csv("voting_ensemble.csv", index=False)

submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,1
3,9,0
4,11,1


In [None]:
import sklearn.metrics

# Macro-averaged F1 of the voting-ensemble labels, with correct_predictions as
# y_true. The stray leading indent on this call was removed: an indented
# statement after an unindented first line raises IndentationError.
sklearn.metrics.f1_score(correct_predictions["target"], submission["target"], average='macro')

0.7728059085207355