In [6]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

## Read in Data

In [7]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample = pd.read_csv('data/sample_submission.csv')

## EDA

In [8]:
pd.DataFrame({'count': train.target.value_counts(), 
              'percentage': train.target.value_counts(normalize=True)})

Unnamed: 0,count,percentage
0,4342,0.57034
1,3271,0.42966


In [9]:
train["target"].value_counts().plot(kind='bar')

<matplotlib.axes._subplots.AxesSubplot at 0x128657e10>

give keyords distinct numbers

In [10]:
def map_keywords(series):
    mapper = {}
    u_series = series.unique()
    for i in range(len(u_series)):
        mapper[u_series[i]] = i
        
    return mapper

train_keyword_map = map_keywords(train.keyword)

train['keyword_num'] = train['keyword'].map(train_keyword_map)
test['keyword_num'] = test['keyword'].map(train_keyword_map)

In [11]:
def text_preprocessing(data):
    #remove whitespace and lower all words
    data = data.apply(lambda x: x.strip().lower())
    #replace digits
    data = data.apply(lambda x: re.sub(r'\d+', '', x))
    #replace punctuation
    data = data.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    #tokenize
    data = data.apply(lambda x : word_tokenize(x))
    #filter out stopwords
    data = data.apply(lambda x: [word for word in x if word not in stop_words])
    #remove inflection and return base word
    lemmatizer = WordNetLemmatizer()
    data = data.apply(lambda x: [lemmatizer.lemmatize(word, pos ='v') for word in x])
    #parts of speech tagging
    #data = data.apply(lambda x: [pos_tag(x)])
    return data

In [12]:
train['pro_text'] = text_preprocessing(train.text)
test['pro_text'] = text_preprocessing(test.text)

In [13]:
train.head()

Unnamed: 0,id,keyword,location,text,target,keyword_num,pro_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0,"[deeds, reason, earthquake, may, allah, forgiv..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,0,"[forest, fire, near, la, ronge, sask, canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,0,"[residents, ask, shelter, place, notify, offic..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0,"[people, receive, wildfires, evacuation, order..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0,"[get, send, photo, ruby, alaska, smoke, wildfi..."


## TFIDF Vectorizer
Matrix of token counts with TF-IDF transformation

In [33]:
vectorizer = TfidfVectorizer()
#joining words and fit transofrming
vector = vectorizer.fit_transform(["".join(i) for i in train["pro_text"]])
vector = vector.todense()
vector = np.concatenate((vector, np.reshape(np.array(train["keyword_num"]), (train.keyword.shape[0],-1))), axis=1)
print(vector.shape)

# vector_test = vectorizer.fit_transform(["".join(i) for i in test["text"]])
vector_test = vectorizer.transform(["".join(i) for i in test["pro_text"]])
vector_test = vector_test.todense()
vector_test = np.concatenate((vector_test, np.reshape(np.array(test["keyword_num"]), (test.keyword.shape[0],-1))), axis=1)
print(vector_test.shape)

(7613, 8037)
(3263, 8037)


In [34]:
xtest

matrix([[0.0, 0.0, 0.0, ..., 0.0, 0.0, 'deluged'],
        [0.0, 0.0, 0.0, ..., 0.0, 0.0, 'violent%20storm'],
        [0.0, 0.0, 0.0, ..., 0.0, 0.0, 'hazard'],
        ...,
        [0.0, 0.0, 0.0, ..., 0.0, 0.0, 'landslide'],
        [0.0, 0.0, 0.0, ..., 0.0, 0.0, 'deaths'],
        [0.0, 0.0, 0.0, ..., 0.0, 0.0, 'detonation']], dtype=object)

split train and test data

In [35]:
xtrain, xtest, ytrain, ytest = train_test_split(vector, train['target'], train_size = 0.75)



In [39]:
from sklearn.svm import LinearSVC,SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

model = LinearSVC(loss="hinge",fit_intercept=False, max_iter=1500)
model = model.fit(xtrain, ytrain) 
predictions = model.predict(xtest)

print("Accuracy score: ", accuracy_score(ytest, predictions))
print("Precision score: ", precision_score(ytest, predictions))
print("Recall score: ", recall_score(ytest, predictions))
print("F1 score : ", f1_score(predictions, ytest))

confusion_matrix(ytest, predictions)

Accuracy score:  0.5724789915966386
Precision score:  0.8888888888888888
Recall score:  0.01932367149758454
F1 score :  0.03782505910165485




array([[1074,    2],
       [ 812,   16]])

In [40]:
# accuracy - TP + TN / (TP + TN + FP + FN)
# precision - TP / (TP + FP)
# recall - TP / (TP + FN)
# f1 score - 2  * (precision * recall ) / (precision + recall)

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

model_lr = LogisticRegression(penalty='l2')
model_lr = model_lr.fit(xtrain, ytrain) 
predictions = model_lr.predict(xtest)

print("Accuracy score: ", accuracy_score(ytest, predictions))
print("Precision score: ", precision_score(ytest, predictions))
print("Recall score: ", recall_score(ytest, predictions))
print("F1 score : ", f1_score(predictions, ytest))

confusion_matrix(ytest, predictions)



Accuracy score:  0.5787815126050421
Precision score:  0.9333333333333333
Recall score:  0.033816425120772944
F1 score :  0.06526806526806526


array([[1074,    2],
       [ 800,   28]])

In [43]:
from xgboost import XGBClassifier

model_xgb = XGBClassifier(metric= 'euclidean', n_neighbors= 3, weights= 'uniform')
model_xgb = model_xgb.fit(xtrain, ytrain) 
predictions = model_xgb.predict(xtest)

print("Accuracy score: ", accuracy_score(ytest, predictions))
print("Precision score: ", precision_score(ytest, predictions))
print("Recall score: ", recall_score(ytest, predictions))
print("F1 score : ", f1_score(predictions, ytest))

confusion_matrix(ytest, predictions)

Accuracy score:  0.6685924369747899
Precision score:  0.7296037296037297
Recall score:  0.3780193236714976
F1 score :  0.49801113762927607


array([[960, 116],
       [515, 313]])

In [46]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
tree = DecisionTreeClassifier(random_state = 11, max_features = "auto", class_weight = "balanced",max_depth = None)

model_ada = AdaBoostClassifier(base_estimator=tree)
model_ada = model_ada.fit(xtrain, ytrain)
predictions = model_ada.predict(xtest)

print("Accuracy score: ", accuracy_score(ytest, predictions))
print("Precision score: ", precision_score(ytest, predictions))
print("Recall score: ", recall_score(ytest, predictions))
print("F1 score : ", f1_score(predictions, ytest))

confusion_matrix(ytest, predictions)

Accuracy score:  0.6465336134453782
Precision score:  0.6962025316455697
Recall score:  0.3321256038647343
F1 score :  0.4497138184791496


array([[956, 120],
       [553, 275]])

In [47]:
model_gb = GradientBoostingClassifier(criterion='friedman_mse', learning_rate= 0.15, 
                                   loss= 'deviance', max_depth= 8, max_features='sqrt', 
                                   min_samples_leaf= 0.15714285714285714, min_samples_split= 0.5, 
                                   n_estimators= 10, subsample=1.0)
model_gb = model_gb.fit(xtrain, ytrain)
predictions = model_gb.predict(xtest)

print("Accuracy score: ", accuracy_score(ytest, predictions))
print("Precision score: ", precision_score(ytest, predictions))
print("Recall score: ", recall_score(ytest, predictions))
print("F1 score : ", f1_score(predictions, ytest))

confusion_matrix(ytest, predictions)

Accuracy score:  0.5651260504201681
Precision score:  0.0
Recall score:  0.0
F1 score :  0.0


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


array([[1076,    0],
       [ 828,    0]])

In [53]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

In [56]:
import tokenization

In [57]:
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [58]:
%%time
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

AttributeError: module 'tensorflow_hub' has no attribute 'KerasLayer'

In [61]:
bert_layer = hub.KerasLayer('https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1', trainable=True)

AttributeError: module 'tensorflow_hub' has no attribute 'KerasLayer'