In [None]:
import numpy as np
import pandas as pd
import spacy
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the disaster-tweets dataset and preview its first rows.
TWEETS_CSV = "../input/disaster-tweets/tweets.csv"
data = pd.read_csv(TWEETS_CSV)
data.head()

FileNotFoundError: ignored

In [None]:
# Show the raw text of the row whose index label is 0.
data.text[0]

In [None]:
# (number of rows, number of columns) of the loaded frame.
data.shape

# EDA

## 1. Number of characters in tweets

In [None]:
# Character count of every tweet, stored as a new column.
data["numberOfchars"] = data["text"].str.len()

In [None]:
# Overall distribution of tweet lengths (in characters).
data["numberOfchars"].hist(figsize=(14, 8))

In [None]:
# Character-count distribution split by target class, with a KDE overlay.
plt.figure(figsize=(14, 8))
sns.histplot(data=data, x="numberOfchars", hue="target", kde=True)

### Remarks:
- All tweets have between 20 - 140 characters
- The tweets referencing real disasters often have between 100 - 130 characters

## 2. Number of words

In [None]:
# Whitespace-delimited word count of every tweet, stored as a new column.
data["numberOfWords"] = data["text"].str.split().map(len)

In [None]:
# Word-count distribution split by target class, with a KDE overlay.
plt.figure(figsize=(14, 8))
sns.histplot(data=data, x="numberOfWords", hue="target", kde=True)

### Remarks:
- All tweets have between 3 and 30 words, but most have about 15-25

## 3. Location

In [None]:
# Frequency of each distinct location value (value_counts drops NaN by default).
data.location.value_counts()

In [None]:
# How many tweets have a missing location (True) versus a filled one (False).
data["location"].isna().value_counts()

## 4. Target

In [None]:
# Class balance: number of tweets per target value.
plt.figure(figsize=(12, 8))
sns.countplot(data=data, x="target")

# TEXT PROCESSING

In [None]:
#preprocessing

import string
import re

def remove_url(text):
    """Return ``text`` with every ``http://``, ``https://`` or ``www.`` link removed.

    A link is matched up to the next whitespace character; the surrounding
    whitespace itself is left in place.
    """
    return re.sub(r"https?://\S+|www\.\S+", r"", text)

def remove_ablazeWord(text):
    """Return ``text`` with every (case-sensitive) occurrence of "ablaze" deleted."""
    pieces = text.split("ablaze")
    return "".join(pieces)

def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``.

    Only ASCII punctuation is affected; other symbols (e.g. curly quotes)
    are kept.
    """
    return "".join(ch for ch in text if ch not in string.punctuation)

def remove_tripleDot(text):
    """Return ``text`` with the single-character ellipsis (U+2026) removed.

    Three separate ASCII dots ("...") are not touched by this function.
    """
    return text.translate({ord('\u2026'): None})

def remove_emojis(text):
    """Return ``text`` with characters from four emoji code-point ranges removed.

    Only the ranges listed below are stripped; symbols outside them are kept.
    """
    emoji_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    return re.sub("[" + emoji_ranges + "]+", r'', text, flags=re.UNICODE)

In [None]:
# Apply the cleaning steps in a fixed order: URLs are stripped first, while
# their punctuation is still intact for the URL pattern to match.
for cleaner in (remove_url, remove_punct, remove_tripleDot, remove_emojis, remove_ablazeWord):
    data["text"] = data["text"].map(cleaner)

In [None]:
#remove stop word

from spacy.lang.en import STOP_WORDS

# spaCy English pipeline; remove_stopword below calls only its tokenizer.
nlp = spacy.load('en_core_web_sm')

# Set copy of spaCy's English stop-word list, used for membership tests.
stopwords = set(STOP_WORDS)

def remove_stopword(text):
    """Drop English stop words from ``text`` and normalise what remains.

    The text is split with the spaCy tokenizer. Tokens whose lower-cased form
    is in ``stopwords`` are discarded; every remaining token is replaced by
    its lower-cased lemma.

    Args:
        text: The string to filter.

    Returns:
        The kept tokens joined by single spaces.
    """
    kept_tokens = [
        # Only the tokenizer is run here, not the full pipeline, so `lemma_`
        # may be empty (NOTE(review): depends on the installed spaCy version);
        # fall back to the raw token text so no word is silently erased.
        (word.lemma_ or word.text).lower()
        for word in nlp.tokenizer(text)
        if word.text.lower() not in stopwords
    ]
    # The original joined an undefined name (`filtered`), which raised NameError.
    return " ".join(kept_tokens)

In [None]:
# Replace every tweet by its stop-word-free, lemmatised form.
data["text"] = data["text"].map(remove_stopword)

# MODEL

## 1. SVM

In [None]:
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Features are the cleaned tweet texts; labels are the target column.
X, y = data["text"], data["target"]

In [None]:
# Exploratory unigram TF-IDF over the whole corpus, only to inspect its shape:
# terms must appear in at least 10 documents and in at most 90% of them.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    min_df=10,
    max_df=0.9,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(X)
dense = tfidf_matrix.todense()
dense.shape

In [None]:
# Hold out 20% of the tweets for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Linear SVM on unigram + bigram TF-IDF features.
# TfidfVectorizer already counts terms AND applies TF-IDF weighting, so the
# extra TfidfTransformer step that used to follow it re-weighted and
# re-normalised the features a second time; it has been removed.
# `gamma` was dropped as well: it has no effect with kernel="linear".
svm = make_pipeline(
    TfidfVectorizer(ngram_range=(1, 2)),
    SVC(kernel="linear", C=2, random_state=0),
)

In [None]:
def evaluate(model):
    """Fit ``model`` on the training split and print its test-set metrics.

    Relies on the notebook-level ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` for scoring. Prints the confusion matrix followed by
    the classification report; nothing is returned.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    for report in (confusion_matrix, classification_report):
        print(report(y_true=y_test, y_pred=predictions))

    # Disabled learning-curve plot, kept for reference:
    # N, train_score, val_score = learning_curve(model, X_train, y_train, cv=4)
    # plt.figure(figsize=(12,8))
    # plt.plot(N, train_score.mean(axis=1), label='train score')
    # plt.plot(N, val_score.mean(axis=1), label='validation score')
    # plt.legend()

In [None]:
# Fit the SVM pipeline and print its confusion matrix and classification report.
evaluate(svm)

## 2. Build NN for text classification

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.layers import Dense, Dropout, concatenate, Input, Embedding, SpatialDropout1D, GlobalAvgPool1D, GlobalMaxPool1D
from tensorflow.keras.models import Model

In [None]:
# Sequence settings: every tweet becomes exactly `max_length` token ids,
# padded or truncated at the end.
max_length = 50
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"

# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Shared padding options so train and test are shaped identically.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)

train_padded = pad_sequences(train_sequences, **pad_options)
test_padded = pad_sequences(test_sequences, **pad_options)

In [None]:
# Width of each learned word vector.
embedding_dim = 16
# One more than the number of known words
# (presumably because id 0 is reserved for padding; confirm against the Keras Tokenizer docs).
vocab_size = 1 + len(word_index)

In [None]:
# Display the vocabulary size used as the embedding layer's input dimension.
vocab_size

In [None]:
# Make sure features and labels are plain NumPy arrays before training.
train_padded, test_padded = np.array(train_padded), np.array(test_padded)
train_labels, test_labels = np.array(y_train), np.array(y_test)

In [None]:
# Sanity check: shapes of the train features, test features and train labels.
for arr in (train_padded, test_padded, train_labels):
    print(arr.shape)

In [None]:
# Bag-of-embeddings binary classifier: token ids -> embedding vectors ->
# average over the sequence -> two ReLU hidden layers -> sigmoid output.
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    keras.layers.GlobalAvgPool1D(),
    keras.layers.Dense(32, activation='relu'),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dense(1, activation='sigmoid')
])

# `learning_rate` is the supported keyword; the legacy alias `lr` is
# deprecated and is rejected by recent Keras releases.
optRm = keras.optimizers.RMSprop(learning_rate=0.001, rho=0.9)

model.compile(loss="binary_crossentropy", optimizer=optRm, metrics=["accuracy"])

model.summary()


In [None]:
# Train for 20 epochs, holding out 10% of the training data for validation.
history = model.fit(
    train_padded,
    train_labels,
    epochs=20,
    batch_size=32,
    validation_split=.1,
)

In [None]:
import matplotlib.pyplot as plt
# Plot every metric recorded during training, one curve per metric.
history_frame = pd.DataFrame(history.history)
history_frame.plot(figsize=(14, 8))
plt.grid(True)

In [None]:
# Turn the predicted probabilities into hard 0/1 labels with a 0.5 threshold.
y_pred = model.predict(test_padded)
y_final = (y_pred > .5).astype(int).ravel()
y_final

In [None]:
# Test-set metrics for the neural network, in the same format as for the SVM.
print(confusion_matrix(y_test, y_final))
print(classification_report(y_test, y_final))