# Disaster Tweets

In [1]:
# Black code formatter: https://black.readthedocs.io/en/stable/

# Install the nb-black package; stdout is redirected to /dev/null to keep the cell output quiet.
! pip install nb-black > /dev/null

# Load the lab_black IPython extension (presumably shipped by nb-black -- verify).
%load_ext lab_black

[0m

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
# Name of the label column: read from the training frame and written to the submission frame.
TARGET = "target"

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

Best practice is to import all libraries here. However, I will put a few imports further down, where they are first used, so beginners can learn with an "as needed" approach.

In [3]:
import os
import time
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

Creating a few functions that we will reuse in each project.

In [4]:
def read_data(path):
    """Load the competition CSV files stored in ``path``.

    Reads ``train.csv``, ``test.csv`` and ``sample_submission.csv`` from the
    given directory, prints the row and column counts of the train and test
    frames, and hands back all three frames.

    Args:
        path: Directory (string or path-like) that contains the three files.

    Returns:
        Tuple ``(train, test, submission_df)`` of pandas DataFrames.
    """
    root = Path(path)

    # Files are read in the same order as they are returned.
    train, test, submission_df = (
        pd.read_csv(root / name)
        for name in ("train.csv", "test.csv", "sample_submission.csv")
    )

    n_rows, n_cols = train.shape
    print(f"train data: Rows={n_rows}, Columns={n_cols}")
    n_rows, n_cols = test.shape
    print(f"test data : Rows={n_rows}, Columns={n_cols}")

    return train, test, submission_df

In [5]:
def create_submission(
    model_name, target, preds, is_log_target=False, submission_df=None
):
    """Store predictions in a submission frame and save it as a CSV file.

    Args:
        model_name: Suffix for the output file. A non-empty name writes
            ``submission_{model_name}.csv``; an empty (or ``None``) name
            writes ``submission.csv``.
        target: Name of the column that receives the predictions.
        preds: Predictions, one per row of the submission frame.
        is_log_target: When True, ``preds`` are passed through ``np.expm1``
            before being stored (assumes the model was trained on a
            ``log1p``-transformed target -- confirm for your project).
        submission_df: Frame to fill in and save. When omitted, the
            notebook-level ``sample_submission`` is used so existing calls
            keep working; passing the frame explicitly removes the hidden
            dependency on a global that is created in a later cell.

    Returns:
        The first five rows of the filled-in submission frame.

    Note:
        The frame is modified in place: its ``target`` column is overwritten.
    """
    if is_log_target:
        preds = np.expm1(preds)

    # Only fall back to the global when no frame was supplied by the caller.
    frame = sample_submission if submission_df is None else submission_df
    frame[target] = preds

    filename = f"submission_{model_name}.csv" if model_name else "submission.csv"
    frame.to_csv(filename, index=False)

    return frame[:5]

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def show_scores(gt, yhat):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Args:
        gt: Ground-truth labels.
        yhat: Predicted labels, aligned with ``gt``.

    Each metric is printed on its own line, rounded to four decimals.
    Nothing is returned.
    """
    metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("f1", f1_score),
    )

    # Evaluate every metric before printing, so a failing metric prints nothing.
    results = [(label, scorer(gt, yhat)) for label, scorer in metrics]

    for label, value in results:
        print(f"{label}: {value:.4f}")

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data</h1>
</div>

- train.csv - Data used to build our machine learning model
- test.csv - Data we generate predictions for. Does not contain the target variable
- sample_submission.csv - A file in the proper format to submit test predictions

In [7]:
# Load the three competition CSVs; `sample_submission` is the frame create_submission() fills in.
train, test, sample_submission = read_data("../input/nlp-getting-started/")

train data: Rows=7613, Columns=5
test data : Rows=3263, Columns=4


In [8]:
# Peek at five random training rows; the fixed seed shows the same rows on every run.
train.sample(5, random_state=42)

Unnamed: 0,id,keyword,location,text,target
4228,6003,hazardous,United States,JKL issues Hazardous Weather Outlook (HWO) ht...,1
2341,3368,demolition,,No civilian population ever deserves demolitio...,1
2039,2927,danger,Spinning through time.,@riverroaming 'And not too much danger please.',0
6498,9290,sunk,,The Seven Seas - Wreck of the Giannis D. sunk ...,1
4218,5991,hazardous,,http://t.co/7AzE4IoGMe Risk Assessment and Opt...,0


In [9]:
# Only the tweet text is used as model input.
FEATURES = ["text"]

# Copy the feature columns out of the train and test frames.
X, X_test = (frame[FEATURES].copy() for frame in (train, test))

# Labels come straight from the training frame.
y = train[TARGET]

In [10]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the split deterministic.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Display the shape of each piece as a quick sanity check.
tuple(part.shape for part in (X_train, y_train, X_valid, y_valid))

((6090, 1), (6090,), (1523, 1), (1523,))

In [11]:
import nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features built on NLTK's word tokenizer. token_pattern=None is passed
# because the regex pattern is not used once a custom tokenizer is supplied.
tfidf = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)

# Learn vocabulary and IDF weights from the training split only, vectorizing it
# in the same pass so the training tweets are not tokenized twice.
train_sequences = tfidf.fit_transform(X_train.text)

# Validation and test text reuse the vocabulary fitted above.
valid_sequences = tfidf.transform(X_valid.text)
test_sequences = tfidf.transform(X_test.text)

In [12]:
from sklearn import naive_bayes

# Fit a multinomial Naive Bayes classifier on the TF-IDF training matrix.
model = naive_bayes.MultinomialNB().fit(train_sequences, y_train)

# Score the classifier on the hold-out split.
valid_preds = model.predict(valid_sequences)
show_scores(y_valid, valid_preds)

Accuracy: 0.7932
Precision: 0.8678
Recall: 0.6071
f1: 0.7144


In [13]:
# List the distinct labels that appear in the validation predictions.
np.unique(valid_preds)

array([0, 1])

In [14]:
# Predict labels for the TF-IDF matrix built from the test text.
test_preds = model.predict(test_sequences)

In [15]:
# An empty model name writes "submission.csv"; the returned first rows are shown as the cell output.
create_submission("", TARGET, test_preds)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,0
3,9,1
4,11,1
