<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Disaster Tweets: Logistic Regression</h1>
</div>

Problem Type: Binary-classification

## Evaluation Metric

F1-Score

In [1]:
# Black formatter https://black.readthedocs.io/en/stable/

! pip install nb-black > /dev/null

%load_ext lab_black

[0m

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Configuration</h1>
</div>

In [2]:
data_dir = "../input/nlp-getting-started/"

In [3]:
TARGET = "target"

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Import Libraries</h1>
</div>

A best practise is to include all libraries here.  However, I will put a few imports farther down where they are first used so beginners can learn with an "as needed" approach.

In [4]:
import os
import time
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import f1_score

# Visualization Libraries
import matplotlib.pylab as plt
import seaborn as sns

import nltk
from nltk.corpus import stopwords

from sklearn.linear_model import LogisticRegression

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Library</h1>
</div>

Creating a few functions that we will reuse in each project.

In [5]:
def read_data(path):
    data_dir = Path(path)

    train = pd.read_csv(data_dir / "train.csv")
    test = pd.read_csv(data_dir / "test.csv")
    submission_df = pd.read_csv(data_dir / "sample_submission.csv")

    print(f"train data: Rows={train.shape[0]}, Columns={train.shape[1]}")
    print(f"test data : Rows={test.shape[0]}, Columns={test.shape[1]}")
    return train, test, submission_df

In [6]:
def create_submission(model_name, target, preds, is_log_target=False):
    if is_log_target:
        #         preds = np.exp(preds)
        preds = np.expm1(preds)

    sample_submission[target] = preds
    if len(model_name) > 0:
        sample_submission.to_csv(f"submission_{model_name}.csv", index=False)
    else:
        sample_submission.to_csv(f"submission.csv", index=False)

    return sample_submission

In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score


def show_scores(gt, yhat):
    f1 = f1_score(gt, yhat, average="macro")
    mse = mean_squared_error(gt, yhat)
    mae = mean_absolute_error(gt, yhat)
    rmse = np.sqrt(mse)
    #     r2_square = r2_score(gt, yhat)

    print(f"f1: {f1:.4f}")

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Load Train/Test Data</h1>
</div>

- train.csv - Data used to build our machine learning model
- test.csv - Data used to build our machine learning model. Does not contain the target variable
- sample_submission.csv - A file in the proper format to submit test predictions

In [8]:
train, test, sample_submission = read_data(data_dir)

train data: Rows=7613, Columns=5
test data : Rows=3263, Columns=4


In [9]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Clean Data</h1>
</div>

In [10]:
import re
import string


def remove_URL(text):
    url = re.compile(r"https?://\S+")
    return url.sub(r" httpsmark ", text)


def remove_html(text):
    html = re.compile(r"<.*?>")
    return html.sub(r"", text)


def remove_atsymbol(text):
    name = re.compile(r"@\S+")
    return name.sub(r" atsymbol ", text)


def remove_hashtag(text):
    hashtag = re.compile(r"#")
    return hashtag.sub(r" hashtag ", text)


def remove_exclamation(text):
    exclamation = re.compile(r"!")
    return exclamation.sub(r" exclamation ", text)


def remove_question(text):
    question = re.compile(r"?")
    return question.sub(r" question ", text)


def remove_punc(text):
    return text.translate(str.maketrans("", "", string.punctuation))


def remove_number(text):
    number = re.compile(r"\d+")
    return number.sub(r" number ", text)


def remove_emoji(string):
    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002500-\U00002BEF"  # chinese char
        "\U00002702-\U000027B0"
        "\U00002702-\U000027B0"
        "\U000024C2-\U0001F251"
        "\U0001f926-\U0001f937"
        "\U00010000-\U0010ffff"
        "\u2640-\u2642"
        "\u2600-\u2B55"
        "\u200d"
        "\u23cf"
        "\u23e9"
        "\u231a"
        "\ufe0f"  # dingbats
        "\u3030"
        "]+",
        flags=re.UNICODE,
    )
    return emoji_pattern.sub(r" emoji ", string)

In [11]:
train["text"] = train["text"].str.lower()
train["text"] = train["text"].apply(lambda text: remove_URL(text))
train["text"] = train["text"].apply(lambda text: remove_html(text))
train["text"] = train["text"].apply(lambda text: remove_atsymbol(text))
train["text"] = train["text"].apply(lambda text: remove_hashtag(text))
train["text"] = train["text"].apply(lambda text: remove_exclamation(text))
train["text"] = train["text"].apply(lambda text: remove_punc(text))
train["text"] = train["text"].apply(lambda text: remove_number(text))
train["text"] = train["text"].apply(lambda text: remove_emoji(text))


test["text"] = test["text"].str.lower()
test["text"] = test["text"].apply(lambda text: remove_URL(text))
test["text"] = test["text"].apply(lambda text: remove_html(text))
test["text"] = test["text"].apply(lambda text: remove_atsymbol(text))
test["text"] = test["text"].apply(lambda text: remove_hashtag(text))
test["text"] = test["text"].apply(lambda text: remove_exclamation(text))
test["text"] = test["text"].apply(lambda text: remove_punc(text))
test["text"] = test["text"].apply(lambda text: remove_number(text))
test["text"] = test["text"].apply(lambda text: remove_emoji(text))

In [12]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,our deeds are the reason of this hashtag eart...,1
1,4,,,forest fire near la ronge sask canada,1
2,5,,,all residents asked to shelter in place are be...,1
3,6,,,number people receive hashtag wildfires eva...,1
4,7,,,just got sent this photo from ruby hashtag al...,1


In [13]:
def clean_text(text):
    """Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers."""
    text = text.lower()
    text = re.sub("\[.*?\]", "", text)
    text = re.sub("https?://\S+|www\.\S+", "", text)
    text = re.sub("<.*?>+", "", text)
    text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
    text = re.sub("\n", "", text)
    text = re.sub("\w*\d\w*", "", text)
    return text

In [14]:
def remove_stopwords(text):
    words = [w for w in text if w not in stopwords.words("english")]
    return words


# train["text"] = train["text"].apply(lambda x: remove_stopwords(x))
# test["text"] = test["text"].apply(lambda x: remove_stopwords(x))
# train.head()

- https://www.kaggle.com/code/parulpandey/getting-started-with-nlp-a-general-intro
- https://www.kaggle.com/code/saipkb86/disaster-tweets-logistic-naive
- https://www.kaggle.com/code/rhodiumbeng/logistic-regression-baseline

In [15]:
def text_preprocessing(text):
    """
    Cleaning and parsing the text.

    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")

    nopunc = clean_text(text)
    tokenized_text = tokenizer.tokenize(nopunc)
    remove_stopwords = [
        w for w in tokenized_text if w not in stopwords.words("english")
    ]
    combined_text = " ".join(remove_stopwords)
    return combined_text

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# count_vectorizer = CountVectorizer()
# train_vectors = count_vectorizer.fit_transform(train['text'])
# test_vectors = count_vectorizer.transform(test["text"])

# ## Keeping only non-zero elements to preserve space
# print(train_vectors[0].todense())

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Features</h1>
</div>

In [17]:
FEATURES = [
    col
    for col in train.columns
    if col
    not in [
        "Id",
        TARGET,
        #         TARGET_ENC,
    ]
]

In [18]:
y = train[TARGET]
X = train[FEATURES].copy()

X_test = test[FEATURES].copy()

Tokenize then pad sequence.

### Vectorization vs Tokenization

- https://datascience.stackexchange.com/questions/104859/what-is-the-difference-between-textvectorization-and-tokenizer
- https://stackoverflow.com/questions/71002866/difference-between-tokenizer-and-textvectorization-layer-in-tensorflow
- [You should try the new TensorFlow’s TextVectorization layer](https://towardsdatascience.com/you-should-try-the-new-tensorflows-textvectorization-layer-a80b3c6b00ee)

In [19]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)
X_train.shape, y_train.shape, X_valid.shape, y_valid.shape

((6851, 4), (6851,), (762, 4), (762,))

In [20]:
type(X_train)

pandas.core.frame.DataFrame

In [21]:
X_train.head()

Unnamed: 0,id,keyword,location,text
4620,6568,injury,"Plano, Texas",mcfadden reportedly to test hamstring thursday...
2858,4107,drought,Nigeria,w nema warns nigerians to prepare for drought ...
3098,4448,electrocuted,,when i was cooking earlier i got electrocuted ...
3751,5330,fire,Canada,im on fire httpsmark
5285,7552,outbreak,Indonesia,more than number families affected by the fa...


In [22]:
X_train = X_train.text
X_valid = X_valid.text
X_test = X_test.text

In [23]:
X_train.head()

4620    mcfadden reportedly to test hamstring thursday...
2858    w nema warns nigerians to prepare for drought ...
3098    when i was cooking earlier i got electrocuted ...
3751                              im on fire   httpsmark 
5285    more than  number  families affected by the fa...
Name: text, dtype: object

In [24]:
count_vectorizer = CountVectorizer(stop_words="english")

X_train_dtm = count_vectorizer.fit_transform(X_train)
X_valid_dtm = count_vectorizer.transform(X_valid)
X_test_dtm = count_vectorizer.transform(X_test)


X_train_dtm, X_valid_dtm

(<6851x13561 sparse matrix of type '<class 'numpy.int64'>'
 	with 63791 stored elements in Compressed Sparse Row format>,
 <762x13561 sparse matrix of type '<class 'numpy.int64'>'
 	with 6350 stored elements in Compressed Sparse Row format>)

<div style="background-color:rgba(128, 0, 128, 0.6);border-radius:5px;display:fill"><h1 style="text-align: center;padding: 12px 0px 12px 0px;">Create Models</h1>
</div>

In [25]:
model = LogisticRegression(C=1.0, random_state=42)
model.fit(X_train_dtm, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=42)

## Predict and Score

In [26]:
# model.evaluate(X_valid, y_valid)

In [27]:
valid_preds = model.predict(X_valid_dtm)
valid_preds[:5]

array([0, 0, 0, 0, 0])

In [28]:
valid_preds[:5]

array([0, 0, 0, 0, 0])

In [29]:
valid_preds.squeeze()[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [30]:
valid_preds.ravel()[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1])

In [31]:
preds = model.predict(X_test_dtm).ravel()
preds[:5]

array([1, 1, 1, 1, 1])

In [32]:
preds = (preds > 0.5).astype(int)
np.unique(preds)

array([0, 1])

## Submission

In [33]:
# Different TARGET name used in test.csv
ss = create_submission("", TARGET, preds)
ss[:5]

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [34]:
ss[TARGET].value_counts()

0    2094
1    1169
Name: target, dtype: int64