Submission to Natural Language Processing with Disaster Tweets

August 1, 2023

Competition link: https://www.kaggle.com/competitions/nlp-getting-started/overview

Code references personal notes from https://www.educative.io/courses/mastering-spacy/

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [2]:
import random
import spacy
from spacy.training import Example
from spacy.pipeline.textcat import DEFAULT_SINGLE_TEXTCAT_MODEL

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [4]:
train_df[pd.notna(train_df['keyword'])].head()

Unnamed: 0,id,keyword,location,text,target
31,48,ablaze,Birmingham,@bbcmtd Wholesale Markets ablaze http://t.co/l...,1
32,49,ablaze,Est. September 2012 - Bristol,We always try to bring the heavy. #metal #RT h...,0
33,50,ablaze,AFRICA,#AFRICANBAZE: Breaking news:Nigeria flag set a...,1
34,52,ablaze,"Philadelphia, PA",Crying out for more! Set me ablaze,0
35,53,ablaze,"London, UK",On plus side LOOK AT THE SKY LAST NIGHT IT WAS...,0


In [5]:
train_df.shape

(7613, 5)

In [6]:
test_df[pd.notna(test_df['keyword'])].head()

Unnamed: 0,id,keyword,location,text
15,46,ablaze,London,Birmingham Wholesale Market is ablaze BBC News...
16,47,ablaze,Niall's place | SAF 12 SQUAD |,@sunkxssedharry will you wear shorts for race ...
17,51,ablaze,NIGERIA,#PreviouslyOnDoyinTv: Toke MakinwaÛªs marriag...
18,58,ablaze,Live On Webcam,Check these out: http://t.co/rOI2NSmEJJ http:/...
19,60,ablaze,"Los Angeles, Califnordia",PSA: IÛªm splitting my personalities.\n\n?? t...


In [7]:
# https://www.geeksforgeeks.org/divide-a-dataframe-in-a-ratio/
train_df_part = train_df.sample(frac = 0.8)
val_df = train_df.drop(train_df_part.index)
train_df_part.shape

(6090, 5)

In [8]:
val_df.shape

(1523, 5)

### Create training dataset

In [9]:
import re
def text_preprocess(text):
    text = text.lower()
    text = re.sub('[^a-z\s]', '', text)
    return text.strip()

In [10]:
def df_to_spacy_train(df):
    train_data = []
    for ind, row in df.iterrows():
        dic = {'positive': 1, 'negative': 0} if row['target'] else {"positive": 0, "negative": 1}
        train_data.append((text_preprocess(row['text']), {'cats': dic}))
    return train_data

In [11]:
train_data = df_to_spacy_train(train_df_part)
train_data[:2]

[('thankkk u all soo much for flooding my notificationsu my fella parsholics r superb amp jus soo awesomelove love u all always frvrgrateful',
  {'cats': {'positive': 0, 'negative': 1}}),
 ('byuwnbeki the sad eyes and tacit stories in your heart that night in which the whirlwind was raging',
  {'cats': {'positive': 1, 'negative': 0}})]

In [12]:
nlp = spacy.load("en_core_web_sm")
config = {
    "threshold": 0.5,
    "model": DEFAULT_SINGLE_TEXTCAT_MODEL
}
textcat = nlp.add_pipe("textcat", config=config)

In [13]:
textcat.add_label("positive")
textcat.add_label("negative")

# initializes the TextCategorizer model's weights with the training examples
train_examples = [Example.from_dict(nlp.make_doc(text), label) for text, label in train_data]
textcat.initialize(lambda: train_examples, nlp=nlp)

### Train

In [14]:
epochs=5

with nlp.select_pipes(enable="textcat"):
    optimizer = nlp.resume_training()
    
    for i in range(epochs):
        random.shuffle(train_data)
        for text, label in train_data:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, label)
            nlp.update([example], sgd=optimizer)

In [15]:
val_df.head()

Unnamed: 0,id,keyword,location,text,target
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
10,16,,,Three people died from the heat wave so far,1
12,18,,,#raining #flooding #Florida #TampaBay #Tampa 1...,1
22,33,,,Love skiing,0


In [16]:
def test_model(df):
    predictions = []
    for ind, row in df.iterrows():
        doc = nlp(text_preprocess(row['text']))
        pred = 1 if doc.cats['positive'] > doc.cats['negative'] else 0
        predictions.append(pred)
    df['prediction'] = predictions
    return df

In [17]:
prediction = test_model(val_df)

In [18]:
prediction.head()

Unnamed: 0,id,keyword,location,text,target,prediction
3,6,,,"13,000 people receive #wildfires evacuation or...",1,1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0
10,16,,,Three people died from the heat wave so far,1,1
12,18,,,#raining #flooding #Florida #TampaBay #Tampa 1...,1,0
22,33,,,Love skiing,0,0


In [19]:
# https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html
tp = len(prediction[(prediction['target']==1) & (prediction['prediction']==1)])
tp

429

In [20]:
tn = len(prediction[(prediction['target']==0) & (prediction['prediction']==0)])
tn

724

In [21]:
fp = len(prediction[(prediction['target']==0) & (prediction['prediction']==1)])
fp

139

In [22]:
fn = len(prediction[(prediction['target']==1) & (prediction['prediction']==0)])
fn

231

In [23]:
precision = tp/(tp+fp)
recall = tp/(tp+fn)
accuracy = (tp+tn)/len(prediction)
f1 = 2*precision*recall/(precision+recall)

In [24]:
print('precision: ', precision)
print('recall: ', recall)
print('accuracy: ', accuracy)
print('f1: ', f1)

precision:  0.7552816901408451
recall:  0.65
accuracy:  0.7570584372948129
f1:  0.6986970684039088


#### preprocess only the val set:
    precision:  0.7044917257683215
    recall:  0.9254658385093167
    accuracy:  0.804333552199606
    f1:  0.7999999999999999
#### no preprocess at all:
    precision:  0.7813299232736572
    recall:  0.9487577639751553
    accuracy:  0.866053841103086
    f1:  0.8569424964936886