## Data Preprocessing

In [1]:
# Import libraries
import numpy as np # linear algebra
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import re #regular expressions
from textblob import TextBlob
import spacy
import spacy_curated_transformers
import spacy_transformers
# spacy.load('en_core_web_md')

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
# Load data from a CSV file into a DataFrame
df_train = pd.read_csv('../input/nlp-getting-started/train.csv')
df_test = pd.read_csv('../input/nlp-getting-started/test.csv')
# Concatenate both training and test data
data = pd.concat([df_train, df_test]).reset_index(drop=True)

In [3]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

In [4]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

#Function to remove irrelevant data
def clean_tweet(tweet):

    """
    Regex expressions website https://regex101.com/
    """

    # Capital letters to lowercase
    tweet = tweet.lower()
    # Remove URLs
    tweet = re.sub(r'http\S+', '', tweet)
    # Remove user mentions
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Remove punctuation
    tweet = re.sub('[%s]' % re.escape(string.punctuation), '', tweet)
    # Remove #RT
    tweet = re.sub(r'\b(rt)\b','',tweet)
    # Remove words containing numbers
    tweet = re.sub('\w*\d\w*' , '', tweet)
    # Remove single characters
    tweet = re.sub(r'\b[a-zA-Z]\b','',tweet)
    # Reduce repeated characters (e.g., "goooooaaaaal" to "goal")
    tweet = re.sub(r'(.)\1{2,}', r'\1', tweet) # for sentiment analysis exaggerated characters may be essential !!!
    # Remove stopwords
    tweet = ' '.join([word for word in tweet.split() if word not in stop_words])
    # Apply lemmatization
    tweet = ' '.join([lemmatizer.lemmatize(word) for word in tweet.split()])  # Lemmatization

    return tweet

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anna_verbytska/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Replace tweets with clean text
data['text'] = [clean_tweet(tweet) for tweet in data['text']]
data.head(20)

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquake may allah forgive,1.0
1,4,,,forest fire near la ronge sask canada,1.0
2,5,,,resident asked shelter place notified officer ...,1.0
3,6,,,people receive wildfire evacuation order calif...,1.0
4,7,,,got sent photo ruby alaska smoke wildfire pour...,1.0
5,8,,,rockyfire update california hwy closed directi...,1.0
6,10,,,flood disaster heavy rain cause flash flooding...,1.0
7,13,,,im top hill see fire wood,1.0
8,14,,,emergency evacuation happening building across...,1.0
9,15,,,im afraid tornado coming area,1.0


In [7]:
#Function to remove irrelevant data
def clean_location(location):

    """
    Regex expressions website https://regex101.com/
    """

    location = str(location)
    # Capital letters to lowercase
    location = location.lower()
    # Remove URLs
    location = re.sub(r'http\S+', '', location)
    # Remove user mentions
    location = re.sub(r'@[^\s]+', '', location)
    # Remove hashtags
    location = re.sub(r'#([^\s]+)', r'\1', location)
    # Remove punctuation
    location = re.sub('[%s]' % re.escape(string.punctuation), '', location)
    # Remove #RT
    location = re.sub(r'\b(rt)\b','',location)
    # Remove words containing numbers
    location = re.sub('\w*\d\w*' , '', location)
    # Remove single characters
    location = re.sub(r'\b[a-zA-Z]\b','',location)
    # Reduce repeated characters
    location = re.sub(r'(.)\1{2,}', r'\1', location) # for sentiment analysis exaggerated characters may be essential !!!
    # Remove stopwords
    location = ' '.join([word for word in location.split() if word not in stop_words])
    # Apply lemmatization
    location = ' '.join([lemmatizer.lemmatize(word) for word in location.split()])  # Lemmatization

    return location

In [8]:
# Replace tweets with clean text
data['location'] = [clean_location(location) for location in data['location']]
data['location'].unique()

array(['nan', 'birmingham', 'est september bristol', ..., 'love reiss',
       'acey mountain islanddåçtorontoåè', 'brussels belgium'],
      dtype=object)

In [9]:
data.head(5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquake may allah forgive,1.0
1,4,,,forest fire near la ronge sask canada,1.0
2,5,,,resident asked shelter place notified officer ...,1.0
3,6,,,people receive wildfire evacuation order calif...,1.0
4,7,,,got sent photo ruby alaska smoke wildfire pour...,1.0


In [10]:
# Decode the URL-encoded string to clean keyword column
from urllib.parse import unquote
data['keyword'] = data['keyword'].apply(lambda x: unquote(x) if isinstance(x, str) else x)
data['keyword'].unique()

array([nan, 'ablaze', 'accident', 'aftershock', 'airplane accident',
       'ambulance', 'annihilated', 'annihilation', 'apocalypse',
       'armageddon', 'army', 'arson', 'arsonist', 'attack', 'attacked',
       'avalanche', 'battle', 'bioterror', 'bioterrorism', 'blaze',
       'blazing', 'bleeding', 'blew up', 'blight', 'blizzard', 'blood',
       'bloody', 'blown up', 'body bag', 'body bagging', 'body bags',
       'bomb', 'bombed', 'bombing', 'bridge collapse',
       'buildings burning', 'buildings on fire', 'burned', 'burning',
       'burning buildings', 'bush fires', 'casualties', 'casualty',
       'catastrophe', 'catastrophic', 'chemical emergency', 'cliff fall',
       'collapse', 'collapsed', 'collide', 'collided', 'collision',
       'crash', 'crashed', 'crush', 'crushed', 'curfew', 'cyclone',
       'damage', 'danger', 'dead', 'death', 'deaths', 'debris', 'deluge',
       'deluged', 'demolish', 'demolished', 'demolition', 'derail',
       'derailed', 'derailment', 'desol

In [11]:
# split the training and test data again
df_train = data[data.target != -1].reset_index(drop=True) 
df_test = data[data.target == -1].reset_index(drop=True)

#### Extract GPE with NER for location column to fill in missing values

In [12]:
# Download the model
# Documentation: https://spacy.io/models/en#en_core_web_trf
trf = spacy.load('en_core_web_trf', weights_only=True)

# Create the function to return a list of mentioned organizations
def get_entities(text):
    # process the text with a SpaCy model to get named entities
    doc = trf(text)
    # initialize list to store identified organizations
    gpe_list = []

# loop through the identified entities and append entities to lists
    for entity in doc.ents:
        if entity.label_ == 'GPE':
            return entity.text  # Return the first GPE found
    return None  # Return None if no GPE found

  model.load_state_dict(torch.load(filelike, map_location=device))


In [13]:
# Apply function to extract GPEs from tweets and fill in the missing values
df_train['location'] = df_train['location'].fillna(df_train['text'].apply(get_entities))  

In [14]:
df_train.head(20)

Unnamed: 0,id,keyword,location,text,target
0,1,,,deed reason earthquake may allah forgive,1.0
1,4,,,forest fire near la ronge sask canada,1.0
2,5,,,resident asked shelter place notified officer ...,1.0
3,6,,,people receive wildfire evacuation order calif...,1.0
4,7,,,got sent photo ruby alaska smoke wildfire pour...,1.0
5,8,,,rockyfire update california hwy closed directi...,1.0
6,10,,,flood disaster heavy rain cause flash flooding...,1.0
7,13,,,im top hill see fire wood,1.0
8,14,,,emergency evacuation happening building across...,1.0
9,15,,,im afraid tornado coming area,1.0


#### Keyword extraction with KeyBERT

https://www.geeksforgeeks.org/keyword-extraction-methods-in-nlp/

In [71]:
#Installation

from keybert import KeyBERT

# Initialize the KeyBERT model
model = KeyBERT('distilbert-base-nli-mean-tokens')

# Load the data
text = df_train['text']

# Extract keywords
keywords = model.extract_keywords(text)
# Function to extract keywords using KeyBERT
def extract_keywords(text):
    keywords = model.extract_keywords(text, top_n=3)  # Extract top 1 keyword
    return keywords[0][0] if keywords else None

# Print the keywords
print("Keywords:")
for keyword in keywords:
    print(keyword)

Keywords:
[('earthquake', 0.6271), ('allah', 0.58), ('forgive', 0.5527), ('deed', 0.3725), ('reason', 0.3257)]
[('forest', 0.6954), ('canada', 0.6463), ('ronge', 0.3226), ('la', 0.3119), ('sask', 0.2225)]
[('evacuation', 0.542), ('notified', 0.4864), ('officer', 0.4445), ('shelter', 0.4377), ('expected', 0.4122)]
[('california', 0.5802), ('wildfire', 0.4514), ('evacuation', 0.3488), ('receive', 0.193), ('order', 0.1451)]
[('wildfire', 0.4829), ('alaska', 0.4034), ('ruby', 0.2238), ('smoke', 0.1695), ('school', 0.1347)]
[('wildfire', 0.5164), ('california', 0.392), ('rockyfire', 0.385), ('closed', 0.306), ('lake', 0.2863)]
[('flooding', 0.6045), ('flood', 0.5895), ('rain', 0.4811), ('disaster', 0.3368), ('flash', 0.2961)]
[('wood', 0.6708), ('hill', 0.6018), ('im', 0.4283)]
[('emergency', 0.5933), ('evacuation', 0.5307), ('street', 0.5272), ('building', 0.4006), ('happening', 0.3171)]
[('tornado', 0.7809), ('afraid', 0.5161), ('coming', 0.2231), ('im', 0.1681), ('area', 0.126)]
[('died'

In [75]:
# Fill in the missing values with extracted keywords only for disaster tweets
df_train.loc[(df_train['keyword'] == 1) & (df_train['keyword'].isna()), 'keyword'] = df_train['text'].apply(extract_keywords)

In [77]:
df_train.head(50)

Unnamed: 0,id,keyword,location,text,target,clean
0,1,earthquake,,deed reason earthquake may allah forgive u,1.0,
1,4,forest,,forest fire near la ronge sask canada,1.0,
2,5,evacuation,,resident asked shelter place notified officer ...,1.0,
3,6,california,,people receive wildfire evacuation order calif...,1.0,
4,7,wildfire,,got sent photo ruby alaska smoke wildfire pour...,1.0,
5,8,wildfire,,rockyfire update california hwy closed directi...,1.0,
6,10,flooding,,flood disaster heavy rain cause flash flooding...,1.0,
7,13,wood,,im top hill see fire wood,1.0,
8,14,emergency,,there emergency evacuation happening building ...,1.0,
9,15,tornado,,im afraid tornado coming area,1.0,


In [73]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10876 entries, 0 to 10875
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        10876 non-null  int64  
 1   keyword   10875 non-null  object 
 2   location  10876 non-null  object 
 3   text      10876 non-null  object 
 4   target    7613 non-null   float64
 5   clean     7265 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 509.9+ KB


#### Extract locations with NER

# Import the spaCy model
nlp = en_core_web_md.load()