# Facciola NLP Disaster Tweet Model

- In this competition we are building an NLP model to predict whether a Tweet is about a real disaster or not. 

In [25]:
import warnings
import os
import pandas as pd
import numpy as np


warnings.filterwarnings('ignore')
DATA_DIR = os.path.join(os.getcwd(),'data') if os.environ['COMPUTERNAME'] == 'NFACCIOL-MOBL' else "/kaggle/input/nlp-getting-started/"

## Import the training data

In [26]:
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Import the test data

In [27]:
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## EDA
- examine the structure of the data

In [28]:
print("Train set info")
print(train_df.info())
print()
print("Test set info")
print(test_df.info())

Train set info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None

Test set info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB
None


## Data Cleaning
- here we clean the text data by removing unneccssary characters, handling missing values, and normalizing text

In [29]:
import re
import nltk
from nltk.corpus import stopwords

nltk.set_proxy('http://proxy-dmz.intel.com:911/')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#print(stop_words)

def clean_text(text):
   #remove URLS
   text = re.sub(r'http\S+', '', text)
   #remove HTML tags
   text = re.sub(r'<.*?>', '', text)
   # Remove non-alphanumeric characters except hashtags and mentions
   text = re.sub(r'[^a-zA-Z0-9\s#@]', '', text)
   # Convert to lowercase
   text = text.lower()
   # Remove stopwords
   text = ' '.join([word for word in text.split() if word not in stop_words])
   return text

train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text'] = test_df['text'].apply(clean_text)

train_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nfacciol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deeds reason #earthquake may allah forgive us
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive #wildfires evacuation ord...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby #alaska smoke #wildfires p...


## Feature Engineering
- **text length**: Calculate the length of each tweet. This can help capture information about tweet complexity or verbosity.
- **word count**: Count the number of words in each tweet, which may provide insight into tweet structure.
- **hashtag count**: Count the number of hashtags in each tweet, as this can be indicative of topic relevance or trending discussions.
- **mention count**: Count the number of user mentions, which can indicate the tweet's engagement level.
- **hasUrl**: Create a binary feature indicating whether the tweet contains a URL.
- **sentiment score**: Use a pre-trained sentiment analyzer to get a sentiment score for each tweet.
- **pos tags**: Count the occurrence of different parts of speech in each tweet.
- **profanity count**: Count the number of profane words in each tweet using a predefined list of profane words.

In [30]:
train_df['text_length'] = train_df['text'].apply(len)
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length
5690,8120,rescued,"Winston-Salem, NC",'You can only be rescued from where you actual...,0,rescued actually pretend giorgio hiatt,107
3078,4415,electrocute,,Electric vs Gas brewing (not wanting to electr...,0,electric vs gas brewing wanting electrocute qu...,91
4835,6882,mass%20murder,New Sweden,The media needs to stop publicizing mass murde...,0,media needs stop publicizing mass murder many ...,135
3600,5140,fatal,,11-Year-Old Boy Charged With Manslaughter of T...,1,11yearold boy charged manslaughter toddler rep...,136
736,1066,bleeding,,@SoDamnTrue we know who u are you're a bleedi...,0,@sodamntrue know u youre bleeding heart wannab...,76


In [31]:
train_df['word_count'] = train_df['text'].apply(lambda x: len(x.split()))
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count
1595,2303,cliff%20fall,The Netherlands,#NowPlaying * Cliff Richard - I Could Easily F...,0,#nowplaying cliff richard could easily fall lo...,137,21
6198,8846,smoke,Rio de Janeiro,smoke whatever you got,0,smoke whatever got,22,4
3062,4393,earthquake,London,'There was a small earthquake in LA but don't ...,1,small earthquake la dont worry emmy rossum fine,72,14
226,321,annihilated,the own zone layer,day 1 of tryouts went good minus the fact I st...,0,day 1 tryouts went good minus fact stopped qui...,123,24
4519,6421,hurricane,The Globe,HURRICANE GUILLERMO LIVE NOAA TRACKING / LOOPI...,1,hurricane guillermo live noaa tracking looping...,134,13


In [32]:
train_df['hashtag_count'] = train_df['text'].apply(lambda x: len([w for w in x.split() if w.startswith('#')]))
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count,hashtag_count
1150,1657,bombing,,@snapharmony : Bells toll in Hiroshima as Japa...,1,@snapharmony bells toll hiroshima japan marks ...,106,15,0
1387,2000,bush%20fires,"Sydney, Australia",SMH photographer Wolter Peeters was on the fro...,1,smh photographer wolter peeters front line nsw...,139,18,0
4874,6939,mass%20murderer,"Tama, Iowa",Nazi Mass Murderer Became Chairman At Vaccine ...,1,nazi mass murderer became chairman vaccine dru...,87,11,0
1857,2670,crush,GLOBAL,Had a minute alone with my crush??...it was an...,0,minute alone crushit overrated experiencesmh,73,11,0
3474,4969,explosion,Germany,I liked a @YouTube video http://t.co/bGAJ2oAX1...,1,liked @youtube video huge building explosion 2...,101,16,0


In [33]:
train_df['mention_count'] = train_df['text'].apply(lambda x: len([w for w in x.split() if w.startswith('@')]))
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count,hashtag_count,mention_count
7228,10350,weapons,Incognito,WOOOOOOO RT @GameRant: Call of Duty: Black Ops...,0,wooooooo rt @gamerant call duty black ops 3 es...,132,17,0,1
3375,4833,evacuation,USA,Bend Post Office roofers cut gas line prompt e...,1,bend post office roofers cut gas line prompt e...,80,11,0,0
6452,9231,suicide%20bombing,EARTH,@NBCPolitics RUSSIA AND THAT BACK FIRED NOW 20...,1,@nbcpolitics russia back fired 2015 look happe...,137,21,0,1
6225,8886,smoke,,I miss Josie cause I wanna smoke splifs and go...,0,miss josie cause wanna smoke splifs go taco bell,62,14,0,0
5099,7274,nuclear%20disaster,,3 Former Executives To Be Prosecuted In Fukush...,1,3 former executives prosecuted fukushima nucle...,89,11,0,0


In [34]:
train_df['has_url'] = train_df['text'].apply(lambda x: 1 if re.search("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", x) else 0)
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count,hashtag_count,mention_count,has_url
6496,9288,sunk,,Everything has sunk in except the fact that I ...,0,everything sunk except fact actually moving st...,140,27,0,0,0
4665,6631,inundated,"Paducah, KY",@Bilsko and suddenly I'm inundated with resear...,1,@bilsko suddenly im inundated research @humoft...,63,8,0,2,0
2805,4033,disaster,,@cncpts @SOLELINKS what a disaster - can't say...,0,@cncpts @solelinks disaster cant say im surprised,60,10,0,2,0
6727,9640,thunderstorm,,Falling asleep to the sounds to thousands of R...,0,falling asleep sounds thousands river plate fa...,111,18,1,0,0
2222,3180,deluge,London,Perhaps 'historic' should be applied not to th...,1,perhaps historic applied deluge recently expos...,136,21,1,0,0


In [35]:
from textblob import TextBlob
train_df['sentiment_score'] = train_df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count,hashtag_count,mention_count,has_url,sentiment_score
7199,10314,weapon,"California, United States",#InsaneLimits #plugin enabled @' =TPS= | T...,0,#insanelimits #plugin enabled @ tps tdm 400t h...,106,19,2,1,0,0.0
212,299,annihilated,Boksburg,@marksmaponyane Hey!Sundowns were annihilated ...,0,@marksmaponyane heysundowns annihilated previo...,110,13,0,1,0,-0.166667
3843,5468,flames,"Manhattan, NY",'if you can't summon the flames directly from ...,0,cant summon flames directly hell store bought ...,87,14,0,0,0,0.258333
3375,4833,evacuation,USA,Bend Post Office roofers cut gas line prompt e...,1,bend post office roofers cut gas line prompt e...,80,11,0,0,1,0.0
6632,9499,terrorist,,HereÛªs how media in Pakistan covered the cap...,1,heres media pakistan covered capture terrorist...,101,13,0,0,1,0.0


In [36]:
import spacy
nlp = spacy.load('en_core_web_sm')
train_df['noun_count'] = train_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'NOUN' or token.pos_ == 'PROPN']))
train_df['verb_count'] = train_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'VERB']))
train_df['adverb_count'] = train_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'ADV']))
train_df['adjective_count'] = train_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'ADJ']))
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count,hashtag_count,mention_count,has_url,sentiment_score,noun_count,verb_count,adverb_count,adjective_count
837,1215,blizzard,United States,@LoneWolffur control yourself tora,0,@lonewolffur control tora,34,4,0,1,0,0.0,2,1,0,0
3378,4836,evacuation,UK,FAAN gives owners of abandoned aircraft evacua...,1,faan gives owners abandoned aircraft evacuatio...,131,16,0,1,1,0.0,6,4,1,0
423,613,arsonist,NYC :) Ex- #Islamophobe,#Vegetarian #Vegan Video shows arsonist torchi...,0,#vegetarian #vegan video shows arsonist torchi...,136,14,4,0,1,0.6,9,3,3,1
3326,4765,evacuated,"Chicago, IL",Green line service on south side disrupted aft...,1,green line service south side disrupted cta tr...,134,15,0,0,1,-0.2,9,3,0,2
349,501,army,,22.Beyonce Is my pick for http://t.co/thoYhrHk...,0,22beyonce pick fan army #beyhive,89,10,1,0,1,0.0,7,0,0,0


In [37]:
from better_profanity import profanity

train_df['profanity_count'] = train_df['text'].apply(lambda x: len([w for w in x if w in profanity.CENSOR_WORDSET]))
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count,hashtag_count,mention_count,has_url,sentiment_score,noun_count,verb_count,adverb_count,adjective_count,profanity_count
3341,4785,evacuated,"Queensland, Australia",Passengers evacuated &amp; lanes blocked off a...,1,passengers evacuated amp lanes blocked power l...,131,18,0,1,1,-0.155556,8,3,0,0,0
2725,3914,devastated,London,ÛÏRichmond Coaches were devastated to hear of...,1,richmond coaches devastated hear death second ...,139,20,0,0,1,0.0,9,2,0,1,0
4147,5894,harm,where the wild things are,I concur. The longer you spend with your child...,0,concur longer spend child harm mmk,90,14,0,0,1,0.5,3,3,1,1,0
4942,7044,mayhem,107-18 79TH STREET,#NoSurrender Results: Full Metal Mayhem World ...,0,#nosurrender results full metal mayhem world t...,135,21,1,0,1,0.4875,12,2,0,2,0
5889,8410,sandstorm,USA,Watch This Airport Get Swallowed Up By A Sands...,1,watch airport get swallowed sandstorm minute,91,14,0,0,1,0.0,5,2,0,0,0


### Test set Feature engineering
- now apply the same to the test set

In [38]:
test_df['text_length'] = test_df['text'].apply(len)
test_df['word_count'] = test_df['text'].apply(lambda x: len(x.split()))
test_df['hashtag_count'] = test_df['text'].apply(lambda x: len([w for w in x.split() if w.startswith('#')]))
test_df['mention_count'] = test_df['text'].apply(lambda x: len([w for w in x.split() if w.startswith('@')]))
test_df['has_url'] = test_df['text'].apply(lambda x: 1 if re.search("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", x) else 0)
test_df['sentiment_score'] = test_df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
test_df['noun_count'] = test_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'NOUN' or token.pos_ == 'PROPN']))
test_df['verb_count'] = test_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'VERB']))
test_df['adverb_count'] = test_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'ADV']))
test_df['adjective_count'] = test_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'ADJ']))
test_df['profanity_count'] = test_df['text'].apply(lambda x: len([w for w in x if w in profanity.CENSOR_WORDSET]))

In [40]:
test_df.sample(5)

Unnamed: 0,id,keyword,location,text,clean_text,text_length,word_count,hashtag_count,mention_count,has_url,sentiment_score,noun_count,verb_count,adverb_count,adjective_count,profanity_count
1032,3384,demolition,,@Johnny_Detroit Tag Team for me was Demolition...,@johnnydetroit tag team demolition awesome int...,117,17,0,1,0,0.25,8,2,2,1,0
972,3224,deluged,,Businesses a e deluged with invoices. Make you...,businesses e deluged invoices make standwout c...,132,25,0,0,0,0.25,8,3,0,1,0
634,2066,casualty,among the socially awkward ?,@5SOSFamUpdater social casualty,@5sosfamupdater social casualty,31,3,0,1,0,0.033333,1,1,0,1,0
131,416,apocalypse,ColoRADo,@TMFK_CO sounds like a terrible time. I'll be ...,@tmfkco sounds like terrible time ill right,58,10,0,1,0,-0.357143,2,1,2,1,0
2658,8879,smoke,,[55432] 1950 LIONEL TRAINS SMOKE LOCOMOTIVES W...,55432 1950 lionel trains smoke locomotives mag...,123,11,0,0,1,0.0,7,1,0,1,0


## TF-IDF Vectorization
- Convert the cleaned text data into numerical features using TF-IDF

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=10000)

# Fit and transform the training data
X_train_tfidf = tfidf.fit_transform(train_df['clean_text'])

# Transform the test data
X_test_tfidf = tfidf.transform(test_df['clean_text'])

## BERT Embeddings
- Generate BERT embeddings for the text data

In [42]:
from transformers import BertModel, BertTokenizer
import torch

#load tokenizer and BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-based-uncased')

#tokenize and encode the text
# Tokenize and encode the text
def get_bert_embeddings(text_list):
   inputs = tokenizer(text_list, return_tensors='pt', padding=True, truncation=True, max_length=512)
   with torch.no_grad():
      outputs = bert_model(**inputs)
   return outputs.last_hidden_state[:, 0, :].numpy()

# Get BERT embeddings for train and test data
X_train_bert = get_bert_embeddings(train_df['clean_text'].tolist())
X_test_bert = get_bert_embeddings(test_df['clean_text'].tolist())

OSError: Can't load tokenizer for 'bert-base-uncased'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'bert-base-uncased' is the correct path to a directory containing all relevant files for a BertTokenizer tokenizer.

In [None]:
X_train_combined = np.hstack((X_train_tfidf.toarray(), X_train_bert, 
                              train_df[['text_length', 'word_count', 'hashtag_count', 'mention_count', 'has_url', 
                                       'sentiment_score', 'noun_count', 'verb_count', 'adverb_count', 'adjective_count', 'profanity_count']].values))