# Facciola NLP Disaster Tweet Model

- In this competition we are building an NLP model to predict whether a Tweet is about a real disaster or not. 

In [1]:
import warnings
import os
import pandas as pd
import numpy as np


warnings.filterwarnings('ignore')
DATA_DIR = os.path.join(os.getcwd(),'data') if os.environ['COMPUTERNAME'] == 'NFACCIOL-MOBL' else "/kaggle/input/nlp-getting-started/"

## Import the training data

In [2]:
train_df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


## Import the test data

In [3]:
test_df = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## EDA
- examine the structure of the data

In [4]:
print("Train set info")
print(train_df.info())
print()
print("Test set info")
print(test_df.info())

Train set info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None

Test set info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB
None


## Data Cleaning
- here we clean the text data by removing unneccssary characters, handling missing values, and normalizing text

In [5]:
import re
import nltk
from nltk.corpus import stopwords

nltk.set_proxy('http://proxy-dmz.intel.com:911/')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
#print(stop_words)

def clean_text(text):
   #remove URLS
   text = re.sub(r'http\S+', '', text)
   #remove HTML tags
   text = re.sub(r'<.*?>', '', text)
   # Remove non-alphanumeric characters except hashtags and mentions
   text = re.sub(r'[^a-zA-Z0-9\s#@]', '', text)
   # Convert to lowercase
   text = text.lower()
   # Remove stopwords
   text = ' '.join([word for word in text.split() if word not in stop_words])
   return text

train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text'] = test_df['text'].apply(clean_text)

train_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nfacciol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,keyword,location,text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,deeds reason #earthquake may allah forgive us
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,residents asked shelter place notified officer...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive #wildfires evacuation ord...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby #alaska smoke #wildfires p...


## Feature Engineering
- **text length**: Calculate the length of each tweet. This can help capture information about tweet complexity or verbosity.
- **word count**: Count the number of words in each tweet, which may provide insight into tweet structure.
- **hashtag count**: Count the number of hashtags in each tweet, as this can be indicative of topic relevance or trending discussions.
- **mention count**: Count the number of user mentions, which can indicate the tweet's engagement level.
- **hasUrl**: Create a binary feature indicating whether the tweet contains a URL.
- **sentiment score**: Use a pre-trained sentiment analyzer to get a sentiment score for each tweet.
- **pos tags**: Count the occurrence of different parts of speech in each tweet.
- **profanity count**: Count the number of profane words in each tweet using a predefined list of profane words.

In [6]:
train_df['text_length'] = train_df['text'].apply(len)
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length
2212,3168,deluge,617-BTOWN-BEATDOWN,Photo: forrestmankins: Colorado camping. http:...,0,photo forrestmankins colorado camping,63
751,1083,blew%20up,Indiana,My Instagram just blew up apparently I was fea...,0,instagram blew apparently featured jazz tonigh...,99
6569,9402,survivors,,Dear @POTUS In the name of humanityI apologize...,1,dear @potus name humanityi apologized #hiroshi...,136
1865,2680,crush,"Washington, DC NATIVE",#MrRobinson is giving me #TheSteveHarveyShow v...,0,#mrrobinson giving #thesteveharveyshow vibe mu...,137
2025,2908,danger,ayr,Danger of union bears http://t.co/lhdcpNZx6A,0,danger union bears,44


In [7]:
train_df['word_count'] = train_df['text'].apply(lambda x: len(x.split()))
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count
5328,7607,pandemonium,,I'll be at SFA very soon....#Pandemonium http:...,1,ill sfa soon#pandemonium,63,7
2811,4043,disaster,USA,DISASTER AVERTED: Police kill gunman with Û÷h...,0,disaster averted police kill gunman hoax devic...,93,10
1881,2702,crush,,kenny holland crush da vida,0,kenny holland crush da vida,27,5
84,121,accident,"South Bloomfield, OH",Accident in #Ashville on US 23 SB before SR 75...,1,accident #ashville us 23 sb sr 752 #traffic,79,12
5396,7698,panicking,,@Dirk_NoMissSki yea but if someone faints why ...,1,@dirknomissski yea someone faints panicking th...,88,14


In [8]:
train_df['hashtag_count'] = train_df['text'].apply(lambda x: len([w for w in x.split() if w.startswith('#')]))
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count,hashtag_count
2130,3058,deaths,,This why BSF Jawans died Fidayeen has AKs and ...,1,bsf jawans died fidayeen aks bloody #insas ins...,139,22,1
1838,2643,crashed,Kingswinford,I just nearly crashed my car typing 'Paul Rudd...,0,nearly crashed car typing paul rudd attacked f...,95,18,0
4916,7001,mayhem,"Orlando, FL",I guess ill never be able to go to mayhem...,0,guess ill never able go mayhem,44,10,0
4709,6695,landslide,,Landslide kills three near Venice after heavyå...,1,landslide kills three near venice heavyrain,74,8,0
4589,6527,injuries,"Georgia, U.S.A.",@msnbc What a fucking idiot. He had a gun &amp...,1,@msnbc fucking idiot gun amp hatchet yet still...,127,23,0


In [9]:
train_df['mention_count'] = train_df['text'].apply(lambda x: len([w for w in x.split() if w.startswith('@')]))
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count,hashtag_count,mention_count
4332,6152,hijack,"Port Harcourt, Nigeria",Plans by former First Lady and wife of ex-Pres...,1,plans former first lady wife expresident goodl...,136,19,0,0
134,194,aftershock,304,'The first man gets the oyster the second man ...,0,first man gets oyster second man gets shell an...,78,14,0,0
4969,7081,meltdown,"Leeds, England",Pam's Barry Island wedding meltdown ??????????,0,pams barry island wedding meltdown,46,6,0,0
5916,8447,screamed,,I JUST SCREAMED SIDJSJDJEKDJSKDJD . I CANT STA...,0,screamed sidjsjdjekdjskdjd cant stand,79,11,0,0
1111,1604,bombed,Atlanta Georgia,@WhiteHouse @POTUS Just cos Germany invaded Po...,1,@whitehouse @potus cos germany invaded poland ...,119,14,0,2


In [10]:
train_df['has_url'] = train_df['text'].apply(lambda x: 1 if re.search("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", x) else 0)
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count,hashtag_count,mention_count,has_url
7443,10651,wounds,,white ppl bruise easily.. where ur bullet woun...,0,white ppl bruise easily ur bullet wounds,80,11,0,0,1
3105,4456,electrocuted,New York,Woman electrocuted #Red #Redblood #videoclip h...,0,woman electrocuted #red #redblood #videoclip #,69,7,4,0,1
356,513,army,Studio,But if you build an army of 100 dogs and their...,1,build army 100 dogs leader lion dogs fight lik...,96,22,0,0,0
1146,1650,bombing,SWMO,Japan Marks 70th Anniversary of Hiroshima Atom...,1,japan marks 70th anniversary hiroshima atomic ...,79,9,0,0,1
2288,3283,demolish,us-east-1a,Read this already in '14 but it was and remain...,0,read already 14 remains one favorite articles ...,138,21,0,0,1


In [11]:
from textblob import TextBlob
train_df['sentiment_score'] = train_df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count,hashtag_count,mention_count,has_url,sentiment_score
30,44,,,The end!,0,end,8,2,0,0,0,0.0
6679,9571,thunder,Gander NF,Random wind gust just came through #Gander. P...,1,random wind gust came #gander probably convect...,139,23,2,0,0,-0.35
6866,9838,trauma,"Nashville, TN",Esteemed journalist recalls tragic effects of ...,1,esteemed journalist recalls tragic effects una...,140,13,2,3,1,-0.75
3034,4355,earthquake,oklahoma,Posted a new song: 'Earthquake' http://t.co/Rf...,0,posted new song earthquake,77,7,0,0,1,0.136364
1522,2198,catastrophic,Planet Earth,Learning from the Legacy of a Catastrophic Eru...,1,learning legacy catastrophic eruption new yorker,91,13,0,0,1,0.136364


In [12]:
import spacy
nlp = spacy.load('en_core_web_sm')
train_df['noun_count'] = train_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'NOUN' or token.pos_ == 'PROPN']))
train_df['verb_count'] = train_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'VERB']))
train_df['adverb_count'] = train_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'ADV']))
train_df['adjective_count'] = train_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'ADJ']))
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count,hashtag_count,mention_count,has_url,sentiment_score,noun_count,verb_count,adverb_count,adjective_count
2184,3131,debris,,Aircraft debris found on island is from MH370 ...,1,aircraft debris found island mh370 malaysia co...,86,11,0,0,1,0.0,6,2,0,0
795,1153,blight,,http://t.co/ETkd58Un8n - Cleveland Heights Sha...,0,cleveland heights shaker heights fight blight ...,114,13,0,0,1,0.0,10,1,0,0
148,212,airplane%20accident,Indonesia,#KCA #VoteJKT48ID mbataweel: #RIP #BINLADEN Fa...,1,#kca #votejkt48id mbataweel #rip #binladen fam...,95,13,4,0,0,-0.2,8,2,0,0
6318,9030,stretcher,??,Stretcher in 5 min // Speaker Deck http://t.co...,0,stretcher 5 min speaker deck,57,8,0,0,1,0.0,4,0,0,1
7050,10101,typhoon,The Peach State,I think a Typhoon just passed through here lol,1,think typhoon passed lol,46,9,0,0,0,0.8,2,2,2,0


In [13]:
from better_profanity import profanity

train_df['profanity_count'] = train_df['text'].apply(lambda x: len([w for w in x if w in profanity.CENSOR_WORDSET]))
train_df.sample(5)

Unnamed: 0,id,keyword,location,text,target,clean_text,text_length,word_count,hashtag_count,mention_count,has_url,sentiment_score,noun_count,verb_count,adverb_count,adjective_count,profanity_count
2669,3831,detonate,,@WoundedPigeon http://t.co/s9soAeVcVo Detonate...,0,@woundedpigeon detonate @apollobrown ft mop,73,7,0,2,1,0.0,5,0,0,0,0
5156,7355,obliterate,United Kingdom,@klavierstuk doesn't so LVG is forced into the...,0,@klavierstuk doesnt lvg forced market may beat...,139,25,0,1,0,-0.075,9,3,1,2,0
255,363,annihilation,United States,Are souls punished withåÊannihilation? http://...,0,souls punished withannihilation,84,6,0,0,1,0.0,2,1,0,0,0
2275,3263,demolish,,@kirkmin after listening to you demolish @Bart...,0,@kirkmin listening demolish @barthubbuch @weei...,135,22,0,3,0,0.0,9,3,0,1,0
7579,10831,wrecked,"Vancouver, Canada",Three days off from work and they've pretty mu...,0,three days work theyve pretty much wrecked hah...,107,20,0,0,0,0.216667,5,1,3,0,0


### Test set Feature engineering
- now apply the same to the test set

In [14]:
test_df['text_length'] = test_df['text'].apply(len)
test_df['word_count'] = test_df['text'].apply(lambda x: len(x.split()))
test_df['hashtag_count'] = test_df['text'].apply(lambda x: len([w for w in x.split() if w.startswith('#')]))
test_df['mention_count'] = test_df['text'].apply(lambda x: len([w for w in x.split() if w.startswith('@')]))
test_df['has_url'] = test_df['text'].apply(lambda x: 1 if re.search("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", x) else 0)
test_df['sentiment_score'] = test_df['text'].apply(lambda x: TextBlob(x).sentiment.polarity)
test_df['noun_count'] = test_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'NOUN' or token.pos_ == 'PROPN']))
test_df['verb_count'] = test_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'VERB']))
test_df['adverb_count'] = test_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'ADV']))
test_df['adjective_count'] = test_df['text'].apply(lambda x: len([token.pos_ for token in nlp(x) if token.pos_ == 'ADJ']))
test_df['profanity_count'] = test_df['text'].apply(lambda x: len([w for w in x if w in profanity.CENSOR_WORDSET]))

In [15]:
test_df.sample(5)

Unnamed: 0,id,keyword,location,text,clean_text,text_length,word_count,hashtag_count,mention_count,has_url,sentiment_score,noun_count,verb_count,adverb_count,adjective_count,profanity_count
826,2716,crushed,"Motown, WV",So in one episode they undo season 1. Kai join...,one episode undo season 1 kai joins ff ren bea...,117,23,0,0,0,-0.25,9,4,1,0,0
348,1127,blew%20up,twitch.tv/dgn_esports,The only reason why player's now have an ego i...,reason players ego cause mw3 cod champs thats ...,140,27,0,0,0,0.0,10,3,3,1,0
325,1056,bleeding,"Louisville, KY",Looks like Reynolds and Montano coming in. Nee...,looks like reynolds montano coming need stop b...,102,18,0,0,0,0.55,4,4,2,1,0
1711,5774,forest%20fires,,Q: Why do ducks have big flat feet? A: To stam...,q ducks big flat feet stamp forest fires q ele...,136,28,0,0,0,-0.0125,11,5,0,4,0
2272,7575,outbreak,Dubai,Families to sue over Legionnaires: More than 4...,families sue legionnaires 40 families affected...,136,18,0,0,1,0.5,7,2,0,2,0


## TF-IDF Vectorization
- Convert the cleaned text data into numerical features using TF-IDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=10000)

# Fit and transform the training data
X_train_tfidf = tfidf.fit_transform(train_df['clean_text'])

# Transform the test data
X_test_tfidf = tfidf.transform(test_df['clean_text'])

## BERT Embeddings
- Generate BERT embeddings for the text data

In [19]:
from transformers import BertModel, BertTokenizer
import torch

bert_path = os.path.join(os.getcwd(), 'bert-base-uncased')

#load tokenizer and BERT model
tokenizer = BertTokenizer.from_pretrained(bert_path, local_files_only=True)
bert_model = BertModel.from_pretrained(bert_path, local_files_only=True)

#tokenize and encode the text
# Tokenize and encode the text
def get_bert_embeddings(text_list):
   inputs = tokenizer(text_list, return_tensors='pt', padding=True, truncation=True, max_length=512)
   with torch.no_grad():
      outputs = bert_model(**inputs)
   return outputs.last_hidden_state[:, 0, :].numpy()

# Get BERT embeddings for train and test data
X_train_bert = get_bert_embeddings(train_df['clean_text'].tolist())
X_test_bert = get_bert_embeddings(test_df['clean_text'].tolist())



In [21]:
X_train_combined = np.hstack((X_train_tfidf.toarray(), X_train_bert, 
                              train_df[['text_length', 'word_count', 'hashtag_count', 'mention_count', 'has_url', 
                                       'sentiment_score', 'noun_count', 'verb_count', 'adverb_count', 'adjective_count', 'profanity_count']].values))

X_test_combined = np.hstack((X_test_tfidf.toarray(), X_test_bert, 
                              test_df[['text_length', 'word_count', 'hashtag_count', 'mention_count', 'has_url', 
                                       'sentiment_score', 'noun_count', 'verb_count', 'adverb_count', 'adjective_count', 'profanity_count']].values))

## Model Selection
- here we test a variety of models and choose a few to fine tune based on classification report 

In [22]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train_combined, train_df['target'], test_size=0.2, random_state=42)

# Standardize the data for models that require it
scaler = StandardScaler()
X_train_split_scaled = scaler.fit_transform(X_train_split)
X_val_split_scaled = scaler.transform(X_val_split)

models = {
   'Logistic Regression' : LogisticRegression(max_iter=1000),
   'SVM' : SVC(),
   'Random Forest' : RandomForestClassifier(random_state=42),
   'KNN' : KNeighborsClassifier(),
   'Gradient Boosting' : GradientBoostingClassifier(),
   'Naive Bayes' : GaussianNB(),
   'Neural Network' : MLPClassifier(max_iter=1000)
}

# Evaluate each model
for name, model in models.items():
   if name in ['Logistic Regression', 'SVM', 'Neural Network']:
      model.fit(X_train_split_scaled, y_train_split)
      y_pred = model.predict(X_val_split_scaled)
   else:
      model.fit(X_train_split, y_train_split)
      y_pred = model.predict(X_val_split)
   
   print(f"Model: {name}")
   print(classification_report(y_val_split, y_pred))
   print('-' * 60)

Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.79      0.78      0.78       874
           1       0.71      0.71      0.71       649

    accuracy                           0.75      1523
   macro avg       0.75      0.75      0.75      1523
weighted avg       0.75      0.75      0.75      1523

------------------------------------------------------------
Model: SVM
              precision    recall  f1-score   support

           0       0.78      0.93      0.85       874
           1       0.87      0.65      0.74       649

    accuracy                           0.81      1523
   macro avg       0.83      0.79      0.80      1523
weighted avg       0.82      0.81      0.80      1523

------------------------------------------------------------
Model: Random Forest
              precision    recall  f1-score   support

           0       0.78      0.87      0.82       874
           1       0.79      0.66      0.72       649

 

## Hyperparameter Tuning

In [23]:
hyperparameter_grid = {
   ('Gradient Boosting' , models.get('Gradient Boosting')) : {'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7], 'subsample': [0.8, 0.9, 1.0]},
   ('Random Forest', models.get('Random Forest')) : {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
   ('Neural Network', models.get('Neural Network')) : { 'hidden_layer_sizes': [(50,), (100,), (100, 50)], 'activation': ['tanh', 'relu'], 'solver': ['sgd', 'adam'], 'alpha': [0.0001, 0.001, 0.01], 'learning_rate': ['constant', 'adaptive']},
   ('SVM', models.get('SVM')) : {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']}
}

for grid in hyperparameter_grid.keys():
   grid_search = GridSearchCV(grid[1], hyperparameter_grid[grid], cv=5, scoring='accuracy', n_jobs=-1)
   if grid[0] in ['Neural Network', 'SVM']:
      grid_search.fit(X_train_split_scaled, y_train_split)
   else:
      grid_search.fit(X_train_split, y_train_split)
   print(f"Best Parameters for {grid[0]}:", grid_search.best_params_)

KeyboardInterrupt: 