# Initial Data Processing

## Installing of Necessary Versions from Imports

In [1]:
pip install -r requirements.txt

--- Logging error ---
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pip/_internal/utils/logging.py", line 177, in emit
    self.console.print(renderable, overflow="ignore", crop=False, style=style)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pip/_vendor/rich/console.py", line 1673, in print
    extend(render(renderable, render_options))
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pip/_vendor/rich/console.py", line 1305, in render
    for render_output in iter_render:
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pip/_internal/utils/logging.py", line 134, in __rich_console__
    for line in lines:
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/pip/_vendor/rich/segment.py", line 249, in split_lines
    for segment in segments:
  File "/Library/

## Loading of Data

In [2]:
# For Basic Data Handling and Manipulation
import pandas as pd
import numpy as np

# For Text Processing
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import urllib.parse
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')


# For Neural Networks 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# For Transformers and Pre-Trained Models
from transformers import BertTokenizer, TFBertModel, BertConfig

# For Model Evaluation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# For Data Visualization
import matplotlib.pyplot as plt  

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/phoenix/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/phoenix/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/phoenix/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/phoenix/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Loading the Training and Testing Data and Data Processing

In [3]:
# Load dataset
train = pd.read_csv("./nlp-getting-started/train.csv") 

test = pd.read_csv("./nlp-getting-started/test.csv")

# Drop N/A values
train.dropna(inplace=True)

test.dropna(inplace=True)

# Get total count of rows
train.count()

### Current count is 5080 post removal of N/A


id          5080
keyword     5080
location    5080
text        5080
target      5080
dtype: int64

In [4]:
# Function to clean the text

def clean_text(text):

    # Make text in lower case
    text = text.lower()

    # Remove Punctuation
    text = text.translate(str.maketrans('', '', string.punctuation.replace("'", "")))  
    # arguments of str.maketrans(to_be_replaced, replacing, to_be_deleted)
    # string.punctuation is a predefined function that contains all punctuations like !, #, ;, etc.
    # the .replace for the same is used to remove the apostrophe from the list of punctuations.
    # So, in the end text will 1) have nothing to replace, 2) replace with nothing, 3) delete all punctuations except "'".

    # Tokenizing the text
    tokens = word_tokenize(text)

    # Removing the stop words
    tokens = [word for word in tokens if word not in stopwords.words('english')]

    # Choosing between stemming and lemmatizing based on case, here Lemmatizing will be better.
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens) # used to get a single string of tokens separated by spaces 


In [5]:
print(train.columns)

print(train[:5])

#printing rows

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')
    id keyword                       location  \
31  48  ablaze                     Birmingham   
32  49  ablaze  Est. September 2012 - Bristol   
33  50  ablaze                         AFRICA   
34  52  ablaze               Philadelphia, PA   
35  53  ablaze                     London, UK   

                                                 text  target  
31  @bbcmtd Wholesale Markets ablaze http://t.co/l...       1  
32  We always try to bring the heavy. #metal #RT h...       0  
33  #AFRICANBAZE: Breaking news:Nigeria flag set a...       1  
34                 Crying out for more! Set me ablaze       0  
35  On plus side LOOK AT THE SKY LAST NIGHT IT WAS...       0  


In [6]:
# Apply cleaning function to all of the necessary columns
train['keyword_new'] = train['keyword'].apply(clean_text)
test['keyword_new'] = test['keyword'].apply(clean_text)

train['location_new'] = train['location'].apply(clean_text)
test['location_new'] = test['location'].apply(clean_text)

train['text_new'] = train['text'].apply(clean_text)
test['text_new'] = test['text'].apply(clean_text)


# Decoding the keywords
train['keyword'] = train['keyword'].apply(urllib.parse.unquote)
test['keyword'] = test['keyword'].apply(urllib.parse.unquote)


# Prepare features and target for modeling
features = train[['keyword_new', 'location_new', 'text_new']]  
target = train['target']

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.2, random_state=42)


In [7]:
print(train.columns)

print(train[:5])

# printing rows

Index(['id', 'keyword', 'location', 'text', 'target', 'keyword_new',
       'location_new', 'text_new'],
      dtype='object')
    id keyword                       location  \
31  48  ablaze                     Birmingham   
32  49  ablaze  Est. September 2012 - Bristol   
33  50  ablaze                         AFRICA   
34  52  ablaze               Philadelphia, PA   
35  53  ablaze                     London, UK   

                                                 text  target keyword_new  \
31  @bbcmtd Wholesale Markets ablaze http://t.co/l...       1      ablaze   
32  We always try to bring the heavy. #metal #RT h...       0      ablaze   
33  #AFRICANBAZE: Breaking news:Nigeria flag set a...       1      ablaze   
34                 Crying out for more! Set me ablaze       0      ablaze   
35  On plus side LOOK AT THE SKY LAST NIGHT IT WAS...       0      ablaze   

                  location_new  \
31                  birmingham   
32  est september 2012 bristol   
33           

# Embedding Methods

## TF-IDF

In [8]:
vectorizer = TfidfVectorizer(max_features=400)
# Initializing the vectorizer with max_features set to 1) 400

# Combine all text data into one corpus for fitting
combined_train_text = X_train['keyword_new'] + ' ' + X_train['location_new'] + ' ' + X_train['text_new']
vectorizer.fit(combined_train_text)  # Fit once on the combined corpus

# Transform each set separately without refitting
tfidf_k_train = vectorizer.transform(X_train['keyword_new']).toarray()
tfidf_l_train = vectorizer.transform(X_train['location_new']).toarray()
tfidf_t_train = vectorizer.transform(X_train['text_new']).toarray()

tfidf_k_val = vectorizer.transform(X_val['keyword_new']).toarray()
tfidf_l_val = vectorizer.transform(X_val['location_new']).toarray()
tfidf_t_val = vectorizer.transform(X_val['text_new']).toarray()

tfidf_k_test = vectorizer.transform(test['keyword_new']).toarray()
tfidf_l_test = vectorizer.transform(test['location_new']).toarray()
tfidf_t_test = vectorizer.transform(test['text_new']).toarray()

# Concatenate transformed data for each set
X_train_tfidf = np.concatenate((tfidf_k_train, tfidf_l_train, tfidf_t_train), axis=1)
X_val_tfidf = np.concatenate((tfidf_k_val, tfidf_l_val, tfidf_t_val), axis=1)
X_test_tfidf = np.concatenate((tfidf_k_test, tfidf_l_test, tfidf_t_test), axis=1)

In [9]:
# Check unique values in the 'keyword' column
unique_keywords = train['keyword'].unique()

# Count of unique keywords
unique_keyword_count = len(unique_keywords)

print(f"Number of unique keywords: {unique_keyword_count}")


Number of unique keywords: 221


In [10]:
# Print unique keywords
print(unique_keywords)

# Frequency of each keyword
keyword_frequency = train['keyword'].value_counts()
print(keyword_frequency)


['ablaze' 'accident' 'aftershock' 'airplane accident' 'ambulance'
 'annihilated' 'annihilation' 'apocalypse' 'armageddon' 'army' 'arson'
 'arsonist' 'attack' 'attacked' 'avalanche' 'battle' 'bioterror'
 'bioterrorism' 'blaze' 'blazing' 'bleeding' 'blew up' 'blight' 'blizzard'
 'blood' 'bloody' 'blown up' 'body bag' 'body bagging' 'body bags' 'bomb'
 'bombed' 'bombing' 'bridge collapse' 'buildings burning'
 'buildings on fire' 'burned' 'burning' 'burning buildings' 'bush fires'
 'casualties' 'casualty' 'catastrophe' 'catastrophic' 'chemical emergency'
 'cliff fall' 'collapse' 'collapsed' 'collide' 'collided' 'collision'
 'crash' 'crashed' 'crush' 'crushed' 'curfew' 'cyclone' 'damage' 'danger'
 'dead' 'death' 'deaths' 'debris' 'deluge' 'deluged' 'demolish'
 'demolished' 'demolition' 'derail' 'derailed' 'derailment' 'desolate'
 'desolation' 'destroy' 'destroyed' 'destruction' 'detonate' 'detonation'
 'devastated' 'devastation' 'disaster' 'displaced' 'drought' 'drown'
 'drowned' 'drowning'

## Word2Vec

## BERT

# Model Training and Evaluation

## Logistic Regression

### With TF-IDF

In [11]:
# Concatenating the training, validation and test TF-IDF vectors for the features that had TF-IDF embeddings 
X_train_tfidf = np.concatenate((tfidf_k_train, tfidf_l_train, tfidf_t_train), axis=1)
X_test_tfidf = np.concatenate((tfidf_k_test, tfidf_l_test, tfidf_t_test), axis=1)
X_val_tfidf = np.concatenate((tfidf_k_val, tfidf_l_val, tfidf_t_val), axis=1)

# Implementing a new Log Regression Model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predict on the test set
predictions = model.predict(X_val_tfidf)

# Evaluate the model
print("Accuracy:", accuracy_score(y_val, predictions))
print(classification_report(y_val, predictions))

Accuracy: 0.7667322834645669
              precision    recall  f1-score   support

           0       0.78      0.83      0.80       580
           1       0.75      0.69      0.72       436

    accuracy                           0.77      1016
   macro avg       0.76      0.76      0.76      1016
weighted avg       0.77      0.77      0.77      1016



## Naive Bayes

### With TF-IDF

In [12]:
# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_train_tfidf, y_train)

# Predict on the validation set
nb_predictions = nb_classifier.predict(X_val_tfidf)

# Evaluate the model
print("Naive Bayes Accuracy:", accuracy_score(y_val, nb_predictions))
print(classification_report(y_val, nb_predictions))

Naive Bayes Accuracy: 0.7667322834645669
              precision    recall  f1-score   support

           0       0.79      0.80      0.80       580
           1       0.73      0.72      0.73       436

    accuracy                           0.77      1016
   macro avg       0.76      0.76      0.76      1016
weighted avg       0.77      0.77      0.77      1016

