**Approach - <br>
a) Bag of words model using CountVectorizer<br>
b) Naive Bayes classification**

In [1]:
import regex as re
import string

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.naive_bayes import BernoulliNB

In [2]:
import nltk
# Fetch the NLTK stopword corpus; preprocess_text reads it via stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Data Reading and Understanding**

In [3]:
# Load the training tweets (with their target labels) and preview the first rows.
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Column dtypes and non-null counts, to see which columns have missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
# Share of each target class (normalize=True yields proportions instead of counts).
train_df['target'].value_counts(normalize=True)

0    0.57034
1    0.42966
Name: target, dtype: float64

**The classes are reasonably balanced (about 57% vs. 43%)**

**Checking whether the `keyword` and `location` features are useful and should be retained**

In [6]:
# Keyword alongside the tweet text, restricted to rows that actually have a keyword.
train_df.loc[train_df['keyword'].notna(), ['keyword', 'text']]

Unnamed: 0,keyword,text
31,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...
32,ablaze,We always try to bring the heavy. #metal #RT h...
33,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...
34,ablaze,Crying out for more! Set me ablaze
35,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...
...,...,...
7578,wrecked,@jt_ruff23 @cameronhacker and I wrecked you both
7579,wrecked,Three days off from work and they've pretty mu...
7580,wrecked,#FX #forex #trading Cramer: Iger's 3 words tha...
7581,wrecked,@engineshed Great atmosphere at the British Li...


In [7]:
# Number of distinct location values; dropna=False counts the missing value as one of them.
train_df['location'].nunique(dropna=False)

3342

**Dropping the features: keyword and location**

In [8]:
# Remove the two columns that will not be used as features.
# Rebinding the name (rather than inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op instead of a
# KeyError on the already-removed columns.
train_df = train_df.drop(columns=['keyword', 'location'], errors='ignore')

**Data Cleaning and Preprocessing**

In [9]:
# Shared NLP helpers used by preprocess_text: a tweet-aware tokenizer and an English Snowball stemmer.
tokenizer = TweetTokenizer()
stemmer = SnowballStemmer(language='english')

In [10]:
# Cache for the stopword set, filled on first use by _english_stopwords().
_STOPWORD_CACHE = {}


def _english_stopwords():
  """Return the NLTK English stopwords as a frozenset, loading them only once."""
  if 'english' not in _STOPWORD_CACHE:
    _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOPWORD_CACHE['english']


def preprocess_text(text):
  """Normalise a raw tweet into a space-joined string of stemmed tokens.

  Steps, in order:
    1. lowercase the text;
    2. remove hashtags (``#...``), mentions (``@...``), ``<...>`` spans and
       anything starting with ``http``;
    3. tokenise with the module-level ``tokenizer``;
    4. drop stopwords and punctuation tokens, and stem what remains with the
       module-level ``stemmer``.

  Args:
    text: the raw tweet text (a ``str``).

  Returns:
    The cleaned tokens joined by single spaces; an empty string when no
    token survives the filtering.
  """
  cleaned_text = text.lower()

  # remove hashtags, @mentions, <...> spans and web addresses
  cleaned_text = re.sub(r'#\S+|@\S+|<.*?>|http\S+', '', cleaned_text)

  tokens = tokenizer.tokenize(cleaned_text)

  # Look the stopwords up once per call (and load them once overall) instead of
  # calling stopwords.words('english') for every single token.
  stop_words = _english_stopwords()

  # NOTE: `word not in string.punctuation` is a substring test against the
  # punctuation string, so it filters single punctuation marks (and any run
  # that happens to occur contiguously in that string). Kept unchanged so the
  # cleaned text is identical to what the original code produced.
  kept = [
      stemmer.stem(word)
      for word in tokens
      if word not in string.punctuation and word not in stop_words
  ]

  return ' '.join(kept)

In [11]:
# Run every training tweet through preprocess_text and keep the result as a new column.
train_df['cleaned_text'] = train_df['text'].map(preprocess_text)

In [12]:
# Spot-check the first five cleaned tweets.
train_df['cleaned_text'].head(5)

0                      deed reason may allah forgiv us
1                 forest fire near la rong sask canada
2    resid ask shelter place notifi offic evacu she...
3           13,000 peopl receiv evacu order california
4                got sent photo rubi smoke pour school
Name: cleaned_text, dtype: object

In [13]:
# Generate the bag-of-words matrix with CountVectorizer, capped at 5000 vocabulary terms.
# X is kept as the sparse matrix returned by fit_transform rather than being
# expanded with .toarray(): a dense (n_tweets, 5000) count array is almost all
# zeros, and train_test_split and BernoulliNB both accept sparse input.
# NOTE(review): the vocabulary is fitted on every labelled tweet before the
# train/validation split, so validation text influences the feature set;
# consider fitting the vectorizer on the training split only.
vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(train_df['cleaned_text'])
y = train_df['target']

**Modelling**

In [14]:
# Hold out 30% of the labelled tweets for validation; stratify on y so both
# parts keep the same class proportions, and fix the seed for repeatability.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=100,
)

In [15]:
# Report the size of each split (the "test" label here refers to the held-out validation part).
for label, features in (("train: ", X_train), ("test: ", X_val)):
    print(label, features.shape)

train:  (5329, 5000)
test:  (2284, 5000)


In [16]:
# Fit a Bernoulli Naive Bayes classifier (additive smoothing alpha=1.0,
# class priors learned from the training labels).
bnb = BernoulliNB(alpha=1.0, fit_prior=True).fit(X_train, y_train)

# Predictions on the data the model was fitted on and on the held-out split.
y_train_pred = bnb.predict(X_train)
y_val_pred = bnb.predict(X_val)

# F1 on both splits; the gap between them indicates how much the model overfits.
for label, y_true, y_pred in (
    ("train score: ", y_train, y_train_pred),
    ("test score: ", y_val, y_val_pred),
):
    print(label, f1_score(y_true, y_pred))

train score:  0.8388478933587241
test score:  0.7401224262659989


**Prediction on test set to be evaluated**

In [17]:
# Load the competition test tweets, then show the first rows and the frame's dimensions.
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
print(test_df.head())
print(test_df.shape)

   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan
(3263, 4)


In [18]:
# Remove the same two unused columns that were dropped from the training frame.
# Rebinding the name (rather than inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution is a no-op instead of a
# KeyError on the already-removed columns.
test_df = test_df.drop(columns=['keyword', 'location'], errors='ignore')

In [19]:
# Clean the test tweets with the same preprocessing used for training.
test_df['cleaned_text'] = test_df['text'].map(preprocess_text)

# Encode with the already-fitted vocabulary (transform only, no refit) and predict.
test_X = vectorizer.transform(test_df['cleaned_text'])
test_y = bnb.predict(test_X)

In [20]:
# Pair each test tweet id with its predicted label and preview the result.
submission = pd.DataFrame({'id': test_df['id'], 'target': test_y})
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


In [21]:
# Write the predictions to submission.csv in the working directory, without the index column.
submission.to_csv('./submission.csv', index=False)