**NLTK Naive Bayes Classifier using SnowballStemmer**

In [1]:
import regex as re
import string
import random

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import f1_score

import spacy

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

**Data Reading and Understanding**

In [3]:
train_df = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
print (train_df.info())
print ("# of training records: ", len(train_df))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None
# of training records:  7613


In [5]:
train_df['target'].value_counts(normalize=True)

0    0.57034
1    0.42966
Name: target, dtype: float64

In [6]:
test_df = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


**Checking- if features keyword and location are useful and should be retained**

In [7]:
train_df[~train_df['keyword'].isnull()][['keyword', 'text']]

Unnamed: 0,keyword,text
31,ablaze,@bbcmtd Wholesale Markets ablaze http://t.co/l...
32,ablaze,We always try to bring the heavy. #metal #RT h...
33,ablaze,#AFRICANBAZE: Breaking news:Nigeria flag set a...
34,ablaze,Crying out for more! Set me ablaze
35,ablaze,On plus side LOOK AT THE SKY LAST NIGHT IT WAS...
...,...,...
7578,wrecked,@jt_ruff23 @cameronhacker and I wrecked you both
7579,wrecked,Three days off from work and they've pretty mu...
7580,wrecked,#FX #forex #trading Cramer: Iger's 3 words tha...
7581,wrecked,@engineshed Great atmosphere at the British Li...


In [8]:
len(train_df['location'].unique())

3342

**Dropping the features: keyword and location**

In [9]:
train_df.drop(columns=['keyword', 'location'], inplace=True)

**Data Cleaning and Preprocessing**

In [10]:
tokenizer = TweetTokenizer()
spacy_en = spacy.load('en_core_web_sm', disable=['ner', 'parser'])

In [11]:
def preprocess_text(text):
  cleaned_text = text.lower()
  
  #remove words like hashtags, web addresses
  cleaned_text = re.sub(r'#\S+|@\S+|<.*?>|http\S+|[0-9,.]+\S+', '', cleaned_text)

  cleaned_text = tokenizer.tokenize(cleaned_text)
  cleaned_text = ' '.join([word for word in cleaned_text if word not in string.punctuation and word not in stopwords.words('english')])

  doc = spacy_en(cleaned_text)
  cleaned_text = [token.lemma_ for token in doc if token.lemma_ not in ['-PRON-'] and token.pos_ not in ['DET','-PRON']]
  return cleaned_text

In [12]:
train_df['cleaned_text'] = train_df.apply(lambda row : (preprocess_text(row['text']), row['target']), axis=1)

In [13]:
train_tweets = train_df['cleaned_text'].map(lambda elem: elem[0])

In [14]:
def get_all_words(all_tweets):
  all_words = []
  for elem in all_tweets:
    all_words.extend(elem)
  return all_words

def get_word_features(all_words):
  word_count = nltk.FreqDist(all_words)
  word_features = word_count.keys()
  return word_features

vocab = get_word_features(get_all_words(train_tweets))
print("Total unique features: ", len(vocab))

Total unique features:  11090


In [15]:
def extract_features(tweets):
  features = {}
  tweet_words = set(tweets)
  for word in vocab:
    features['contains(%s)' % word] = (word in tweet_words)
  return features

**Modelling**

In [16]:
slice_index = int(len(train_df) * 0.8)
messages_set = train_df['cleaned_text']
random.shuffle(messages_set)

train_messages, val_messages = messages_set[0:slice_index].tolist(), messages_set[slice_index:].tolist()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[i], x[j] = x[j], x[i]


In [17]:
training_set = nltk.classify.apply_features(extract_features, train_messages)
validation_set = nltk.classify.apply_features(extract_features, val_messages)

In [18]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Train accuracy: " , nltk.classify.accuracy(classifier, training_set))
print("Validation accuracy: ", nltk.classify.accuracy(classifier, validation_set))

Train accuracy:  0.9047619047619048
Validation accuracy:  0.7997373604727511


**Training on full messages set to be used for test prediction**

In [19]:
full_train_messages = messages_set.tolist()
full_training_set = nltk.classify.apply_features(extract_features, full_train_messages)
classifier = nltk.NaiveBayesClassifier.train(full_training_set)

**Prediction on test set to be evaluated**

In [20]:
test_df.drop(columns=['keyword', 'location'], inplace=True)
test_df['cleaned_text'] = test_df.apply(lambda row:preprocess_text(row['text']), axis=1)
test_df['cleaned_text'][0:5]

0                       [happen, terrible, car, crash]
1        [hear, different, city, stay, safe, everyone]
2    [forest, fire, spot, pond, geese, flee, across...
3                               [apocalypse, lighting]
4             [typhoon, soudelor, kill, china, taiwan]
Name: cleaned_text, dtype: object

In [21]:
test_predictions = test_df['cleaned_text'].apply(lambda tweet: classifier.classify(extract_features(tweet)))

In [22]:
test_predictions[0:5]

0    1
1    0
2    1
3    0
4    1
Name: cleaned_text, dtype: int64

In [23]:
submission = pd.DataFrame({'id':test_df.id, 'target':test_predictions})
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


In [24]:
submission.to_csv('./submission.csv', index=False)