*Update V3: Fixed the calculation of the LOGPRIOR value in the function `create_naive_bayes_map`.*

In [1]:
import numpy as np
import pandas as pd 
import tqdm.notebook as tqdm
import re
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords         
from nltk.stem import PorterStemmer  

In [2]:
# Register the notebook progress bar with pandas; this is what makes
# `progress_apply` (used in process_train_data) available.
tqdm.tqdm_notebook.pandas()
# Show full cell contents (e.g. whole tweet texts) when displaying DataFrames.
pd.set_option('display.max_colwidth', None)

In [3]:
# English stop-word list from NLTK; tokens found in it are dropped by
# remove_stop_words_and_puntuation.
STOP_WORDS = stopwords.words('english') 
# Punctuation to drop. Membership is tested with `in` against this string, so
# any contiguous substring of it matches as well. Note that '!' is not included.
PUNCTUATIONS = '"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

## Load Data
***

In [4]:
# Read the train and test CSV files from the notebook's input directory.
train_data = pd.read_csv('../input/nlp-getting-started/train.csv')
test_data = pd.read_csv('../input/nlp-getting-started/test.csv')

In [5]:
# Preview the first rows of the training data.
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are being notified by officers. No other evacuation or shelter in place orders are expected,1
3,6,,,"13,000 people receive #wildfires evacuation orders in California",1
4,7,,,Just got sent this photo from Ruby #Alaska as smoke from #wildfires pours into a school,1


In [6]:
def clean_text(text):
    """Strip tweet-specific noise from a raw string.

    Three substitutions are applied in order: a leading ``RT`` marker followed
    by whitespace is dropped, ``http``/``https`` URLs are removed, and every
    ``#`` character is deleted (the hashtag word itself is kept).
    """
    removal_patterns = (
        r'^RT[\s]+',
        r'https?://[^\s\n\r]+',
        r'#',
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text

def remove_stop_words_and_puntuation(tokenized_text):
    """Return the tokens that are neither punctuation nor stop words.

    A token is discarded when it is found in ``PUNCTUATIONS`` or in
    ``STOP_WORDS``. ``PUNCTUATIONS`` is a string, so that check is a substring
    test. The surviving tokens are returned in a new list, order preserved.
    """
    return [
        token
        for token in tokenized_text
        if not (token in PUNCTUATIONS or token in STOP_WORDS)
    ]
    
def stemm_text(tokenized_text):
    """Return a new list holding the Porter stem of every token, in order."""
    porter = PorterStemmer()
    return [porter.stem(token) for token in tokenized_text]

def process_text(text):
    """Turn a raw text into a list of cleaned, stemmed tokens.

    Pipeline: ``clean_text`` -> ``TweetTokenizer`` (lower-cases, strips
    handles, shortens repeated characters) -> stop-word/punctuation removal
    -> Porter stemming.
    """
    cleaned = clean_text(text)
    tweet_tokenizer = TweetTokenizer(
        preserve_case=False,
        strip_handles=True,
        reduce_len=True,
    )
    tokens = tweet_tokenizer.tokenize(cleaned)
    return stemm_text(remove_stop_words_and_puntuation(tokens))

def create_frequency_map(data: pd.DataFrame, process_text_enabled = False):
    """Count how often each word occurs under each target label.

    Parameters
    ----------
    data : pd.DataFrame
        Must provide a ``text`` and a ``target`` column.
    process_text_enabled : bool
        When ``True`` every ``text`` value is first run through
        ``process_text``. Otherwise the value is iterated as-is, so it is
        expected to already be a sequence of tokens (iterating a raw string
        would yield single characters).

    Returns
    -------
    dict
        ``{(word.lower(), target): count}``.
    """
    frequency_map = {}

    # Walk the column values pairwise instead of looking rows up by 0..n-1 in
    # ``data.to_dict()``: that dict is keyed by index label, so positional
    # lookups raise KeyError on a filtered/split frame and silently collapse
    # rows that share an index label.
    rows = zip(data["text"], data["target"])
    for text, target in tqdm.tqdm_notebook(rows, total=len(data)):
        if process_text_enabled == True:
            words = process_text(text)
        else:
            words = text

        for word in words:
            key = (word.lower(), target)
            frequency_map[key] = frequency_map.get(key, 0) + 1

    return frequency_map
    
def process_train_data(train_data : pd.DataFrame):
    """Return a copy of ``train_data`` with an added ``processed_text`` column.

    The new column holds ``process_text`` applied to every ``text`` value; the
    input frame itself is left untouched.
    """
    processed = train_data.copy()
    processed['processed_text'] = processed['text'].progress_apply(process_text)
    return processed

def create_naive_bayes_map(data : pd.DataFrame, process_text_enabled):
    """Build the per-word Naive Bayes table and the log prior.

    Parameters
    ----------
    data : pd.DataFrame
        Training frame; its ``target`` column is read here and the frame is
        forwarded to ``create_frequency_map``.
    process_text_enabled
        Forwarded unchanged to ``create_frequency_map``.

    Returns
    -------
    tuple
        ``(naive_bayes_map, log_prior)``. ``naive_bayes_map`` maps each word to
        a dict with the keys ``pos``, ``neg`` (raw relative frequencies),
        ``pos_smooth``, ``neg_smooth`` (Laplace-smoothed frequencies) and
        ``lambda`` (log of the smoothed ratio). ``log_prior`` is the log of
        (rows with target 1) / (all other rows).

    Two diagnostic lines with the totals and the column sums are printed.
    """
    data_dict = data.to_dict()
    word_class_counts = create_frequency_map(data, process_text_enabled)

    # Pass 1: word occurrences per class and the vocabulary.
    total_pos = 0.0
    total_neg = 0.0
    vocabulary = set()
    for (word, label), count in word_class_counts.items():
        if label == 1:
            total_pos += count
        else:
            total_neg += count
        vocabulary.add(word)
    total_unique_words = len(vocabulary)

    # Number of rows per class, needed for the prior.
    total_target_pos = 0
    total_target_neg = 0
    for label in data_dict['target'].values():
        if label == 1:
            total_target_pos += 1
        else:
            total_target_neg += 1

    # Pass 2: raw and smoothed frequencies for every word.
    naive_bayes_map = {}
    for (word, label), count in word_class_counts.items():
        if word not in naive_bayes_map:
            # A word unseen in one class keeps the smoothed value of a zero count.
            naive_bayes_map[word] = {
                'pos': 0.0,
                'neg': 0.0,
                'pos_smooth': 1 / (total_pos + total_unique_words),
                'neg_smooth': 1 / (total_neg + total_unique_words),
            }
        entry = naive_bayes_map[word]

        if label == 1:
            raw_key, smooth_key, class_total = 'pos', 'pos_smooth', total_pos
        else:
            raw_key, smooth_key, class_total = 'neg', 'neg_smooth', total_neg

        entry[raw_key] = count / class_total
        entry[smooth_key] = calculate_laplacian_smoothing(count, class_total, total_unique_words)

    for entry in naive_bayes_map.values():
        entry['lambda'] = np.log(entry['pos_smooth'] / entry['neg_smooth'])

    log_prior = np.log(total_target_pos/total_target_neg)

    # Diagnostics: each frequency column is accumulated in insertion order.
    sum_pos = 0
    sum_neg = 0
    sum_pos_smooth = 0
    sum_neg_smooth = 0
    for entry in naive_bayes_map.values():
        sum_pos += entry['pos']
        sum_neg += entry['neg']
        sum_pos_smooth += entry['pos_smooth']
        sum_neg_smooth += entry['neg_smooth']

    print(f'POS : {total_pos}, NEG :{total_neg}, Unique_words : {total_unique_words}, LOG_PRIOR : {log_prior}')
    print(f'SUM_POS : {sum_pos}, SUM_NEG : {sum_neg}, SUM_POS_SMOOTH : {sum_pos_smooth}, SUM_NEG_SMOOTH : {sum_neg_smooth}')

    return naive_bayes_map, log_prior
 
def calculate_laplacian_smoothing(freq_value, total , total_unique_words):
    """Add-one smoothed frequency: ``(freq_value + 1) / (total + total_unique_words)``."""
    numerator = freq_value + 1
    denominator = total + total_unique_words
    return numerator / denominator

def sigmoid(z):
    """Logistic function ``1 / (1 + e^(-z))``, evaluated with NumPy."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [7]:
def predict(text, process_text_enabled = True):
    """Score a text with the fitted Naive Bayes table.

    The result is ``LOG_PRIOR`` plus the ``lambda`` of every word that is
    present in ``NAIVE_BAYES_MAP``; words missing from the table add nothing.
    When ``process_text_enabled`` is ``True`` the text is first run through
    ``process_text``; otherwise it is iterated as given.
    """
    words = process_text(text) if process_text_enabled == True else text

    score = 0
    for word in words:
        if word not in NAIVE_BAYES_MAP:
            continue
        score += NAIVE_BAYES_MAP[word]['lambda']

    return score + LOG_PRIOR

## Create Naïve Bayes Map
***

In [8]:
# Fit on the training data; NAIVE_BAYES_MAP and LOG_PRIOR are the globals
# that predict() reads.
NAIVE_BAYES_MAP, LOG_PRIOR = create_naive_bayes_map(train_data, process_text_enabled=True)

  0%|          | 0/7613 [00:00<?, ?it/s]

POS : 33532.0, NEG :39255.0, Unique_words : 12458, LOG_PRIOR : -0.28323932289985326
SUM_POS : 0.9999999999998929, SUM_NEG : 0.9999999999998592, SUM_POS_SMOOTH : 1.000000000000214, SUM_NEG_SMOOTH : 1.0000000000001898


In [9]:
# Dict of dicts -> DataFrame: every word becomes a column, which is why the
# following cells transpose with .T to get one row per word.
df_naive_bayes_map = pd.DataFrame(NAIVE_BAYES_MAP)

In [10]:
# First few words with their frequencies and lambda.
df_naive_bayes_map.T.head()

Unnamed: 0,pos,neg,pos_smooth,neg_smooth,lambda
deed,3e-05,2.5e-05,4.3e-05,3.9e-05,0.117285
reason,0.000239,0.000586,0.000196,0.000464,-0.863544
earthquak,0.001402,0.000153,0.001044,0.000135,2.042576
may,0.001491,0.000968,0.001109,0.000754,0.385549
allah,0.000179,7.6e-05,0.000152,7.7e-05,0.676901


In [11]:
# Summary statistics of each column across all words.
df_naive_bayes_map.T.describe()

Unnamed: 0,pos,neg,pos_smooth,neg_smooth,lambda
count,12458.0,12458.0,12458.0,12458.0,12458.0
mean,8e-05,8e-05,8e-05,8e-05,-0.031458
std,0.000331,0.000324,0.000241,0.000246,0.878551
min,0.0,0.0,2.2e-05,1.9e-05,-2.715928
25%,0.0,0.0,2.2e-05,1.9e-05,-0.575862
50%,3e-05,2.5e-05,4.3e-05,3.9e-05,-0.575862
75%,3e-05,5.1e-05,4.3e-05,5.8e-05,0.810432
max,0.018818,0.020634,0.013742,0.015683,4.407745


## Words most likely to denote Disaster
***

In [12]:
# Largest lambda first: words whose smoothed frequency is highest in
# target == 1 rows relative to the other rows.
df_naive_bayes_map.T.sort_values(by=['lambda'], ascending=False).head(10)

Unnamed: 0,pos,neg,pos_smooth,neg_smooth,lambda
mh370,0.002147,0.0,0.001587,1.9e-05,4.407745
northern,0.001909,0.0,0.001413,1.9e-05,4.291672
legionnair,0.001819,0.0,0.001348,1.9e-05,4.24442
debri,0.001491,0.0,0.001109,1.9e-05,4.049111
migrant,0.001431,0.0,0.001065,1.9e-05,4.009106
hiroshima,0.002744,2.5e-05,0.002022,3.9e-05,3.956738
mosqu,0.001044,0.0,0.000783,1.9e-05,3.700804
pkk,0.000924,0.0,0.000696,1.9e-05,3.583021
bomber,0.001819,2.5e-05,0.001348,3.9e-05,3.551272
16yr,0.000835,0.0,0.000631,1.9e-05,3.484581


## Words least likely to denote Disaster
***

In [13]:
# Smallest lambda first: words whose smoothed frequency is lowest in
# target == 1 rows relative to the other rows.
df_naive_bayes_map.T.sort_values(by=['lambda'], ascending=True).head(10)

Unnamed: 0,pos,neg,pos_smooth,neg_smooth,lambda
career,0.0,0.000408,2.2e-05,0.000329,-2.715928
lmao,0.0,0.000408,2.2e-05,0.000329,-2.715928
bag,0.000179,0.002828,0.000152,0.002166,-2.655304
charact,0.0,0.000357,2.2e-05,0.00029,-2.590765
ebay,3e-05,0.000739,4.3e-05,0.00058,-2.590765
loui,0.0,0.000306,2.2e-05,0.000251,-2.447664
ticket,0.0,0.000306,2.2e-05,0.000251,-2.447664
handbag,3e-05,0.000611,4.3e-05,0.000483,-2.408443
cake,3e-05,0.000586,4.3e-05,0.000464,-2.367621
welcom,0.0,0.00028,2.2e-05,0.000232,-2.367621


## Test
***

In [14]:
# Sanity check on a single training example.
idx = 15
target = train_data['target'][idx]
text = train_data['text'][idx]
# Despite its name, `probability` holds the raw score returned by predict()
# (summed word lambdas plus LOG_PRIOR); sigmoid() is applied when printing.
probability = predict(text)

print(f'{target} : {sigmoid(probability)} - {text}')

0 : 0.20333914769043357 - What's up man?


## Submission
***

In [15]:
def create_submission(data : pd.DataFrame):
    """Predict a 0/1 target for every row of ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Must provide an ``id`` and a ``text`` column.

    Returns
    -------
    dict
        ``{'id': [...], 'target': [...]}`` with one entry per row, in row
        order. A row is labelled 1 when ``sigmoid(predict(text)) >= 0.5``,
        otherwise 0.
    """
    submission = {'id': [], 'target' :[]}

    # Read from the `data` argument (the previous version ignored it and always
    # used the global `test_data`). Iterating the column values directly also
    # avoids positional lookups that fail on a non-default index.
    rows = zip(data["id"], data["text"])
    for row_id, text in tqdm.tqdm_notebook(rows, total=len(data)):
        submission['id'].append(row_id)

        pred = 1 if sigmoid(predict(text)) >= 0.5 else 0

        submission['target'].append(pred)

    return submission

In [16]:
# Predict a label for every row of the test data.
submission = create_submission(test_data)

  0%|          | 0/3263 [00:00<?, ?it/s]

In [17]:
# Write the predictions without the DataFrame index, so the file contains
# only the 'id' and 'target' columns.
df_submission = pd.DataFrame(submission)
df_submission.to_csv('submission.csv', index = False)

In [18]:
# Display the submission frame.
df_submission

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
