# Multinomial Naive Bayes - Comment Verification / Spam Detector

In [1]:
import pandas as pd
from collections import defaultdict

# import the library for stemming Persian words, called PersianStemmer
# install with : pip install PersianStemmer
#more details : https://pypi.org/project/PersianStemmer/
from PersianStemmer import PersianStemmer 


## init variables:

In [None]:
# Read the Persian stop-word list from stopwords-fa.txt (UTF-8 text, one
# entry per line). The first line of the file is skipped.
fileAddress = './bin/persian-stopwords/stopwords-fa.txt'
with open(fileAddress, encoding='utf-8') as file:
    sw = file.read().splitlines()[1:]

In [None]:
# Create the PersianStemmer instance (bound to `ps`) that stemming() calls
# to stem individual words.
ps = PersianStemmer()

In [None]:
# Read the training data (labelled entries) from a CSV file.
# Later cells use its 'title', 'comment' and 'verification_status' columns.
data= pd.read_csv('./bin/train.csv')

In [5]:
# Read the test data (entries to be classified) from a CSV file.
# Later cells use its 'id', 'title' and 'comment' columns.
test= pd.read_csv('./bin/test.csv')

## function definitions:

###### remove punctuations

In [6]:
def remove_punctuation(text):
    """Return ``text`` with every punctuation character replaced by a space.

    ``text`` is converted with ``str()`` first, so non-string values are
    accepted. Both the ASCII punctuation in ``string.punctuation`` and the
    extra (mostly Persian/Arabic) marks listed below are replaced. Each
    mark becomes exactly one space, so the length of the text is unchanged.
    """
    import string

    # The backslash must be written as "\\": a bare "\}" is an invalid
    # escape sequence, which newer Python versions warn about.
    extra_marks = '.،!"#$%&|×/:؛,][\\}{«»<>؟'
    table = str.maketrans(dict.fromkeys(string.punctuation + extra_marks, " "))
    return str(text).translate(table)

##### remove stopwords

In [7]:
def stopwords(text):
    """Return ``text`` with every word found in ``sw`` dropped.

    The text is split on whitespace, words that appear in the module-level
    stop-word list ``sw`` are discarded, and the remaining words are joined
    back together with single spaces. No case conversion is performed.
    """
    kept = []
    for token in text.split():
        if token in sw:
            continue
        kept.append(token)
    return " ".join(kept)

###### count(word,class)


In [8]:
def count_wc(word,c):
    """Return how many times ``word`` was counted for class ``c``.

    Classes: spam = 1, notspam = 0. The count is looked up in ``spam_vocab``
    (class 1) or ``not_spam_vocab`` (class 0); any other class value gives 0.
    """
    if c == 1:
        return spam_vocab[word]
    if c == 0:
        return not_spam_vocab[word]
    return 0

###### conditional_Prob ==> P ( word | class )

In [2]:
def cond_prob (word, c):
    """Return the smoothed conditional probability P(word | class).

    The value is computed as

        (count of the word in the class + 1)
        / (number of distinct words in the training data
           + count of all words in the class)

    The +1 in the numerator is add-one smoothing, so a word that was never
    counted for the class still gets a non-zero probability.

    Classes: spam = 1, notspam = 0. Any other class value returns 0.
    """
    if c == 0:
        class_total = notspam_word_count
    elif c == 1:
        class_total = spam_word_count
    else:
        return 0
    return (count_wc(word, c) + 1) / (word_count + class_total)

###### status detector (verifying comment or not)

In [10]:
def status_detector(comment):
    """Classify a comment: return 1 (spam) or 0 (notspam).

    The scores of both classes are computed with prob(). The comment is
    labelled spam only when the spam score is strictly larger, so a tie
    is labelled notspam.
    """
    notspam_score = prob(0, comment)
    spam_score = prob(1, comment)
    if notspam_score < spam_score:
        return 1
    return 0

##### P ( Class | Comment )

In [3]:
def prob (c , comment):
    """Return the (unnormalised) score of class ``c`` for ``comment``.

    The score is the class prior (``spam_prob`` when ``c == 1``, otherwise
    ``NotSpam_prob``) multiplied by the stored conditional probability of
    every whitespace-separated word of the comment. Words that have no
    entry in ``cond_probs`` are skipped.

    Classes: spam = 1, notspam = 0. ``c`` is also used as the index into
    the ``(P(word | notspam), P(word | spam))`` tuples of ``cond_probs``.

    NOTE(review): the product of many small probabilities can underflow to
    0.0 for long comments; summing log-probabilities would avoid that, but
    it would change the returned values.
    """
    class_prob = spam_prob if c == 1 else NotSpam_prob
    result = class_prob
    for word in str(comment).split():
        # Use .get() rather than indexing: cond_probs is a defaultdict, so
        # indexing would insert a new entry for every unseen word and make
        # the trained table grow while classifying.
        word_probs = cond_probs.get(word)
        if word_probs:
            result *= word_probs[c]
    return result
        

##### Stemming

In [12]:
def stemming (arr):
    """Stem every word of ``arr`` and return the words joined by spaces.

    Each element is replaced in place by the result of ``ps.run`` for that
    element, so the list passed in is modified. The return value is the
    stemmed words concatenated with a single space between them.
    """
    for position, token in enumerate(arr):
        arr[position] = ps.run(token)
    stemmed_text = " ".join(arr)
    return stemmed_text

##### Pre processing

In [13]:
def preProcess(df):
    """Preprocess the 'comment' column of ``df`` and return ``df``.

    Three steps are applied to every comment, in this order: punctuation
    removal (remove_punctuation), stop-word removal (stopwords) and stemming
    (stemming). The 'comment' column of the DataFrame passed in is
    overwritten with the result, and the same DataFrame is returned.
    """
    cleaned = df['comment'].apply(remove_punctuation).apply(stopwords)
    # apply() visits the rows whatever the index labels are; addressing rows
    # with df.at[i, ...] for i in range(len(df)) only works when the index
    # is the default 0..n-1 range.
    df['comment'] = cleaned.apply(lambda text: stemming(str(text).split()))
    return df

## Core Code:

In [14]:
# Replace every missing value (NaN) in the training data, in place, with a
# single space (" ").
data.fillna(' ',inplace=True)

In [15]:
# Concatenate the title and the comment of each training entry (separated by
# a space); the combined text overwrites the 'comment' column.
data['comment'] = data['title'] +" "+ data['comment']

In [16]:
# Run the preprocessing function (preProcess) on the training DataFrame.
data = preProcess(data)

In [17]:
# Keep only the 'comment' and 'verification_status' columns of the training
# data; the other columns are dropped.
data=data[['comment','verification_status']]

In [18]:
# Tokenize each comment on whitespace and store the resulting list of words
# in a new 'tokens' column.
data['tokens'] = data['comment'].str.split()

In [19]:
# Build the vocabulary of the training data: a dictionary from each distinct
# word to the number of times it occurs.
# key : word
# value : count of the word in the whole training data set
vocab = defaultdict(int)
for comment in data['comment'].values:
    # split() with no argument never yields empty strings, so an empty
    # comment adds nothing (split(' ') would count a bogus '' word).
    for elem in comment.split():
        vocab[elem] += 1

In [20]:
# Number of distinct words in the training data set (size of the vocabulary).
word_count = len(vocab)

In [21]:
# Prior probability of the spam class: the fraction of training rows whose
# verification_status equals 1.
spam_prob = data[data['verification_status']==1].shape[0]/data.shape[0]

In [22]:
# Prior probability of the notspam class: the complement of spam_prob.
NotSpam_prob = 1-spam_prob

In [23]:
# Training rows labelled as spam (verification_status == 1).
spam_df = data[data['verification_status']==1]

In [24]:
# Training rows labelled as notspam (verification_status == 0).
nspam_df = data[data['verification_status']==0]

In [25]:
# Build the spam vocabulary of the training set: a dictionary from each word
# to the number of times it occurs in the spam comments.
# key : word
# value : count of the word in the spam comments
spam_vocab = defaultdict(int)
for comment in spam_df['comment'].values:
    # split() with no argument never yields empty strings, so an empty
    # comment adds nothing (split(' ') would count a bogus '' word).
    for elem in comment.split():
        spam_vocab[elem] += 1

In [26]:
# Build the notspam vocabulary of the training set: a dictionary from each
# word to the number of times it occurs in the notspam comments.
# key : word
# value : count of the word in the notspam comments
# The dictionary must be created here, before the loop fills it; without
# this assignment the loop fails with a NameError.
not_spam_vocab = defaultdict(int)
for comment in nspam_df['comment'].values:
    # split() with no argument never yields empty strings, so an empty
    # comment adds nothing (split(' ') would count a bogus '' word).
    for elem in comment.split():
        not_spam_vocab[elem] += 1

In [27]:
# Total number of word occurrences in the notspam comments: the sum of all
# per-word counts stored in not_spam_vocab.
notspam_word_count = sum(not_spam_vocab.values())

In [28]:
# Total number of word occurrences in the spam comments: the sum of all
# per-word counts stored in spam_vocab.
spam_word_count = sum(spam_vocab.values())

In [29]:
# Table of the conditional probabilities of every vocabulary word.
# key : word
# value : ( P(word | notspam) , P(word | spam) )
cond_probs = defaultdict(int)
for word in vocab:
    notspam_p = cond_prob(word, 0)
    spam_p = cond_prob(word, 1)
    cond_probs[word] = (notspam_p, spam_p)

## test

In [30]:
# Concatenate the title and the comment of each test entry (separated by a
# space) into a new 'text' column.
# Missing values are replaced by a space first, as is done for the training
# data; otherwise a missing title or comment would turn the whole text into
# NaN.
test['text'] = test['title'].fillna(' ') + ' ' + test['comment'].fillna(' ')

In [31]:
# Remove punctuation from the combined text of the test data set.
# note: the whole preprocessing routine can be applied to the test data set
#       too: removing punctuation, stemming and removing stop words.
test['text'] = test['text'].apply(remove_punctuation)

In [33]:
# Stem the combined 'text' column of the test data set. This is the column
# the classifier reads, so it has to be stemmed like the training comments;
# stemming the raw 'comment' column would leave 'text' unstemmed.
test['text'] = test['text'].apply(lambda text: stemming(str(text).split()))

In [34]:
# Classify every test text with status_detector and collect the predicted
# labels (spam = 1, notspam = 0) in the same order as the rows.
verification_status = [status_detector(comment) for comment in test['text']]

In [35]:
# Build the result DataFrame: one row per test entry, pairing its id with
# the predicted verification status (spam = 1, notspam = 0).
df = pd.DataFrame(list(zip(test['id'].values, verification_status)), columns =['id', 'verification_status']) 

In [36]:
# Save the result DataFrame to ans.csv, without writing the index column.
df.to_csv(r'./ans.csv', index=False)