In [41]:
import collections
import math

import pandas as pd
from pandas import DataFrame
from PersianStemmer import PersianStemmer

In [42]:
def _read_lines(path):
    """Return the non-empty lines of the UTF-8 text file at ``path``.

    Only the trailing newline is removed from each line; other characters are
    kept untouched because an entry may itself be a space-like character.
    Empty entries are dropped: an empty string in ``extra_chars`` would make
    ``str.replace('', ' ')`` insert a space between every character of the
    text being cleaned.
    """
    with open(path, encoding='UTF-8') as handle:
        entries = (line.rstrip('\n') for line in handle)
        return [entry for entry in entries if entry]


# Stemmer instance shared by every call to remove_extra_words().
ps = PersianStemmer()

# Words that remove_extra_words() filters out of the token list.
extra_words = _read_lines('persian-stopwords/short')

# Characters that remove_extra_words() replaces with a space before splitting.
extra_chars = _read_lines('persian-stopwords/chars')

In [43]:
def remove_extra_words(string):
    """Turn one raw text cell into a list of cleaned tokens.

    The value is converted to ``str``, passed through ``ps.run`` (the
    module-level ``PersianStemmer`` instance), every entry of ``extra_chars``
    is replaced by a space, the text is split on whitespace and every token
    that appears in ``extra_words`` is dropped.

    Args:
        string: The cell value. ``None`` and float NaN (how pandas represents
            an empty CSV cell) are treated as "no text".

    Returns:
        list[str]: The remaining tokens, in order; ``[]`` for missing values.
    """
    # Without this guard str(NaN) / str(None) would be processed like real
    # text and emitted as the bogus tokens 'nan' / 'None'.
    # `string != string` is true only for NaN.
    if string is None or (isinstance(string, float) and string != string):
        return []

    text = ps.run(str(string))
    for char in extra_chars:
        text = text.replace(char, ' ')

    return [word for word in text.split() if word not in extra_words]

In [44]:
def train():
    """Build the frequency tables used by the classifier from ``train.csv``.

    The ``title`` and ``comment`` columns are tokenised with
    ``remove_extra_words``. Rows whose ``verification_status`` equals 1 feed
    the "verified" tables; all other rows feed the "spam" tables. Title and
    comment tokens are counted individually, ``rate`` is counted once per row.

    Returns:
        tuple: Three ``(all, verified, spam)`` triples of
        ``collections.Counter`` objects, for titles, comments and rates, in
        that order.
    """
    fields = ('title', 'comment', 'rate')

    frame = pd.read_csv('train.csv')
    print("*************** read csv ***************")

    frame['title'] = frame['title'].apply(remove_extra_words)
    print("*************** remove extra title *****")

    frame['comment'] = frame['comment'].apply(remove_extra_words)
    print("*************** remove extra comment ***")

    # One bucket of raw observations per class, keyed by field name.
    verified = {field: [] for field in fields}
    spam = {field: [] for field in fields}

    for _, record in frame.iterrows():
        title_tokens = record['title']
        comment_tokens = record['comment']
        rating = record['rate']

        bucket = verified if record['verification_status'] == 1 else spam
        bucket['title'].extend(title_tokens)
        bucket['comment'].extend(comment_tokens)
        bucket['rate'].append(rating)
    print("*************** spam and ver appended **")

    # Verified observations first, then spam, for the class-independent tables.
    combined = {field: verified[field] + spam[field] for field in fields}
    print("*************** all appended ***********")

    verified_counts = {field: collections.Counter(verified[field]) for field in fields}
    print("*************** verified count *********")

    spam_counts = {field: collections.Counter(spam[field]) for field in fields}
    print("*************** spam count *************")

    all_counts = {field: collections.Counter(combined[field]) for field in fields}
    print("*************** all count **************")

    return tuple(
        (all_counts[field], verified_counts[field], spam_counts[field])
        for field in fields
    )

In [45]:
def test():
    """Load ``test.csv`` and tokenise its text columns.

    Returns:
        pandas.DataFrame: The frame read from ``test.csv`` with the ``title``
        and ``comment`` columns replaced by the result of applying
        ``remove_extra_words`` to each cell.
    """
    frame = pd.read_csv('test.csv')
    for column in ('title', 'comment'):
        frame[column] = frame[column].apply(remove_extra_words)
    return frame

In [46]:
# train() returns one (all, verified, spam) Counter triple per field, in the
# order titles, comments, rates; unpack them field by field.
title_counts, comment_counts, rate_counts = train()
allTitle, verifiedTitle, spamTitle = title_counts
allComment, verifiedComment, spamComment = comment_counts
allRate, verifiedRate, spamRate = rate_counts

print('############### Train pre process ###############')

*************** read csv ***************
*************** remove extra title *****
*************** remove extra comment ***
*************** spam and ver appended **
*************** all appended ***********
*************** verified count *********
*************** spam count *************
*************** all count **************
############### Train pre process ###############


In [47]:
# Read test.csv and tokenise its title/comment columns (see test()).
testData = test()

print('############### Test pre process  ###############')

############### Test pre process  ###############


In [48]:
# Total number of counted observations per class and field; passed to bayes()
# as `_sum`.
verTitleSum, spamTitleSum = (
    sum(counter.values()) for counter in (verifiedTitle, spamTitle)
)
verCommentSum, spamCommentSum = (
    sum(counter.values()) for counter in (verifiedComment, spamComment)
)
verRateSum, spamRateSum = (
    sum(counter.values()) for counter in (verifiedRate, spamRate)
)

# Number of distinct values seen per field across both classes; passed to
# bayes() as `length`.
titleLen, allCommentLen, rateLen = map(len, (allTitle, allComment, allRate))

# Accumulators for the prediction output columns.
answer_id, answer_verification_status = [], []

In [52]:
def bayes(data, target: dict, _sum, length, alpha=4):
    """Return the smoothed likelihood of ``data`` under one class.

    Computes ``(count + alpha) / (_sum + length)``, where ``count`` is the
    number stored for ``data`` in ``target`` (0 when absent).

    Args:
        data: The observed value (a token or a rating).
        target: Mapping from value to occurrence count for one class; a plain
            ``dict`` or a ``collections.Counter``.
        _sum: Total of all counts in ``target``.
        length: Number of distinct values for this field.
        alpha: Constant added to the count so unseen values never get a
            likelihood of zero. Defaults to 4, the value previously
            hard-coded here.

    Returns:
        float: The smoothed likelihood.

    Raises:
        ZeroDivisionError: If ``_sum + length`` is zero.
    """
    # .get() gives unseen values a count of 0 for plain dicts as well as
    # Counters; indexing a plain dict would raise KeyError instead.
    count = target.get(data, 0)
    # NOTE(review): textbook additive smoothing divides by
    # _sum + alpha * length; the denominator is left as written so existing
    # results do not change.
    return (count + alpha) / (_sum + length)

In [53]:
# Score every test review under both classes and write the predictions to
# ans.csv.

# Start from empty accumulators so that re-running this cell does not append
# a second copy of the predictions to lists left over from an earlier run.
answer_id = []
answer_verification_status = []

for index, row in testData.iterrows():
    title = row['title']
    comment = row['comment']
    rate = row['rate']

    # Sum log-likelihoods instead of multiplying raw likelihoods: the product
    # of many small factors underflows to 0.0 for long reviews, which made
    # `verified > spam` false and labelled every such review as spam.
    # bayes() is strictly positive, so math.log is always defined.
    verified = 0.0
    spam = 0.0
    for t in title:
        verified += math.log(bayes(t, verifiedTitle, verTitleSum, titleLen))
        spam += math.log(bayes(t, spamTitle, spamTitleSum, titleLen))
    for c in comment:
        verified += math.log(bayes(c, verifiedComment, verCommentSum, allCommentLen))
        spam += math.log(bayes(c, spamComment, spamCommentSum, allCommentLen))
    verified += math.log(bayes(rate, verifiedRate, verRateSum, rateLen))
    spam += math.log(bayes(rate, spamRate, spamRateSum, rateLen))

    # log is monotonic, so comparing the sums ranks the classes exactly as
    # comparing the products would; ties still go to spam (0).
    verification_status = 1 if verified > spam else 0

    # Named row_id rather than id to avoid shadowing the builtin.
    row_id = row['id']
    answer_id.append(row_id)
    answer_verification_status.append(verification_status)

print('############### Answer generated  ###############')


answer = {'id': answer_id, 'verification_status': answer_verification_status}
df = DataFrame(answer, columns=['id', 'verification_status'])
# to_csv returns None when given a path, so its result is not kept.
df.to_csv('ans.csv', index=False)

print('############### ans.csv generated  ###############')




############### Answer generated  ###############
############### ans.csv generated  ###############
