In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
data = pd.read_csv('train_test.csv')

In [2]:
import nltk
MY_STOP_WORDS = ['u','dont']
def normalize(inp):
    wnl = nltk.stem.wordnet.WordNetLemmatizer()
    stop_words = set(nltk.corpus.stopwords.words('english') + MY_STOP_WORDS)
    tokenizer = nltk.tokenize.RegexpTokenizer('\w+')

    tokens = tokenizer.tokenize(inp)
    tokens = [token.lower() for token in tokens]

    filtered = []
    for token,pos in nltk.pos_tag(tokens):
        if token not in stop_words:
            pos = pos[0].lower()
            if pos in ['a','n','v']:
                filtered.append(wnl.lemmatize(token,pos))
            else:
                filtered.append(wnl.lemmatize(token))
    return filtered
class word_counter:
    def __init__(self):
        self.dic = {}
    def add_spam(self,msg):
        for word in msg:
            if word in self.dic:
                self.dic[word][1] += 1
            else:
                self.dic[word] = [0,1]
    def add_ham(self,msg):
        for word in msg:
            if word in self.dic:
                self.dic[word][0] += 1
            else:
                self.dic[word] = [1,0]
    def divide_spam(self,length):
        for word,count in self.dic.items():
            count[1] /= length
    def divide_ham(self,length):
        for word,count in self.dic.items():
            count[0] /= length
    def get_probability(self,word):
        if word in self.dic:
            return self.dic[word]
        return 0,0
    def get_ham_probability(self,msg):
        ham = 0
        spam = 0
        for word in msg:
            res = self.get_probability(word)
            ham += res[0]
            spam += res[1]
        if ham == spam:
            return 0.5
        return ham/(ham+spam)

In [3]:
_hams  = data[data.type ==  'ham'].reset_index(drop=True)
_spams = data[data.type ==  'spam'].reset_index(drop=True)
hams  = _hams.apply(lambda x: normalize(x['text']), axis=1)
spams = _spams.apply(lambda x: normalize(x['text']), axis=1)

In [4]:
hams_train  = hams.loc[:int(len(hams.index)*4/5)]
spams_train = spams.loc[:int(len(spams.index)*4/5)]

In [5]:
hams_test  = hams.loc[int(len(hams.index)*4/5):]
spams_test = spams.loc[int(len(spams.index)*4/5):]

In [6]:
list_counter = word_counter()
hams_train.apply(lambda x: list_counter.add_ham(x))
spams_train.apply(lambda x: list_counter.add_spam(x))

list_counter.divide_ham(len(hams_train.index))
list_counter.divide_spam(len(spams_train.index))

In [7]:
class check_counter:
    def __init__(self):
        self.h = 0
        self.s = 0
    def add_state(self,x):
        if(x>=0.5):
            self.h+=1
        else:
            self.s+=1
    def print_all(self):
        print ("ham: {} spam: {}".format(self.h,self.s))
hams_counter = check_counter()
hams_test.apply(lambda x:hams_counter.add_state(list_counter.get_ham_probability(x)))
hams_counter.print_all()

ham: 563 spam: 317


In [8]:
spams_counter = check_counter()
spams_test.apply(lambda x:spams_counter.add_state(list_counter.get_ham_probability(x)))
spams_counter.print_all()

ham: 1 spam: 135


In [9]:
hams_train_length  = hams_train.apply(lambda x: len(x))
spams_train_length = spams_train.apply(lambda x: len(x))

In [10]:
hams_stat  = hams_train_length.mean(),hams_train_length.std()
spams_stat = spams_train_length.mean(),spams_train_length.std()

In [11]:
def cacl_traz(inp,stat):
    return abs(inp - stat[0])/stat[1]
def calc_ham_length_probability(length):
    ht = cacl_traz(length,hams_stat)
    st = cacl_traz(length,spams_stat)
    return (st)/(ht+st)

In [12]:
def calc_probe(x):
    return (list_counter.get_ham_probability(x)+calc_ham_length_probability(len(x)))/2
hams_counter = check_counter()
hams_test.apply(lambda x:hams_counter.add_state(calc_probe(x)))
hams_counter.print_all()

ham: 726 spam: 154


In [13]:
spams_counter = check_counter()
spams_test.apply(lambda x:spams_counter.add_state(calc_probe(x)))
spams_counter.print_all()

ham: 4 spam: 132


In [14]:
print(spams_counter.s/(spams_counter.s+spams_counter.h))
print(spams_counter.s/(spams_counter.s+hams_counter.s))
print((spams_counter.s + hams_counter.h)/((hams_counter.s + hams_counter.h)+(spams_counter.s + spams_counter.h)))

0.9705882352941176
0.46153846153846156
0.844488188976378
