In [7]:
import numpy as np
import pandas
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Load the labelled e-mails and encode the textual label as 0 (ham) / 1 (spam).
full_dataset = pandas.read_csv('spam_emails.csv', encoding='utf-8')
full_dataset['label_num'] = full_dataset.label.map({'ham':0, 'spam':1})

# Seeded random split: a row lands in the training set when its uniform draw
# is below 0.7, so the split is reproducible but only approximately 70/30.
np.random.seed(0)
train_indices = np.random.rand(len(full_dataset)) < 0.7

train = full_dataset[train_indices]
test = full_dataset[~train_indices]

# TF-IDF features feeding a multinomial Naive Bayes classifier.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.4)),
    ('clf', MultinomialNB()),
])

pipeline.fit(train['text'], train['label_num'])

# Predict the held-out set once and reuse the result for both the accuracy
# figure and the per-class report (previously score() and predict() each
# re-vectorised and re-classified the whole test set).
nb_test_predictions = pipeline.predict(test['text'])
accuracy = float((nb_test_predictions == test['label_num']).mean())
print("ACCURACY: {n}%".format(n=100.*accuracy))
print(classification_report(test['label_num'], nb_test_predictions))

ACCURACY: 88.4038199181446%
              precision    recall  f1-score   support

           0       0.86      1.00      0.92       517
           1       1.00      0.61      0.76       216

    accuracy                           0.88       733
   macro avg       0.93      0.80      0.84       733
weighted avg       0.90      0.88      0.87       733



In [2]:
# Classify one hand-written message with the already-fitted `pipeline`.
text_to_predict = "NEED TO FIND SOMETHING? ::FREE MORTGAGE QUOTE:: To be removed from this list, click here. "
predicted = pipeline.predict([text_to_predict])
# Numeric label 1 is reported as SPAM; any other prediction is reported as HAM.
detected = 'SPAM' if predicted == 1 else 'HAM'
print("Tekst \"{t}\", to: {d}".format(t=text_to_predict, d=detected))

Tekst "NEED TO FIND SOMETHING? ::FREE MORTGAGE QUOTE:: To be removed from this list, click here. ", to: SPAM


In [3]:
def load_embeddings(path):
    """Read space-separated word vectors from a UTF-8 text file.

    Every non-blank line is split on single spaces: the first field is the
    token and the remaining fields are parsed as the vector components.

    Args:
        path: Path of the embeddings text file.

    Returns:
        dict mapping each token (str) to a 1-D float numpy array. If a token
        occurs on several lines, the last occurrence wins.
    """
    vectors = {}

    with open(path, 'r', encoding='utf8') as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if not entry:
                # Blank (or whitespace-only) lines carry no vector.
                continue
            token, *components = entry.split(" ")
            vectors[token] = np.array(components, dtype=float)
    return vectors

# Token -> vector lookup built from the GloVe text file; `mapping` is consumed
# by the embedding-averaging cell further down. The file ships inside the
# archive linked in the trailing comment.
mapping = load_embeddings('glove.6B.50d.txt') # http://nlp.stanford.edu/data/glove.6B.zip

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas
import numpy as np
from nltk import word_tokenize
from sklearn.metrics import classification_report

# Read the labelled e-mails and add a numeric target column: ham -> 0, spam -> 1.
full_dataset = pandas.read_csv('spam_emails.csv', encoding='utf-8')
full_dataset['label_num'] = full_dataset['label'].map({'ham': 0, 'spam': 1})

# Seed the global NumPy RNG so the random row mask is reproducible; a row is
# assigned to the training set when its uniform draw falls below 0.7.
np.random.seed(0)
train_indices = np.random.rand(len(full_dataset)) < 0.7

# Boolean-mask selection: masked rows train the model, the rest evaluate it.
train = full_dataset.loc[train_indices]
test = full_dataset.loc[~train_indices]


def documents_to_ave_embeddings(docs, embeddings, dim=None, tokenizer=None):
    """Represent each document as the mean of its known tokens' embeddings.

    Every document is lower-cased and tokenized; tokens missing from
    ``embeddings`` are ignored and the vectors of the remaining tokens are
    averaged.

    Args:
        docs: Iterable of document strings.
        embeddings: Mapping from token to a 1-D numeric vector.
        dim: Length of the output vectors. When ``None`` it is taken from the
            first vector in ``embeddings`` (50 if ``embeddings`` is empty,
            which was the previously hard-coded size).
        tokenizer: Callable turning a string into a sequence of tokens.
            Defaults to ``word_tokenize``.

    Returns:
        list with one numpy array of length ``dim`` per document, in input
        order. A document with no token in ``embeddings`` gets an all-zero
        vector; previously it got an all-NaN vector from dividing by zero.
    """
    if tokenizer is None:
        # Resolved at call time so a custom tokenizer can be injected.
        tokenizer = word_tokenize
    if dim is None:
        dim = len(next(iter(embeddings.values()))) if embeddings else 50

    result = []
    for doc in docs:
        total = np.zeros(dim)
        known_tokens = 0
        for token in tokenizer(doc.lower()):
            if token in embeddings:
                known_tokens += 1
                total += embeddings[token]
        if known_tokens:
            # Only divide when something was accumulated; 0/0 would be NaN.
            total /= known_tokens
        result.append(total)
    return result
 
# Support-vector classifier trained on averaged word embeddings.
classifier = SVC(C=1.0)

# Turn each e-mail into one fixed-length vector using the `mapping` lookup.
train_transformed = documents_to_ave_embeddings(train['text'], mapping)
test_transformed = documents_to_ave_embeddings(test['text'], mapping)

classifier.fit(train_transformed, train['label_num'])

# Run inference on the held-out set once and reuse it for both the accuracy
# figure and the per-class report (score() followed by predict() classified
# the whole test set twice).
svc_test_predictions = classifier.predict(test_transformed)
accuracy = float((svc_test_predictions == test['label_num']).mean())
print("ACCURACY: {n}%".format(n=100.*accuracy))
print(classification_report(test['label_num'], svc_test_predictions))

ACCURACY: 91.81446111869032%
              precision    recall  f1-score   support

           0       0.93      0.95      0.94       517
           1       0.88      0.84      0.86       216

    accuracy                           0.92       733
   macro avg       0.91      0.89      0.90       733
weighted avg       0.92      0.92      0.92       733

