In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import pickle
from sklearn.feature_extraction.text import CountVectorizer
from symbols_vectorizer import SymbolsVectorizer
from data_preprocessing_funcs import get_clean_mail, get_special_symbols, vectorize_mail

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/osiprovin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the raw spam dataset from CSV; the file is decoded as latin1.
SPAM_CSV_PATH = "/Users/osiprovin/Downloads/spam.csv"
mails = pd.read_csv(SPAM_CSV_PATH, encoding="latin1")

In [3]:
# Display the freshly loaded frame to inspect its columns and a sample of rows.
mails

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


Data preparation

In [4]:
# Remove the three "Unnamed" columns, leaving only the label and text columns.
unused_columns = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
mails.drop(columns=unused_columns, inplace=True)
# Give the two remaining columns descriptive names (label first, text second).
mails.columns = ["spam", "mail"]

Data preprocessing

In [5]:
# Encode the labels numerically: spam -> 1, ham -> -1.
# The result is assigned back to the column instead of calling
# `mails["spam"].replace(..., inplace=True)`: an in-place method on a selected
# column is chained assignment, which pandas warns about and which leaves
# `mails` untouched once Copy-on-Write is active.
mails["spam"] = mails["spam"].replace({"ham": -1, "spam": 1})
# Build the special-symbols column from the raw text, i.e. before the "mail"
# column is overwritten with its cleaned version below.
mails["special symbols"] = mails["mail"].apply(get_special_symbols)
# Replace every mail with its cleaned text so that only the essential English
# words remain (see get_clean_mail).
mails["mail"] = mails["mail"].apply(get_clean_mail)

In [6]:
# Split the data into train (80%) and test (20%) parts.
# A fixed random_state makes the shuffle reproducible, so re-running the
# notebook regenerates the same train/test split (and the same saved data).
train_mails, test_mails = train_test_split(
    mails, test_size=0.2, shuffle=True, random_state=42
)

In [7]:
# Create one vectorizer per feature source and fit both on the training split
# only, so the test split does not influence the fitted vectorizers.
text_vectorizer = CountVectorizer()
special_symbols_vectorizer = SymbolsVectorizer()

# The bag-of-words vectorizer learns its vocabulary from the cleaned mail text.
text_vectorizer = text_vectorizer.fit(train_mails["mail"])
# The symbols vectorizer is fitted on the "special symbols" column, which is the
# same column later passed to vectorize_mail for the symbol features. It was
# previously fitted on the cleaned "mail" text.
# NOTE(review): SymbolsVectorizer is defined outside this notebook - confirm it
# expects the output of get_special_symbols as its fit input.
special_symbols_vectorizer = special_symbols_vectorizer.fit(train_mails["special symbols"])

In [8]:
# Vectorize the mail text and the special symbols and concatenate them with the
# vectorize_mail helper. Both splits pass the cleaned text first and the
# "special symbols" column second, so train and test features are built the
# same way (the train call previously passed the mail text twice).
trainx = vectorize_mail(
    train_mails["mail"].tolist(),
    train_mails["special symbols"].tolist(),
    text_vectorizer,
    special_symbols_vectorizer,
)
testx = vectorize_mail(
    test_mails["mail"].tolist(),
    test_mails["special symbols"].tolist(),
    text_vectorizer,
    special_symbols_vectorizer,
)

# Targets: the numeric spam labels of each split as plain Python lists.
trainy = train_mails["spam"].tolist()
testy = test_mails["spam"].tolist()

In [9]:
# Persist the vectorized splits and the fitted vectorizers as two pickle files.

data = {
    "trainx": trainx,
    "trainy": trainy,
    "testx": testx,
    "testy": testy,
}

# NOTE(review): the key "ssymbols_vectorrizer" looks misspelled; it is kept
# unchanged because whatever loads vectorizers.pkl may look it up by this name.
vectorizers = {
    "text_vectorizer": text_vectorizer,
    "ssymbols_vectorrizer": special_symbols_vectorizer,
}

# Write each payload to its own file, data first and vectorizers second.
for pickle_path, payload in (
    ("/Users/osiprovin/freelance_1/Spam Filter Project/Data/data.pkl", data),
    ("/Users/osiprovin/freelance_1/Spam Filter Project/Data/vectorizers.pkl", vectorizers),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(payload, f)