In [None]:
import pandas as pd 
import matplotlib.pyplot as plt 
import nltk 
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize,sent_tokenize


: 

In [None]:
# Load the raw SMS dataset, keeping only the first two columns of the file.
df = pd.read_csv(
    '../data/spam.csv',
    encoding='ISO-8859-1',  # decode as Latin-1; presumably the file is not valid UTF-8 — confirm
    usecols=[0, 1],
)
df.head()

In [None]:
# Missing values per column.
df.isna().sum()

In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence is not counted).
df.duplicated(keep='first').sum()

In [None]:
# Keep the first occurrence of every duplicated row; the original index labels are retained.
df = df.drop_duplicates()
df.shape

In [None]:
# Give the two columns meaningful names and encode the label as an integer.
label_codes = {'ham': 0, 'spam': 1}
df = df.rename(columns={'v1': 'target', 'v2': 'text'})

# Assign the encoded column back explicitly. Calling an inplace method on the
# selection `df['target']` is chained assignment: it is deprecated in pandas 2.x
# and does not modify `df` at all once Copy-on-Write is enabled.
# The dtype guard makes re-running this cell a no-op instead of mapping the
# already-encoded 0/1 values to NaN; an unexpected label fails loudly in astype.
if not pd.api.types.is_numeric_dtype(df['target']):
    df['target'] = df['target'].map(label_codes).astype('int64')
print(df.head(5))

# Derive the wedge labels from the counted values rather than hard-coding their
# order: value_counts() sorts by frequency, so a fixed ['ham', 'spam'] list is
# only right while ham is the majority class.
class_counts = df['target'].value_counts()
code_to_label = {code: label for label, code in label_codes.items()}
plt.pie(
    class_counts,
    labels=[code_to_label[code] for code in class_counts.index],
    autopct="%0.2f",
)

In [None]:
# One-time setup: uncomment and run once to download the NLTK data this notebook needs.
# nltk.download('punkt')
# nltk.download('punkt_tab')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [None]:
# Per-message size features: character count, NLTK word-token count, NLTK sentence count.
df['num_chars'] = df['text'].map(len)
df['num_words'] = df['text'].map(lambda message: len(word_tokenize(message)))
df['num_sentences'] = df['text'].map(lambda message: len(sent_tokenize(message)))

In [None]:
# Preview the frame with the new length columns.
df.head(n=5)

In [None]:
ps = PorterStemmer()

# Built once at definition time. The previous version called
# stopwords.words('english') for every token, rebuilding the word list each time
# and then scanning it linearly; a frozenset gives constant-time membership tests.
STOP_WORDS = frozenset(stopwords.words('english'))


def transform_text(text):
    """Normalize one message into a space-separated string of stemmed tokens.

    Steps, applied in order:
      1. lower-case the text and split it with ``nltk.word_tokenize``;
      2. keep only tokens for which ``str.isalnum()`` is true;
      3. drop English stop words;
      4. reduce each remaining token with the Porter stemmer.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stems joined by single spaces (empty string if none survive).
    """
    tokens = nltk.word_tokenize(text.lower())
    # No separate punctuation test is needed: a token that passes isalnum()
    # contains no punctuation characters, so it can never match one.
    return " ".join(
        ps.stem(token)
        for token in tokens
        if token.isalnum() and token not in STOP_WORDS
    )

transform_text("Hello!!!, How are you? This is a sampling text.")

In [None]:
# Clean every message with transform_text and keep the result in its own column.
df['transformed_text'] = df['text'].map(transform_text)

In [None]:
from collections import Counter

# Flatten the cleaned spam messages (target == 1) into one list of words,
# then tabulate the 30 most frequent ones.
spam_messages = df.loc[df['target'] == 1, 'transformed_text']
spam_corpus = [word for message in spam_messages for word in message.split()]

pd.DataFrame(Counter(spam_corpus).most_common(30), columns=['word', 'count'])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer(max_features=3000)
X = tfv.fit_transform(df['transformed_text']).toarray()
y = df['target'].values

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier and predict on the held-out split.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
y_pred = mnb.predict(X_test)

print("MultinomialNB")
# Printed in order: accuracy, confusion matrix, precision.
for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_test, y_pred))

In [None]:
import pickle
from pathlib import Path

# Create the output directory up front: open(..., 'wb') raises FileNotFoundError
# when '../models' does not exist yet (e.g. on a fresh checkout).
Path('../models').mkdir(parents=True, exist_ok=True)

# Persist the fitted vectorizer and the trained classifier as separate pickles.
with open('../models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfv, f)

with open('../models/model.pkl', 'wb') as f:
    pickle.dump(mnb, f)