## NLP Based Text Spam Detector


In [1]:
#import libraries
import pandas as pd
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/abdullah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Dataset source: https://www.kaggle.com/uciml/sms-spam-collection-dataset
# Read the training CSV and preview its first five rows.
train_csv = 'spam_train.csv'
df_train = pd.read_csv(train_csv, encoding='ISO-8859-1')
df_train.head(5)

Unnamed: 0,sms,category
0,"Go until jurong point, crazy.. Available only ...",ham
1,Ok lar... Joking wif u oni...,ham
2,Free entry in 2 a wkly comp to win FA Cup fina...,spam
3,U dun say so early hor... U c already then say...,ham
4,"Nah I don't think he goes to usf, he lives aro...",ham


In [3]:
# Read the test CSV with the same encoding as the training CSV and preview
# its first five rows.
test_csv = 'spam_test.csv'
df_test = pd.read_csv(test_csv, encoding='ISO-8859-1')
df_test.head(5)

Unnamed: 0,sms,category
0,Well its not like you actually called someone ...,ham
1,"Nope. Since ayo travelled, he has forgotten hi...",ham
2,You still around? Looking to pick up later,ham
3,CDs 4u: Congratulations ur awarded å£500 of CD...,spam
4,There's someone here that has a year &lt;#&gt...,ham


In [4]:
# Raw-string pattern: every token is a maximal run of word characters.
# (The earlier literal 'r\w+' had the raw-string prefix inside the quotes, so
# it only matched a literal "r" followed by word characters.)
tokenizer = RegexpTokenizer(r'\w+')
# Kept as a set so the per-token stop-word membership test is cheap.
stopwords_english = set(stopwords.words('english'))

#Tokenizing & stemming & removing stop words
# One shared stemmer instance instead of constructing a new one for every token.
_porter_stemmer = PorterStemmer()


def cleanSms(sms):
    """Normalize one SMS message into a space-separated string of stemmed tokens.

    Steps, in order: replace the literal "<br /><br />" with a space,
    lower-case the text, split it with the module-level ``tokenizer``, drop
    tokens found in ``stopwords_english``, Porter-stem the remaining tokens
    and join them with single spaces.

    Args:
        sms: The message text as a ``str``.

    Returns:
        The cleaned message as a ``str`` (empty if no token survives).
    """
    normalized = sms.replace("<br /><br />", " ").lower()
    stemmed_tokens = [
        _porter_stemmer.stem(token)
        for token in tokenizer.tokenize(normalized)
        if token not in stopwords_english
    ]
    return ' '.join(stemmed_tokens)

In [5]:
#Clean the data & split it into features (x) and labels (y)
# Series.apply returns a new Series and leaves the DataFrame column untouched,
# so the cleaned text must be captured here; otherwise the raw messages are
# what end up in x_train / x_test.
x_train = df_train['sms'].apply(cleanSms).values
y_train = df_train['category'].values

x_test = df_test['sms'].apply(cleanSms).values
y_test = df_test['category'].values

In [6]:
#Vectorize the data
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training messages only, then map both
# splits through that same fitted vectorizer.
vectorizer = TfidfVectorizer(encoding='ISO-8859-1', sublinear_tf=True).fit(x_train)
x_train, x_test = vectorizer.transform(x_train), vectorizer.transform(x_test)

In [7]:
#Create model
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the vectorized training messages;
# the trailing expression displays the fitted estimator.
model = LogisticRegression(solver='lbfgs').fit(x_train, y_train)
model

LogisticRegression()

In [8]:
#Predict Spam
# NOTE(review): the message is vectorized as written, without passing through
# cleanSms -- confirm this matches how the training text was prepared.
spam_example = ["you won $900 in the new lottery draw. Call +123456789."]
model.predict(vectorizer.transform(spam_example))

array(['spam'], dtype=object)

In [9]:
#Predict Ham
# NOTE(review): the message is vectorized as written, without passing through
# cleanSms -- confirm this matches how the training text was prepared.
ham_example = ["Hello there. How are you doing?"]
model.predict(vectorizer.transform(ham_example))

array(['ham'], dtype=object)

## Save the trained model and vectorizer to disk

In [11]:
import joblib

# Persist the fitted classifier and the fitted vectorizer as separate files.
# Only these two objects are saved; cleanSms is not part of either artifact.
model_path = 'myflask/spam_ham_model.pkl'
vectorizer_path = 'myflask/vectorizer.pkl'
joblib.dump(model, model_path)
joblib.dump(vectorizer, vectorizer_path)

['myflask/vectorizer.pkl']