In [41]:
import pandas as pd
import numpy
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import stopwords

In [27]:
# Read spam.csv, decoding it as ISO-8859-1, and preview the first rows.
data = pd.read_csv('spam.csv', encoding='ISO-8859-1')
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [28]:
# Keep only the two populated columns and give them descriptive names:
# v1 -> label, v2 -> messages.
data = data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'messages'})
data.head()

Unnamed: 0,label,messages
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Preprocessing  

In [29]:
# Count missing values per column before any text processing.
data.isna().sum()

label       0
messages    0
dtype: int64

In [31]:
# Fetch the NLTK stop-word corpus only when it is not already installed, so a
# Restart & Run All does not hit the network on every run and still works
# offline once the corpus is present.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kpaps\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [37]:
# Build the English stop-word collection once, as a set, so that clean_text
# can test every token for membership against it without rebuilding it.
stopword = set(stopwords.words('english'))

def clean_text(text, stop_words=None):
    """Normalise a raw message into a space-separated string of kept tokens.

    Steps, in order:
      1. lower-case the text;
      2. replace every character that is not an ASCII letter or digit with a
         space (so "don't" becomes the two tokens "don" and "t");
      3. split on whitespace and drop every token found in ``stop_words``;
      4. join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str or None
        The raw message. ``None`` and float NaN (the way pandas represents a
        missing cell) are treated as an empty message; any other non-string
        value is converted with ``str`` first.
    stop_words : container of str, optional
        Tokens to remove. Defaults to the notebook-level ``stopword`` set.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing survives.
    """
    if stop_words is None:
        stop_words = stopword

    # Missing cells would otherwise raise AttributeError on .lower().
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # remove special characters
    text = re.sub(r'[^0-9a-zA-Z]', ' ', text)
    # split() already collapses runs of whitespace, so no separate
    # "remove extra spaces" pass is needed before filtering stop words.
    return " ".join(word for word in text.split() if word not in stop_words)

In [38]:
# clean the messages
data['clean_text'] = data['messages'].apply(clean_text)
data

Unnamed: 0,label,messages,clean_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u u 750 pound prize 2...
5568,ham,Will Ì_ b going to esplanade fr home?,b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood suggestions
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like interested buying some...


### Input Split

In [39]:
# Features are the cleaned message text; the target is the ham/spam label.
X, y = data['clean_text'], data['label']

### Model Training

In [45]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer,TfidfVectorizer
from sklearn.metrics import classification_report

In [46]:
def classify(model, X, y, test_size=0.25, random_state=42, cv=None):
    """Fit ``model`` on bag-of-words TF-IDF features and print hold-out metrics.

    The data is split into stratified train/test parts, a
    CountVectorizer -> TfidfTransformer -> ``model`` pipeline is fitted on the
    training part, and the test-set accuracy (as a percentage) and the
    classification report are printed.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn classifier used as the last pipeline step.
    X : sequence of str
        Text documents (one per sample).
    y : sequence
        Class labels aligned with ``X``; also used to stratify the split.
    test_size : float, default 0.25
        Fraction of the samples held out for evaluation.
    random_state : int, default 42
        Seed for the train/test shuffle.
    cv : int or cross-validation splitter, optional
        When given, the whole pipeline (not the bare model) is additionally
        cross-validated on ``X``/``y`` and the mean score is printed. The
        vectoriser is refitted inside every fold, so no vocabulary or IDF
        statistics leak from validation data.

    Returns
    -------
    None
        Results are printed only.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=y,
    )
    pipeline_model = Pipeline([('vect', CountVectorizer()),
                               ('Tfidf', TfidfTransformer()),
                               ('clf', model)])
    pipeline_model.fit(x_train, y_train)

    # Predict once and reuse the predictions for both the accuracy and the
    # report; pipeline.score() would run a second full prediction pass.
    y_pred = pipeline_model.predict(x_test)
    accuracy = float(numpy.mean(numpy.asarray(y_pred) == numpy.asarray(y_test)))
    print('Accuracy: ', accuracy*100)

    if cv is not None:
        # cross_val_score clones the pipeline, so the fitted one above is untouched.
        cv_score = cross_val_score(pipeline_model, X, y, cv=cv)
        print('CV Score: ', numpy.mean(cv_score)*100)

    print(classification_report(y_test, y_pred))

In [47]:
from sklearn.linear_model import LogisticRegression

# Evaluate logistic regression with its default settings.
model = LogisticRegression()
classify(model=model, X=X, y=y)

Accuracy:  96.8413496051687
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1206
        spam       0.99      0.77      0.87       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.97      1393



In [48]:
from sklearn.naive_bayes import MultinomialNB

# Evaluate multinomial naive Bayes with its default settings.
model = MultinomialNB()
classify(model=model, X=X, y=y)

Accuracy:  96.69777458722182
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1206
        spam       1.00      0.75      0.86       187

    accuracy                           0.97      1393
   macro avg       0.98      0.88      0.92      1393
weighted avg       0.97      0.97      0.96      1393



In [50]:
from sklearn.svm import SVC

# Evaluate a support vector classifier with its default settings.
model = SVC()
classify(model=model, X=X, y=y)

Accuracy:  97.91816223977028
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99      1206
        spam       1.00      0.84      0.92       187

    accuracy                           0.98      1393
   macro avg       0.99      0.92      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [52]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: bootstrap sampling and feature selection are random, so
# without random_state the printed metrics change on every run even though
# the train/test split itself is seeded.
model = RandomForestClassifier(random_state=42)
classify(model,X,y)

Accuracy:  97.4156496769562
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1206
        spam       1.00      0.81      0.89       187

    accuracy                           0.97      1393
   macro avg       0.99      0.90      0.94      1393
weighted avg       0.97      0.97      0.97      1393

