## Step-1: Business Probelm Understanding
#### Classify the sms whether it's a spam or normal

## Step-2: Data Understanding

In [72]:
import pandas as pd

In [73]:
## Loading Flat File
df=pd.read_csv("SMSSpamCollection",sep='\t',names=['label','message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [74]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [75]:
df['label'].unique()

array(['ham', 'spam'], dtype=object)

In [76]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [77]:
#Prior probability
df['label'].value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

## Step-3: Text Preprocessing(Cleaning+Vectorization)


### Text Cleaning
- **Remove Punctuation**
- **Remove Stopwords**
- **Stemming**

In [78]:
for i in range(len(df)):
    df['message'][i]

In [79]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

ps=PorterStemmer()

corpus=[]

for i in range(len(df)):
    rp=re.sub('[^a-zA-Z]'," ",df['message'][i])
    rp=rp.lower()
    rp=rp.split()
    rp=[ps.stem(word) 
        for word in rp if not word in set(stopwords.words('english'))]
    rp=" ".join(rp)
    corpus.append(rp)


#### Vectorization

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
X=cv.fit_transform(corpus).toarray()

In [81]:
X.shape

(5572, 6296)

In [82]:
df['label'].replace({'ham':0,'spam':1},inplace=True)

In [83]:
y=df['label']

In [84]:
y.shape

(5572,)

In [85]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,
                                               random_state=0)

## Step-4: Modelling
### Naive Bayes Classifier

In [86]:
from sklearn.naive_bayes import MultinomialNB   #Import ML ALGO

model= MultinomialNB()                          #Save as Model Name

model.fit(X_train,y_train)                      # Fit on Train Data

In [87]:
y_train.value_counts()

0    3870
1     587
Name: label, dtype: int64

###### After vectorization we have 6296 words so probability will be calculated for each word wrt ham or spam (i.e;P(Word/HAM) or P(Word/SPAM))

## Step-5: Evaluation
###### Prediction on new data

In [88]:
#Predictions
ypred_test=model.predict(X_test)

from sklearn.metrics import accuracy_score
print("Test Accuracy:",accuracy_score(y_test,ypred_test))

Test Accuracy: 0.979372197309417


## Step-6: Model Selection
##### Checking  the predicted values with original values

In [89]:
ypred_train=model.predict(X_train)
print("Train Accuracy:",accuracy_score(y_train,ypred_train))

Train Accuracy: 0.9921471842046219


In [90]:
from sklearn.model_selection import cross_val_score
print("Cross Validation Score:",cross_val_score(model,X_train,y_train,
                                                cv=5).mean())

Cross Validation Score: 0.9777885984911396


## Prediction on New Data

In [91]:
input_mail='Hi Sir, upload NLP Notes'

###### Load Data

In [92]:
#Convert to dataframe
df_test=pd.DataFrame({"message":input_mail},index=[0])
df_test

Unnamed: 0,message
0,"Hi Sir, upload NLP Notes"


###### Preprocessing the data

In [93]:
corpus=[]

for i in range(len(df_test)):
    rp=re.sub('[^a-zA-Z]'," ",df['message'][i])
    rp=rp.lower()
    rp=rp.split()
    rp=[ps.stem(word) 
        for word in rp if not word in set(stopwords.words('english'))]
    rp=" ".join(rp)
    corpus.append(rp)


## Text Vectorization
X=cv.transform(corpus).toarray()

In [94]:
X.shape    #6296 columns

(1, 6296)

###### Prediction

In [95]:
pred=model.predict(X)

if pred==0:
    print("HAM")
else:
    print("SPAM")

HAM


In [96]:
input_mail1=' you have won a lottery of 1cr'

In [97]:
#Convert to dataframe
df_test1=pd.DataFrame({"message":input_mail1},index=[0])
df_test1

Unnamed: 0,message
0,you have won a lottery of 1cr


In [100]:
corpus=[]

for i in range(len(df_test1)):
    rp=re.sub('[^a-zA-Z]'," ",df['message'][i])
    rp=rp.lower()
    rp=rp.split()
    rp=[ps.stem(word) 
        for word in rp if not word in set(stopwords.words('english'))]
    rp=" ".join(rp)
    corpus.append(rp)


## Text Vectorization
X=cv.transform(corpus).toarray()

In [101]:
pred=model.predict(X)

if pred==0:
    print("HAM")
else:
    print("SPAM")

HAM


#### HAM & SPAM that can be differentiated by the words if the words mostly repeted in HAM means HAM and if SPAM means SPAM