## SMS Spam Classifier

In [None]:
import numpy as np
import pandas as pd

In [None]:
df=pd.read_csv('spam.csv')

In [None]:
df.head()

In [None]:
df.shape

### Steps to be followed:

- `Data cleaning`

- `EDA(Exploratory Data Analysis)`

- `Text pre-processing`

- `Model building`

- `Evaluation`

- `Improvement`

- `Website Conversion`

- `Deployment`

### Data Cleaning

In [None]:
df.info()

In [None]:
df.rename(columns={'label': 'target', 'message': 'text'}, inplace=True)

In [None]:
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()

In [None]:
df['target']=encoder.fit_transform(df['target'])

In [None]:
df.head()

In [None]:
# Missing values
df.isnull().sum()

In [None]:
# Duplicate values
df.duplicated().sum()

In [None]:
# Remove duplicates
df=df.drop_duplicates(keep='first')

In [None]:
df.head()

In [None]:
df.duplicated().sum()

In [None]:
df.shape

### EDA

In [None]:
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
# Share of each class in the de-duplicated data.
# value_counts() orders its rows by frequency, so the slice labels are
# derived from the counted codes (via the fitted encoder) instead of being
# hard-coded in an assumed order.
target_counts = df['target'].value_counts()
plt.pie(
    target_counts,
    labels=encoder.inverse_transform(target_counts.index.to_numpy()),
    autopct='%0.2f',
)
plt.show()

In [None]:
import nltk
nltk.download('punkt')

In [None]:
df['num_characters']=df['text'].apply(len)

In [None]:
df.head()

In [None]:
nltk.download('punkt_tab')

In [None]:
# Number of NLTK word tokens in each message.
df['num_words'] = df['text'].map(nltk.word_tokenize).map(len)

In [None]:
df.head()

In [None]:
# Number of NLTK sentence tokens in each message.
df['num_sentences'] = df['text'].map(nltk.sent_tokenize).map(len)

In [None]:
df.head()

In [None]:
df[['num_characters', 'num_words', 'num_sentences']].describe()

In [None]:
# Ham messages
df[df['target']==0][['num_characters', 'num_words', 'num_sentences']].describe()

In [None]:
# Spam messages
df[df['target']==1][['num_characters', 'num_words', 'num_sentences']].describe()

In [None]:
import seaborn as sns

# Overlaid histograms of message length in characters:
# target 0 in the default colour, target 1 in red.
plt.figure(figsize=(12, 6))
ham_rows = df['target'] == 0
spam_rows = df['target'] == 1
sns.histplot(df.loc[ham_rows, 'num_characters'])
sns.histplot(df.loc[spam_rows, 'num_characters'], color='red')
plt.show()

In [None]:
# Overlaid histograms of word counts per message:
# target 0 in the default colour, target 1 in red.
plt.figure(figsize=(12, 6))
ham_rows = df['target'] == 0
spam_rows = df['target'] == 1
sns.histplot(df.loc[ham_rows, 'num_words'])
sns.histplot(df.loc[spam_rows, 'num_words'], color='red')
plt.show()

In [None]:
# Overlaid histograms of sentence counts per message:
# target 0 in the default colour, target 1 in red.
plt.figure(figsize=(12, 6))
ham_rows = df['target'] == 0
spam_rows = df['target'] == 1
sns.histplot(df.loc[ham_rows, 'num_sentences'])
sns.histplot(df.loc[spam_rows, 'num_sentences'], color='red')
plt.show()

In [None]:
sns.pairplot(df, hue='target')

In [None]:
df.corr(numeric_only=True)

In [None]:
sns.heatmap(df.corr(numeric_only=True), annot=True)

### Text Pre-Processing

- `Lowercase`

- `Tokenization`

- `Removing special characters`

- `Removing stop words and punctuation`

- `Stemming`

In [None]:
nltk.download('stopwords')

In [None]:
import string

import functools


@functools.lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, read from disk once."""
    return frozenset(nltk.corpus.stopwords.words('english'))


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lower-case the text;
      2. split it into tokens with ``nltk.word_tokenize``;
      3. keep only purely alphanumeric tokens (drops punctuation and any
         token containing a special character);
      4. drop English stop words;
      5. reduce each remaining token to its Porter stem.

    Requires the NLTK ``punkt`` and ``stopwords`` resources to be available.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string if nothing
        survives the filters).
    """
    from nltk.stem.porter import PorterStemmer

    # Set membership is O(1); the cached helper avoids re-reading the
    # stop-word corpus for every token of every message.
    stop_words = _english_stop_words()
    stemmer = PorterStemmer()

    tokens = nltk.word_tokenize(text.lower())
    # A token that passes isalnum() can never be a punctuation mark, so no
    # separate punctuation test is needed after this filter.
    keywords = [
        token for token in tokens
        if token.isalnum() and token not in stop_words
    ]
    return " ".join(stemmer.stem(token) for token in keywords)

In [None]:
print(transform_text(df['text'][0]))

In [None]:
df['transformed_text']=df['text'].apply(transform_text)

In [None]:
df.head()

In [None]:
# Preparing word clouds
from wordcloud import WordCloud
wc=WordCloud(
    width=500,
    height=500,
    min_font_size=10,
    background_color='white'
)

In [None]:
# Render the spam cloud on its own WordCloud instance: generate() returns the
# object it was called on, so building both clouds from the shared `wc` would
# leave spam_wc and ham_wc pointing at the same (last generated) image.
spam_wc=WordCloud(
    width=500,
    height=500,
    min_font_size=10,
    background_color='white'
).generate(df[df['target']==1]['transformed_text'].str.cat(sep=" "))

In [None]:
plt.imshow(spam_wc)

In [None]:
# Render the ham cloud on its own WordCloud instance: generate() returns the
# object it was called on, so building both clouds from the shared `wc` would
# leave spam_wc and ham_wc pointing at the same (last generated) image.
ham_wc=WordCloud(
    width=500,
    height=500,
    min_font_size=10,
    background_color='white'
).generate(df[df['target']==0]['transformed_text'].str.cat(sep=" "))

In [None]:
plt.imshow(ham_wc)

In [None]:
# Flatten every transformed message with target 1 into one list of tokens.
spam_messages = df.loc[df['target'] == 1, 'transformed_text']
spam_corpus = [token for message in spam_messages for token in message.split()]

In [None]:
len(spam_corpus)

In [None]:
from collections import Counter
spam_corpus_dict_freq=dict(Counter(spam_corpus).most_common(30))
print(spam_corpus_dict_freq)

In [None]:
# Bar chart of the 30 most frequent tokens in spam_corpus.
spam_top_words = Counter(spam_corpus).most_common(30)
spam_df = pd.DataFrame(spam_top_words)
spam_df.columns = ['word', 'count']

sns.barplot(data=spam_df, x='word', y='count')

plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Flatten every transformed message with target 0 into one list of tokens.
ham_messages = df.loc[df['target'] == 0, 'transformed_text']
ham_corpus = [token for message in ham_messages for token in message.split()]

In [None]:
ham_corpus_dict_freq=dict(Counter(ham_corpus).most_common(30))
print(ham_corpus_dict_freq)

In [None]:
# Bar chart of the 30 most frequent tokens in ham_corpus.
ham_top_words = Counter(ham_corpus).most_common(30)
ham_df = pd.DataFrame(ham_top_words)
ham_df.columns = ['word', 'count']

sns.barplot(data=ham_df, x='word', y='count')

plt.xticks(rotation='vertical')
plt.show()

### Model Building

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# cv=CountVectorizer()

In [None]:
# x=cv.fit_transform(df['transformed_text']).toarray()
# y=df['target'].values

In [None]:
# print(x.shape)

In [None]:
# print(y.shape)

In [None]:
from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test=train_test_split(
#     x, y,
#     test_size=0.2,
#     random_state=2
# )

In [None]:
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

gnb=GaussianNB()
mnb=MultinomialNB()
bnb=BernoulliNB()

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score

In [None]:
# gnb.fit(x_train, y_train)
# y_pred_gnb=gnb.predict(x_test)

# print(accuracy_score(y_test, y_pred_gnb))
# print(confusion_matrix(y_test, y_pred_gnb))
# print(precision_score(y_test, y_pred_gnb))

In [None]:
# mnb.fit(x_train, y_train)
# y_pred_mnb=mnb.predict(x_test)

# print(accuracy_score(y_test, y_pred_mnb))
# print(confusion_matrix(y_test, y_pred_mnb))
# print(precision_score(y_test, y_pred_mnb))

In [None]:
# bnb.fit(x_train, y_train)
# y_pred_bnb=bnb.predict(x_test)

# print(accuracy_score(y_test, y_pred_bnb))
# print(confusion_matrix(y_test, y_pred_bnb))
# print(precision_score(y_test, y_pred_bnb))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()

In [None]:
# Vectorise the transformed messages into a dense TF-IDF matrix (features)
# and take the encoded target column as the label array.
# NOTE(review): the vectorizer is fitted on every message before the
# train/test split, so its vocabulary and IDF weights are learned from rows
# that end up in the test set — consider fitting on the training split only.
x_tfidf=tfidf.fit_transform(df['transformed_text']).toarray()
y_tfidf=df['target'].values

In [None]:
x_tfidf_train, x_tfidf_test, y_tfidf_train, y_tfidf_test=train_test_split(
    x_tfidf, y_tfidf,
    test_size=0.2,
    random_state=2
)

In [None]:
# Fit Gaussian NB on the TF-IDF training split, then print accuracy,
# confusion matrix and precision for the test split.
gnb.fit(x_tfidf_train, y_tfidf_train)
y_pred_gnb = gnb.predict(x_tfidf_test)

for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_tfidf_test, y_pred_gnb))

In [None]:
# Fit Multinomial NB on the TF-IDF training split, then print accuracy,
# confusion matrix and precision for the test split.
mnb.fit(x_tfidf_train, y_tfidf_train)
y_pred_mnb = mnb.predict(x_tfidf_test)

for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_tfidf_test, y_pred_mnb))

In [None]:
# Fit Bernoulli NB on the TF-IDF training split, then print accuracy,
# confusion matrix and precision for the test split.
bnb.fit(x_tfidf_train, y_tfidf_train)
y_pred_bnb = bnb.predict(x_tfidf_test)

for metric in (accuracy_score, confusion_matrix, precision_score):
    print(metric(y_tfidf_test, y_pred_bnb))

- `Vectorizer`: tfidf

- `Bayesian model`: mnb

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.svm import SVC
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import AdaBoostClassifier
# from sklearn.ensemble import BaggingClassifier
# from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.ensemble import GradientBoostingClassifier
# from xgboost import XGBClassifier

In [None]:
# svc=SVC(kernel='sigmoid', gamma=1.0)
# knc=KNeighborsClassifier()
# mnb=MultinomialNB()
# dtc=DecisionTreeClassifier(max_depth=5)
# lrc=LogisticRegression(solver='liblinear', penalty='l1')
# rfc=RandomForestClassifier(n_estimators=50, random_state=2)
# adc=AdaBoostClassifier(n_estimators=50, random_state=2)
# bgc=BaggingClassifier(n_estimators=50, random_state=2)
# etc=ExtraTreesClassifier(n_estimators=50, random_state=2)
# gbc=GradientBoostingClassifier(n_estimators=50, random_state=2)
# xgb=XGBClassifier(n_estimators=50, random_state=2)

In [None]:
# clfs={
#     'SVC': svc,
#     'KNC': knc,
#     'MNB': mnb,
#     'DTC': dtc,
#     'LRC': lrc,
#     'RFC': rfc,
#     'ADC': adc,
#     'BGC': bgc,
#     'ETC': etc,
#     'GBC': gbc,
#     'XGB': xgb
# }

In [None]:
# def train_classifier(clf, x_train, y_train, x_test, y_test):
#     clf.fit(x_train, y_train)
#     y_pred=clf.predict(x_test)
    
#     accuracy=accuracy_score(y_test, y_pred)
#     precision=precision_score(y_test, y_pred)
#     return accuracy, precision

In [None]:
# train_classifier(svc, x_tfidf_train, y_tfidf_train, x_tfidf_test, y_tfidf_test)

In [None]:
# acc_scores=[]
# prec_scores=[]

# for name, clf in clfs.items():
#     curr_acc, curr_prec=train_classifier(
#         clf,
#         x_tfidf_train, y_tfidf_train,
#         x_tfidf_test, y_tfidf_test
#     )

#     print(f'For {name} accuracy is {curr_acc} and precision is {curr_prec}')
#     acc_scores.append(curr_acc)
#     prec_scores.append(curr_prec)

In [None]:
# performance_df=pd.DataFrame({
#     'Algorithm': clfs.keys(),
#     'Accuracy': acc_scores,
#     'Precision': prec_scores
# })

In [None]:
# performance_df.sort_values(by='Accuracy', ascending=False)

In [None]:
# performance_df.sort_values(by='Precision', ascending=False)

In [None]:
# sns.catplot(
#     x='Algorithm',
#     y='value',
#     hue='variable',
#     data=performance_df.melt(id_vars='Algorithm'),
#     kind='bar',
#     height=5
# )

# plt.ylim(0.5, 1.0)
# plt.xticks(rotation='vertical')
# plt.show()

### Improving Model

#### Updating `max_features` in TfidfVectorizer

In [None]:
# tfidf=TfidfVectorizer(max_features=3000)

In [None]:
# x=tfidf.fit_transform(df['transformed_text']).toarray()

#### Applying `min_max_scaling` on x

In [None]:
# from sklearn.preprocessing import MinMaxScaler
# scaler=MinMaxScaler()
# x=scaler.fit_transform(x)

#### Appending `num_characters` column to x

In [None]:
# x=np.hstack((x, df['num_characters'].values.reshape(-1, 1)))

### Voting Classifier

In [None]:
# from sklearn.ensemble import VotingClassifier

# svc=SVC(kernel='sigmoid', gamma=1.0, probability=True)
# mnb=MultinomialNB()
# etc=ExtraTreesClassifier(n_estimators=50, random_state=2)

In [None]:
# voting=VotingClassifier(
#     estimators=[('svc', svc), ('mnb', mnb), ('etc', etc)],
#     voting='soft'
# )

In [None]:
# voting.fit(x_tfidf_train, y_tfidf_train)

In [None]:
# y_pred=voting.predict(x_tfidf_test)
# print(accuracy_score(y_tfidf_test, y_pred))
# print(precision_score(y_tfidf_test, y_pred))

### Stacking Classifier

In [None]:
# final_estimator=RandomForestClassifier()
# estimators=[('svc', svc), ('mnb', mnb), ('etc', etc)]

In [None]:
# from sklearn.ensemble import StackingClassifier
# clf=StackingClassifier(
#     estimators=estimators,
#     final_estimator=final_estimator
# )

In [None]:
# clf.fit(x_tfidf_train, y_tfidf_train)
# y_pred=clf.predict(x_tfidf_test)

# print(accuracy_score(y_tfidf_test, y_pred))
# print(precision_score(y_tfidf_test, y_pred))

In [None]:
import pickle

# Persist the fitted vectorizer and the MultinomialNB model for later reuse.
# The context managers guarantee each file is flushed and closed, even if
# pickling raises part-way through.
with open('vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('model.pkl', 'wb') as model_file:
    pickle.dump(mnb, model_file)