<a href="https://colab.research.google.com/github/omid-sakaki-ghazvini/Practices/blob/main/Spam_SMS_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install Dependencies and Setup

<div style="direction:rtl">
<font color='green' size="5px">
 کتابخانه های مورد نیاز را نصب میکنیم
    </font>
</div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
from collections import Counter
import re


from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import EarlyStopping

from keras.layers import Input, Embedding, Activation, Flatten, Dense
from keras.layers import Conv1D, MaxPooling1D, Dropout, LSTM
from keras.models import Model
from keras import metrics
from tensorflow.keras.utils import to_categorical

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import warnings

warnings.filterwarnings('ignore')

<div style="direction:rtl">
<font color='green' size="5px">
    از لینک زیر دیتاست را دانلود کرده و در پوشه هم مسیر همین ژوپیتر نوت بوک قرار دهید یا خط فرمان زیر را اجرا نمایید
    </font>
</div>

In [None]:
import kagglehub

# Fetch the latest version of the dataset; `path` is the directory kagglehub returns.
dataset_handle = "mariumfaheem666/spam-sms-classification-using-nlp"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

## https://www.kaggle.com/datasets/mariumfaheem666/spam-sms-classification-using-nlp

# 2. Load Data

<div style="direction:rtl">
<font color='green' size="5px">
توسط خط فرمان زیر، دیتا را فراخوانی میکنیم
    </font>
</div>

In [None]:
# Read the CSV from the directory returned by kagglehub (`path`, set in the download
# cell) instead of a hard-coded '/kaggle/input/...' location, so the notebook also
# runs outside Kaggle (e.g. on Colab or a local machine).
df = pd.read_csv(f"{path}/Spam_SMS.csv")
df.head()

# 3. Data preparation

In [None]:
# Number of missing entries in each column of the dataset.
missing_values_count = df.isna().sum()
missing_values_count

In [None]:
def clean_str(string):
    """Normalize one raw message for tokenization.

    Steps, in order:
      1. Replace every character other than letters, digits and ``( ) , ! ? ' ` ``
         with a space.
      2. Split the clitics ``'s 've n't 're 'd 'll`` off the preceding word
         (e.g. ``don't`` -> ``do n't``).
      3. Surround ``, ! ( ) ?`` with spaces so each becomes its own token.
      4. Collapse runs of whitespace, strip the ends and lower-case the result.

    Args:
        string: the message text (a ``str``).

    Returns:
        The cleaned, lower-cased string.
    """
    # Keep only the whitelisted characters; everything else becomes a space.
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)

    # Plain str.replace is used for the literal substitutions below. The previous
    # re.sub replacements " \( ", " \) " and " \? " are unknown escapes that re.sub
    # leaves untouched, so they inserted literal backslashes into the cleaned text.
    for clitic in ("'s", "'ve", "n't", "'re", "'d", "'ll"):
        string = string.replace(clitic, " " + clitic)
    for mark in (",", "!", "(", ")", "?"):
        string = string.replace(mark, " " + mark + " ")

    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

# Run the cleaner over every message and overwrite the text column with the result.
df['Message'] = df['Message'].apply(clean_str)

In [None]:
# Message text is the input, the class column is the target.
X, Y = df['Message'], df['Class']

print('Number of Dataset sentence', X.shape)
print('Number of Dataset labels', Y.shape)

In [None]:
# Tally how many messages carry each class label.
cnt = dict(Counter(Y))
print(cnt)

sns.countplot(data=df, x='Class');

In [None]:
# Donut chart of the class proportions computed in `cnt`.
labels = list(cnt)
sizes = list(cnt.values())
colors = ['#3fba36', '#66b3ff', '#ffcc99', '#ff9999', '#d44444']

fig1, ax1 = plt.subplots()
ax1.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)

# A white disc drawn over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
ax1.add_artist(centre_circle)

ax1.axis('equal')
plt.tight_layout()
plt.show()

In [None]:
# Vocabulary size limit handed to the word-level tokenizer.
max_fatures = 2000

messages = df['Message'].values

tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(messages)

# Turn each message into a sequence of word indices, then bring every sequence
# to a fixed length of 100.
X = pad_sequences(tokenizer.texts_to_sequences(messages), maxlen=100)

In [None]:
# Convert the class column into integer codes for the classifiers.
class_column = df['Class']
le = LabelEncoder()
Y = le.fit_transform(class_column)

In [None]:
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """Draw a confusion matrix as an annotated heat map.

    Args:
        cm: 2-D NumPy array holding the confusion matrix counts.
        target_names: class names used as tick labels on both axes, or ``None``
            to leave the default ticks.
        title: figure title.
        cmap: matplotlib colormap; the 'Blues' colormap is used when ``None``.
        normalize: when true, each cell is annotated with its share of the row
            total; otherwise the raw value is printed.

    The accuracy (trace / total) and the misclassification rate (1 - accuracy)
    are appended to the x-axis label. The heat map itself is drawn from the
    values passed in; only the cell annotations are normalized.
    """
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    colormap = plt.get_cmap('Blues') if cmap is None else cmap

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=colormap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        positions = np.arange(len(target_names))
        plt.xticks(positions, target_names, rotation=45)
        plt.yticks(positions, target_names)

    # Choose the annotation format and the light/dark text cut-off once,
    # instead of branching inside the cell loop.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cell_format = "{:0.4f}"
        thresh = cm.max() / 1.5
    else:
        cell_format = "{:,}"
        thresh = cm.max() / 2

    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            plt.text(col, row, cell_format.format(value),
                     horizontalalignment="center",
                     color="white" if value > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()

In [None]:
# Collects the metrics of each model, keyed by model name.
results = dict()

# 4. Split Data

In [None]:
# Hold out 20% of the samples for testing. The split is seeded so that a
# Restart & Run All reproduces the same metrics, and stratified on the labels
# so both subsets keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

# 5. ML models

## 5.1. XGB

In [None]:
# Train XGBoost on the padded word-index sequences and predict the test split.
XGB = XGBClassifier()
XGB.fit(X_train, y_train)
prediction = XGB.predict(X_test)

results['XGB'] = {
    'Accuracy': accuracy_score(y_test, prediction),
}

# scikit-learn's metric functions take (y_true, y_pred) in that order. With the
# true labels first, rows are true classes and columns are predictions, which
# matches the 'True label' / 'Predicted label' axes drawn by plot_confusion_matrix.
cm = confusion_matrix(y_test, prediction)
print(cm)
plot_confusion_matrix(cm,
                      target_names=['ham', 'spam'],
                      title='XGB Classifier',
                      cmap=None,
                      normalize=True)

# Same ordering here; swapping the two would exchange precision and recall.
sk_report = classification_report(
    y_true=y_test,
    y_pred=prediction,
    digits=2)
print(sk_report)

## 5.2. LGBM

In [None]:
# Train LightGBM on the padded word-index sequences and predict the test split.
LGBM = LGBMClassifier()
LGBM.fit(X_train, y_train)
prediction = LGBM.predict(X_test)

results['LGBM'] = {
    'Accuracy': accuracy_score(y_test, prediction),
}

# scikit-learn's metric functions take (y_true, y_pred) in that order. With the
# true labels first, rows are true classes and columns are predictions, which
# matches the 'True label' / 'Predicted label' axes drawn by plot_confusion_matrix.
cm = confusion_matrix(y_test, prediction)
print(cm)
plot_confusion_matrix(cm,
                      target_names=['ham', 'spam'],
                      title='LGBM Classifier',
                      cmap=None,
                      normalize=True)

# Same ordering here; swapping the two would exchange precision and recall.
sk_report = classification_report(
    y_true=y_test,
    y_pred=prediction,
    digits=2)
print(sk_report)

# 6. Deep Model

## 6.1. Model Architecture

In [None]:
def char_cnn_model(text, labels, num_epochs):
    """Build, compile and train a character-level 1-D CNN classifier.

    Besides its arguments, the function reads these notebook-level names, which
    must be defined before it is called: ``input_size``, ``conv_layers``,
    ``fully_connected_layers``, ``num_of_classes``, ``dropout_p``, ``optimizer``
    and ``loss_type``.

    Args:
        text: collection of message strings.
        labels: integer class labels; they are one-hot encoded internally.
        num_epochs: number of training epochs.

    Returns:
        A tuple ``(x_train, x_test, y_train, y_test, hist, model)``: the 90/10
        split of the encoded data, the history returned by ``model.fit`` and
        the trained model. The held-out split is also passed to ``fit`` as the
        validation data.
    """
    # Character-level encoding, padded/truncated to `input_size` positions.
    char_tokenizer = Tokenizer(lower=True, char_level=True, oov_token='UNK')
    char_tokenizer.fit_on_texts(text)
    data = pad_sequences(char_tokenizer.texts_to_sequences(text), maxlen=input_size)
    labels = to_categorical(labels)

    vocab_size = len(char_tokenizer.word_index)

    x_train, x_test, y_train, y_test = train_test_split(
        data, labels, test_size=0.1, random_state=42
    )

    # Initial embedding matrix: row 0 stays all-zero and the row for
    # character index i is the one-hot vector with a 1 at position i - 1.
    embedding_weights = np.zeros((vocab_size + 1, vocab_size))
    for char_index in char_tokenizer.word_index.values():
        embedding_weights[char_index, char_index - 1] = 1

    embedding_layer = Embedding(vocab_size + 1, vocab_size, input_length=input_size, weights=[embedding_weights])

    # Network: embedding -> conv blocks -> flatten -> dense blocks -> softmax.
    inputs = Input(shape=(input_size,), name='input', dtype='int64')
    features = embedding_layer(inputs)

    for filter_num, filter_size, pooling_size in conv_layers:
        features = Conv1D(filter_num, filter_size)(features)
        features = Dropout(dropout_p)(features)
        features = Activation('relu')(features)
        # A pooling size of -1 means "no pooling after this conv block".
        if pooling_size != -1:
            features = MaxPooling1D(pool_size=pooling_size)(features)
    features = Flatten()(features)

    for dense_size in fully_connected_layers:
        features = Dense(dense_size, activation='relu')(features)
        features = Dropout(dropout_p)(features)

    predictions = Dense(num_of_classes, activation='softmax')(features)

    model = Model(inputs=inputs, outputs=predictions)
    model.compile(optimizer=optimizer, loss=loss_type, metrics=['accuracy'])
    hist = model.fit(
        x_train,
        y_train,
        epochs=num_epochs,
        validation_data=(x_test, y_test),
        batch_size=100,
    )

    return x_train, x_test, y_train, y_test, hist, model

## 6.2. Train Model

In [None]:
# Integer-encode the class column; these codes are the targets passed to char_cnn_model.
class_column = df['Class']
le = LabelEncoder()
labels = le.fit_transform(class_column)

In [None]:
# Hyper-parameters read by char_cnn_model.

# Input / output
input_size = 100        # sequence length each message is padded or truncated to
num_of_classes = 2      # width of the softmax output layer

# Architecture
# One entry per convolution block: [number of filters, kernel size, pooling size];
# a pooling size of -1 skips the pooling layer for that block.
conv_layers = [[256, 3, 3]]
fully_connected_layers = [128]  # units of each dense layer after flattening
dropout_p = 0.5                 # dropout rate used after conv and dense layers

# Training
optimizer = 'adam'
loss_type = 'categorical_crossentropy'

In [None]:
# Train the character-level CNN for 25 epochs and keep the data split, history and model.
(x_train, x_test, y_train, y_test, hist, CNN_model) = char_cnn_model(
    text=df['Message'],
    labels=labels,
    num_epochs=25,
)

In [None]:
prediction = CNN_model.predict(x_test)

# Both the one-hot targets and the softmax outputs are reduced to class indices.
y_true_classes = np.argmax(y_test, axis=-1)
y_pred_classes = np.argmax(prediction, axis=-1)

results['CNN'] = {
    'Accuracy': accuracy_score(y_true_classes, y_pred_classes),
}

# scikit-learn's metric functions take (y_true, y_pred) in that order. With the
# true labels first, rows are true classes and columns are predictions, which
# matches the 'True label' / 'Predicted label' axes drawn by plot_confusion_matrix.
cm = confusion_matrix(y_true_classes, y_pred_classes)
print(cm)

plot_confusion_matrix(cm,
                      target_names=['ham', 'spam'],
                      title='CNN',
                      cmap=None,
                      normalize=True)

# Same ordering here; swapping the two would exchange precision and recall.
sk_report = classification_report(y_true_classes, y_pred_classes)
print(sk_report)

In [None]:
cnn = hist

# Derive the x ticks from the number of recorded epochs. The ticks used to be
# hard-coded to 0..10, which left most of the axis unlabelled for a 25-epoch run.
num_recorded_epochs = len(cnn.history['accuracy'])
tick_step = max(1, num_recorded_epochs // 5)
epoch_ticks = np.arange(0, num_recorded_epochs + 1, tick_step)

# The figure size is passed when the figure is created; assigning
# rcParams['figure.figsize'] after plt.figure() does not resize that figure.
plt.figure(0, figsize=(8, 6))
plt.plot(cnn.history['accuracy'], 'r')
plt.plot(cnn.history['val_accuracy'], 'g')
plt.xticks(epoch_ticks)
plt.xlabel("Num of Epochs")
plt.ylabel("Accuracy")
plt.title("Training Accuracy vs Validation Accuracy")
plt.legend(['train', 'validation'])

plt.figure(1, figsize=(8, 6))
plt.plot(cnn.history['loss'], 'r')
plt.plot(cnn.history['val_loss'], 'g')
plt.xticks(epoch_ticks)
plt.xlabel("Num of Epochs")
plt.ylabel("Loss")
plt.title("Training Loss vs Validation Loss")
plt.legend(['train', 'validation'])

plt.show()

# 7. Result

In [None]:
# Transpose so each model is a row and each metric is a column.
results_df = pd.DataFrame(results).transpose()
results_df

In [None]:
# Compare the collected metrics across the models.
plt.figure(figsize=(10, 5))
sns.lineplot(data=results_df);