In [3]:
import pandas as pd
import re
import numpy as np
import joblib
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report, accuracy_score

In [4]:
data = pd.read_csv("spam.csv", encoding = 'latin-1')[['v1', 'v2']]
data.columns = ['label', 'text']

In [1]:
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...


True

Preprocessing

In [6]:
lemmatizer = WordNetLemmatizer()

def preprocess(text) :
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    tokens = [t for t in tokens if t not in stopwords.words('english')]
    return tokens

data['tokens'] = data['text'].apply(preprocess)

Training Word2Vec

In [7]:
sentences = data['tokens'].tolist()

text_model = Word2Vec(
    sentences= data['tokens'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=5
)

text_model.save('wv.model')
print('Model Saved and Trained')

print(list(text_model.wv.index_to_key))

Model Saved and Trained


Convert the list of words into vectors

In [8]:
loaded_model = Word2Vec.load('wv.model')

def get_vector(tokens) :
    vectors = [loaded_model.wv[t] for t in tokens if t in loaded_model.wv]
    if (vectors) :
        return np.mean(vectors, axis = 0) 
    else :
        return np.zeros(100)

data['vector'] = data['tokens'].apply(get_vector)
X = np.vstack(data['vector'].values)
y = data['label'].map({'ham' : 0, 'spam' : 1})

Training using DT Classifier

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

clf = DecisionTreeClassifier(max_depth=10, random_state=42)
clf.fit(X_train, y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


Predict Model

In [10]:
y_pred = clf.predict(X_test)

print("accuracy_score: ", accuracy_score(y_test, y_pred))
print("classification_report: ", classification_report(y_test, y_pred))

accuracy_score:  0.947085201793722
classification_report:                precision    recall  f1-score   support

           0       0.97      0.97      0.97       965
           1       0.81      0.80      0.80       150

    accuracy                           0.95      1115
   macro avg       0.89      0.88      0.89      1115
weighted avg       0.95      0.95      0.95      1115



Save the Dumps using Joblib

In [11]:
joblib.dump(clf, "model1")
joblib.dump(text_model, "model2")
print("Models Saved :)")

Models Saved :)


In [12]:
X

array([[-0.09963243,  0.20763028,  0.00847512, ..., -0.22023606,
         0.05990168, -0.02045406],
       [-0.10171638,  0.21404213,  0.00755512, ..., -0.22129525,
         0.05714406, -0.01712266],
       [-0.09369152,  0.19573395,  0.00761401, ..., -0.20834354,
         0.04742436, -0.01411299],
       ...,
       [-0.00751105,  0.01751126, -0.00082241, ..., -0.01591813,
         0.00791299, -0.00153825],
       [-0.10159095,  0.21511663,  0.0091643 , ..., -0.22577582,
         0.05360256, -0.01796653],
       [-0.0668499 ,  0.13396965,  0.00803739, ..., -0.1462663 ,
         0.03822836, -0.0129487 ]])