# Installation

In [1]:
! pip install sentence-transformers



## Importing Libraries

In [2]:
import pandas as pd
import numpy as np

## Creating model instance

In [3]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')


In [4]:
data = pd.read_csv("spam.csv",encoding='iso-8859-1')


In [5]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
data[['v1','v2']]

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
data = data[['v1','v2']]

## Exploratory Data Analysis


In [8]:
data['v1'].value_counts()

ham     4825
spam     747
Name: v1, dtype: int64

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      5572 non-null   object
 1   v2      5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [10]:
data.describe()

Unnamed: 0,v1,v2
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [11]:
data['length'] = data['v2'].apply(len)
data.head()

Unnamed: 0,v1,v2,length
0,ham,"Go until jurong point, crazy.. Available only ...",111
1,ham,Ok lar... Joking wif u oni...,29
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,ham,U dun say so early hor... U c already then say...,49
4,ham,"Nah I don't think he goes to usf, he lives aro...",61


In [12]:
data = data[['v1','v2']]
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [13]:
data['v1'] = data['v1'].replace({'ham': 0, 'spam': 1})

In [14]:
data.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Preprocessing

In [15]:
import spacy
import string
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
print(stop_words)


2023-09-26 11:06:26.185804: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-26 11:06:26.358025: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-09-26 11:06:26.359140: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'get', 'well', 'between', 'his', 'now', 'during', 'hereupon', 'towards', 'show', 'almost', 'yourselves', 'where', 'should', 'ca', 'become', 'some', 'forty', 'so', '‘d', 'above', 'doing', 'keep', 'first', 'others', 'would', 'everyone', 'full', 'the', '’ve', 'again', 'elsewhere', 'and', 'anywhere', 'anything', 'across', 'mostly', 'can', 'really', 'give', 'beside', 'hereafter', 'must', 'six', 'whether', 'except', 'what', 'their', 'another', 'serious', 'eleven', 'beyond', 'nine', 'while', 'any', 'becomes', 'front', 'per', 'were', 'but', 'take', 'fifteen', 'why', 'thereupon', 'although', 'call', 'besides', 'anyone', 'say', 'than', 'made', 'i', 'more', 'itself', 'mine', 'two', 'perhaps', 'was', 're', 'else', 'through', 'various', 'otherwise', 'somehow', 'over', 'about', 'someone', 'thus', 'therefore', 'whole', 'n‘t', 'most', "'s", 'hundred', 'all', 'name', 'within', 'meanwhile', 'does', 'may', 'still', '’d', 'often', 'when', 'nor', 'me', 'him', '’re', 'ours', 'since', 'whoever', 'third', 'o

In [16]:
punctuations = string.punctuation
print(punctuations)


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [17]:
def spacy_tokenizer(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    
    sentence = " ".join(mytokens)
    # return preprocessed list of tokens
    return sentence
     


In [18]:
spacy_tokenizer("I love apple")

'love apple'

In [19]:
data['tokenize'] = data['v2'].apply(spacy_tokenizer)


In [20]:
data.head()

Unnamed: 0,v1,v2,tokenize
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy .. available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar ... joke wif u oni ...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun early hor ... u c ...
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think usf live


In [21]:
data['embeddings'] = data['tokenize'].apply(model.encode)

In [22]:
data.head()

Unnamed: 0,v1,v2,tokenize,embeddings
0,0,"Go until jurong point, crazy.. Available only ...",jurong point crazy .. available bugis n great ...,"[-0.031411782, -0.02397296, 0.06677125, -0.047..."
1,0,Ok lar... Joking wif u oni...,ok lar ... joke wif u oni ...,"[-0.066601254, -0.010990367, -0.005665734, -0...."
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...,"[-0.02153208, 0.056793112, 0.01019386, -0.0419..."
3,0,U dun say so early hor... U c already then say...,u dun early hor ... u c ...,"[-0.0233709, 0.027283063, 0.018737765, -0.0214..."
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think usf live,"[0.024279743, -0.14003043, -0.013055384, -0.04..."


In [23]:
X = data['embeddings'].to_list()
y = data['v1'].to_list()

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify=y)

In [25]:
from sklearn.linear_model import LogisticRegression
log_regres = LogisticRegression()
log_regres.fit(X_train,y_train)

In [26]:
from sklearn import metrics
predicted = log_regres.predict(X_test)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))


Logistic Regression Accuracy: 0.9856502242152466
Logistic Regression Precision: 1.0
Logistic Regression Recall: 0.8926174496644296


## Saving model

In [27]:
import joblib

model_filename = 'email_classification.pkl'
joblib.dump(log_regres, model_filename)

print(f"Model saved as {model_filename}")


Model saved as email_classification.pkl
