# Importing Libraries

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

## Import Dataset

In [27]:
# Read the raw SMS dataset; the file is decoded as latin-1.
data = pd.read_csv(
    'Data/spam.csv',
    encoding='latin-1',
)
# Show the first rows as a quick sanity check of the loaded columns.
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Preprocessing

In [28]:
# Remove the three trailing "Unnamed" columns (they are empty in the preview above).
# Reassigning instead of using inplace=True, together with errors='ignore',
# makes this cell idempotent: re-running it after the columns are already gone
# is a no-op instead of raising KeyError.
data = data.drop(
    columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'],
    errors='ignore',
)


In [29]:
# Normalise the message text column (v2) to lower case.
lowercased_messages = data['v2'].str.lower()
data['v2'] = lowercased_messages


## Train Test Split

In [33]:
# Split messages (v2) and labels (v1) into 80% train and the remaining rows
# for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['v2'],
    data['v1'],
    train_size=0.8,
    random_state=0,
)

# Using Logistic Regression

In [50]:
# Convert raw message text into TF-IDF features. The vectorizer is fitted on
# the training split only and then reused, unchanged, for the test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a logistic-regression classifier with default hyper-parameters.
# fit() returns the estimator itself, so construction and training can chain.
lr_classifier = LogisticRegression().fit(X_train_tfidf, y_train)

# Predict on the held-out split and compute the evaluation metrics.
lr_predictions = lr_classifier.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, lr_predictions)
lr_report = classification_report(y_test, lr_predictions)

# Report overall accuracy followed by the per-class breakdown.
print("Logistic Regression Accuracy:", lr_accuracy)
print("Logistic Regression Classification Report:")
print(lr_report)


Logistic Regression Accuracy: 0.9650224215246637
Logistic Regression Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       949
        spam       1.00      0.77      0.87       166

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



## Saving Trained Model 

In [51]:
# Persist the fitted classifier and the fitted TF-IDF vectorizer so that
# predictions can later be made without retraining.
import pickle

artifacts = (
    ('Spam_SMS_Detection', lr_classifier),   # trained model
    ('TfidfVectorizer', tfidf_vectorizer),   # fitted TF-IDF vectorizer
)

# Write each object to its own file, in the order listed above.
for artifact_path, artifact in artifacts:
    with open(artifact_path, 'wb') as handle:
        pickle.dump(artifact, handle)

In [52]:
import pickle

# Alias the fitted logistic-regression classifier under a generic name.
trained_model = lr_classifier

# NOTE(review): this writes to 'Spam_SMS_Detection', the same path the
# previous cell already wrote the classifier to, so the file is overwritten
# with a pickle of the same object.
with open('Spam_SMS_Detection', 'wb') as model_file:
    pickle.dump(trained_model, model_file)

## Checking Model Performance

In [65]:
# Load the model and the vectorizer.
# Context managers guarantee the file handles are closed; the previous
# pickle.load(open(...)) form left both files open until garbage collection.
# NOTE: unpickling can execute arbitrary code, so only load files that were
# produced by a trusted source (here: the saving cells of this notebook).
with open('Spam_SMS_Detection', 'rb') as model_file:
    model = pickle.load(model_file)
with open('TfidfVectorizer', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

# New data: a single hand-written message used as a smoke test.
new_text = ["Congratulations! You've been selText WON to 44255 to prize."]

# Transform new data with the same (already fitted) vectorizer; calling
# transform rather than fit_transform keeps the feature space the model saw
# during training.
new_text_tfidf = tfidf_vectorizer.transform(new_text)

# Make predictions with the reloaded model.
prediction = model.predict(new_text_tfidf)

print(prediction)

['spam']


# Model Score

In [67]:
# Re-compute and report the accuracy of the logistic-regression predictions
# on the held-out test labels.
test_accuracy = accuracy_score(y_test, lr_predictions)
print("Logistic Regression Accuracy:", test_accuracy)


Logistic Regression Accuracy: 0.9650224215246637
