### Dataset link: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset/data

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the SMS spam CSV. Strict UTF-8 decoding is attempted first; if pandas
# raises UnicodeDecodeError, the same file is re-read as Latin-1 instead.
# NOTE(review): text decoded via the Latin-1 fallback may contain mis-rendered
# characters — confirm the file's real encoding if exact text matters.
try:
    df = pd.read_csv("spam.csv", encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv("spam.csv", encoding="latin1")


In [5]:
# Preview the first rows to see which columns were loaded and what they hold.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [14]:
# Column "v2" carries the message text (model input) and "v1" the ham/spam
# label (prediction target).
X, y = df["v2"], df["v1"]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [15]:
# Show the feature Series (raw message text) as a quick sanity check.
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: v2, Length: 5572, dtype: object

In [16]:
# Show the label Series (ham/spam) as a quick sanity check.
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: v1, Length: 5572, dtype: object

## TF-IDF

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with library-default settings.
vectorizer = TfidfVectorizer()

# Fit on the training messages only and convert them to a TF-IDF matrix.
X_train_tfidf = vectorizer.fit_transform(X_train)

# Convert the test messages with the already-fitted vectorizer (no refitting).
X_test_tfidf = vectorizer.transform(X_test)


## Naive Bayes Model

In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Train a multinomial Naive Bayes classifier (library defaults) on the TF-IDF
# training matrix. fit() returns the fitted estimator, so creation and
# training are chained into one statement.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)

# Label the held-out messages.
y_pred_nb = nb_model.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test split.
print("Naive Bayes Classification Report:")
print(classification_report(y_test, y_pred_nb))


Naive Bayes Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.72      0.84       150

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier (library defaults) on the TF-IDF
# training matrix. fit() returns the fitted estimator, so creation and
# training are chained into one statement.
lr_model = LogisticRegression().fit(X_train_tfidf, y_train)

# Label the held-out messages.
y_pred_lr = lr_model.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test split.
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       0.99      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



## SVM

In [21]:
from sklearn.svm import SVC

# Train a support-vector classifier (library defaults) on the TF-IDF training
# matrix. fit() returns the fitted estimator, so creation and training are
# chained into one statement.
svm_model = SVC().fit(X_train_tfidf, y_train)

# Label the held-out messages.
y_pred_svm = svm_model.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test split.
print("Support Vector Machine Classification Report:")
print(classification_report(y_test, y_pred_svm))


Support Vector Machine Classification Report:
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       1.00      0.87      0.93       150

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



## Predictions

In [32]:
# Hand-written message intended as a spam example. transform() takes a
# collection of documents, hence the one-element list; the vectorizer fitted
# earlier is reused so the features line up with what the models were trained on.
spam_text_tfidf = vectorizer.transform(
    ["Congratulations! You have won a $1000 cash prize. Call now to claim your prize"]
)

# Hand-written message intended as an ordinary (ham) example.
ham_text_tfidf = vectorizer.transform(
    ["Thanks for the help today. I really appreciate it. See you at the meeting."]
)

### Naive Bayes

In [34]:
# Classify both sample messages with the Naive Bayes model.
spam_prediction = nb_model.predict(spam_text_tfidf)
ham_prediction = nb_model.predict(ham_text_tfidf)

# Spam example first, then a separator, then the ham example.
print(spam_prediction)
print("-------------------------------------------")
print(ham_prediction)

['spam']
-------------------------------------------
['ham']


### Logistic Regression

In [35]:
# Classify both sample messages with the logistic-regression model.
spam_prediction = lr_model.predict(spam_text_tfidf)
ham_prediction = lr_model.predict(ham_text_tfidf)

# Spam example first, then a separator, then the ham example.
print(spam_prediction)
print("-------------------------------------------")
print(ham_prediction)

['spam']
-------------------------------------------
['ham']


### SVM

In [36]:
# Classify both sample messages with the SVM model.
spam_prediction = svm_model.predict(spam_text_tfidf)
ham_prediction = svm_model.predict(ham_text_tfidf)

# Spam example first, then a separator, then the ham example.
print(spam_prediction)
print("-------------------------------------------")
print(ham_prediction)

['spam']
-------------------------------------------
['ham']
