In [192]:
#Importing libraries
import pandas as pd

In [193]:
# Load the raw SMS dataset from the CSV next to the notebook.
# NOTE(review): encoding="latin1" is presumably needed because the file is not valid UTF-8 — confirm.
data = pd.read_csv("spam_data.csv", encoding="latin1")

In [194]:
# Summarize columns, non-null counts and dtypes to spot features that are unusable
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


# ***Data Cleaning***

In [195]:
# Discard the three mostly-empty "Unnamed" columns; only v1 and v2 are kept
data = data.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])

In [196]:
# Give the two remaining columns descriptive names
data = data.rename(columns={'v1': 'Category', 'v2': 'Messages'})

#### *Checking for null values*

In [197]:
# Count missing values per column
data.isna().sum()

Category    0
Messages    0
dtype: int64

In [198]:
# Class balance: number of messages per label
data['Category'].value_counts()

Category
ham     4825
spam     747
Name: count, dtype: int64

*Converting Message category into binary format: Spam->1 and Not-Spam->0*


In [199]:
# Encode labels as integers: spam -> 1, ham -> 0.
# Series.map turns any value missing from the mapping into NaN, so validate
# before overwriting the column. This also stops an accidental re-run of the
# cell (when the column already holds 0/1) from silently wiping the labels.
category_codes = data['Category'].map({'spam': 1, 'ham': 0})
if category_codes.isna().any():
    raise ValueError(
        "Unexpected values in 'Category' (expected only 'spam'/'ham'); "
        "has this cell already been run?"
    )
data['Category'] = category_codes

In [200]:
# Inspect the frame after label encoding
data

Unnamed: 0,Category,Messages
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


# ***Text Preprocessing***

In [201]:
# Importing preprocessing libraries
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [202]:
# NLTK normalisers, created once and reused by preprocess_text for every token
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [203]:
# Build the stopword lookup once. stopwords.words('english') returns a list, so
# calling it inside the per-token filter rebuilds that list and scans it
# linearly for every token of every message.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one message before vectorization.

    Steps: lowercase the text, tokenize it with ``nltk.word_tokenize``, drop
    tokens that are English stopwords, then pass each remaining token through
    the Porter stemmer followed by the WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The processed tokens joined with single spaces.

    Notes
    -----
    The NLTK data used here (tokenizer models, stopword list, WordNet) must
    already be available; no ``nltk.download`` call is visible in this notebook.
    """
    # Tokenize the lowercased text so the stopword comparison is case-insensitive
    tokens = nltk.word_tokenize(text.lower())

    # Filter on the original token, then stem first and lemmatize the stem
    # (same order as before, so the produced features are unchanged)
    normalised = [
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    ]

    return ' '.join(normalised)

In [204]:
# Clean every message with preprocess_text.
# NOTE: this overwrites the 'Messages' column it reads from, so re-running the
# cell would preprocess already-cleaned text a second time.
data['Messages'] = data['Messages'].apply(preprocess_text)

In [205]:
# Inspect the frame after text preprocessing
data

Unnamed: 0,Category,Messages
0,0,"go jurong point , crazi .. avail bugi n great ..."
1,0,ok lar ... joke wif u oni ...
2,1,free entri 2 wkli comp win fa cup final tkt 21...
3,0,u dun say earli hor ... u c alreadi say ...
4,0,"nah n't think goe usf , live around though"
...,...,...
5567,1,2nd time tri 2 contact u. u å£750 pound prize ...
5568,0,ì_ b go esplanad fr home ?
5569,0,"piti , * mood . ... suggest ?"
5570,0,guy bitch act like 'd interest buy someth el n...


In [206]:
# Features are the cleaned message text; the target is the 0/1 spam label
x, y = data["Messages"], data["Category"]

### *Splitting the data*


In [207]:
from sklearn.model_selection import train_test_split

In [208]:
# Hold out 20% of the messages for evaluation; the seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

## ***Vectorization***

In [209]:
# Creating TF-IDF vectors for the messages.
# The vectorizer is fitted on the training messages only; the test messages are
# transformed with that already-fitted vectorizer.
# NOTE(review): x_train / x_test are rebound from raw text to the vectorized
# output here, so this cell presumably cannot be re-run without first
# re-running the split cell — confirm.
vectorizer = TfidfVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)


# ***Model Training***

In [210]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,classification_report
from sklearn.naive_bayes import MultinomialNB

### ***Logistic Regression***

In [211]:
# Fit logistic regression on the TF-IDF training features, then score the held-out split
lr = LogisticRegression().fit(x_train, y_train)
lr_pred = lr.predict(x_test)

print(f"LR Accuracy: {accuracy_score(y_test, lr_pred)}")
print(classification_report(y_test, lr_pred))

LR Accuracy: 0.95695067264574
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       0.96      0.71      0.82       150

    accuracy                           0.96      1115
   macro avg       0.96      0.85      0.90      1115
weighted avg       0.96      0.96      0.95      1115



### ***Support Vector***

In [212]:
# Fit a support vector classifier with default settings, then score the held-out split
svc = SVC().fit(x_train, y_train)
svc_pred = svc.predict(x_test)

print(f"svc Accuracy: {accuracy_score(y_test, svc_pred)}")
print(classification_report(y_test, svc_pred))

svc Accuracy: 0.9802690582959641
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.86      0.92       150

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### ***Naive Bayes***

In [213]:
# Fit multinomial naive Bayes on the TF-IDF training features, then score the held-out split
nb = MultinomialNB().fit(x_train, y_train)
nb_pred = nb.predict(x_test)

print(f"nb Accuracy: {accuracy_score(y_test, nb_pred)}")
print(classification_report(y_test, nb_pred))

nb Accuracy: 0.967713004484305
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.76      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.97      1115



### ***Random Forest***

In [214]:
# Random forests are stochastic, so fix the seed (same value as the train/test
# split) to make the reported metrics reproducible on a fresh run.
# NOTE: seeding may shift the scores slightly from an earlier unseeded run.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train,y_train)

# Evaluate on the held-out split
rf_pred = rf.predict(x_test)
print(f"rf Accuracy: {accuracy_score(y_test,rf_pred)}")
print(classification_report(y_test, rf_pred))

rf Accuracy: 0.9802690582959641
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.86      0.92       150

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115



### ***Decision Tree***

In [215]:
# Decision-tree fitting can vary between runs when no seed is given, so fix the
# seed (same value as the train/test split) to make the metrics reproducible.
# NOTE: seeding may shift the scores slightly from an earlier unseeded run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(x_train,y_train)

# Evaluate on the held-out split
dt_pred = dt.predict(x_test)
print(f"dt Accuracy: {accuracy_score(y_test,dt_pred)}")
print(classification_report(y_test, dt_pred))

dt Accuracy: 0.9650224215246637
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       965
           1       0.90      0.83      0.87       150

    accuracy                           0.97      1115
   macro avg       0.94      0.91      0.92      1115
weighted avg       0.96      0.97      0.96      1115



### *Summary of the Models Trained*

In [216]:
# One row per model: its short name and its accuracy on the held-out split
summary = pd.DataFrame(
    [
        (model_name, accuracy_score(y_test, model_pred))
        for model_name, model_pred in (
            ('LR', lr_pred),
            ('NB', nb_pred),
            ('RF', rf_pred),
            ('DT', dt_pred),
            ('SVC', svc_pred),
        )
    ],
    columns=['Models', 'Accuracy'],
)

In [217]:
# Show the accuracy comparison table
summary

Unnamed: 0,Models,Accuracy
0,LR,0.956951
1,NB,0.967713
2,RF,0.980269
3,DT,0.965022
4,SVC,0.980269


*Conclusion/Result: Of the five models trained (LR, NB, RF, DT and SVC), SVC and Random Forest tie for the highest test accuracy at about 98%; SVC is the model used for the prediction below.*

# ***Prediction***

In [218]:
# Sample message to classify, held in a one-element list
new_message = ["XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL"]

# Apply the same preprocessing that was applied to the training messages
new_message = preprocess_text(new_message[0])  # rebinds new_message: list -> cleaned str

# Vectorize with the already-fitted TF-IDF vectorizer (transform only, no refit)
X_new = vectorizer.transform([new_message])

# Predict with the trained SVC; per the label encoding earlier in the notebook, 1 = spam and 0 = ham
predicted = svc.predict(X_new)

# Print the predicted label
print("Predicted category:", predicted[0])

Predicted category: 1
