In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Load the SMS messages into a DataFrame with pandas.
# The file is decoded as latin1 (presumably it is not valid UTF-8 - TODO confirm).
csv_path = '/content/spam.csv'
df = pd.read_csv(csv_path, encoding='latin1')

In [None]:
# Inspect the column labels of the freshly loaded frame.
print(df.columns)

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [None]:
# Preview the first five rows of the raw data.
df.head(5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
# Remove the three 'Unnamed' columns so that only v1 and v2 remain.
# Rebinding df (rather than inplace=True) together with errors='ignore' keeps
# this cell idempotent: running it a second time no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')

In [None]:
# Count missing entries per column; any non-zero count would need handling
# before the model is trained.
df.isna().sum()

v1    0
v2    0
dtype: int64

In [None]:
# Same per-column missing-value count as a plain-text printout.
missing_counts = df.isna().sum()
print("Number of missing values in each column:")
print(missing_counts)

Number of missing values in each column:
v1    0
v2    0
dtype: int64


In [None]:
# Collect the distinct values of the label column `v1` (np.unique returns them sorted).
# NOTE(review): `lable` looks like a typo for `label`; the name is left unchanged
# in case other cells reference it.
lable=np.unique(df['v1'])
lable

array(['ham', 'spam'], dtype=object)

In [None]:
# Convert the labels ('ham' and 'spam') to numerical values (0 and 1) for the model to process.
label_mapping = {"ham": 0, "spam": 1}
# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell on an already-encoded column leaves 0/1 intact instead
# of turning every row into NaN (which a plain .map(dict) would do).
df['v1'] = df['v1'].map(lambda value: label_mapping.get(value, value))
# Fail loudly on anything that is neither a known label nor an encoded value.
unexpected_labels = set(df['v1'].unique()) - set(label_mapping.values())
if unexpected_labels:
    raise ValueError(f"Unexpected label values in column 'v1': {unexpected_labels}")

In [None]:
# Features are the raw message text (v2); targets are the labels (v1).
x, y = df['v2'], df['v1']

In [None]:
# Display the frame to check the result of the label encoding in column v1.
df

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [None]:
# x_train is produced by the train/test split, so the split must happen before
# it is printed; otherwise this cell raises NameError on a fresh kernel
# (Restart & Run All). The same test_size/random_state are used wherever the
# split is performed, so repeating it yields the identical partition.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
print("x_train:")
print(x_train)

x_train:
708     To review and KEEP the fantastic Nokia N-Gage ...
4338                   Just got outta class gonna go gym.
5029    Is there coming friday is leave for pongal?do ...
4921    Hi Dear Call me its urgnt. I don't know whats ...
2592    My friend just got here and says he's upping h...
                              ...                        
3772    I came hostel. I m going to sleep. Plz call me...
5191                               Sorry, I'll call later
5226        Prabha..i'm soryda..realy..frm heart i'm sory
5390                           Nt joking seriously i told
860                   In work now. Going have in few min.
Name: v2, Length: 3900, dtype: object


In [None]:
# Partition messages and labels into training and test subsets with
# sklearn's train_test_split: 30% is held out for testing, and the fixed
# random_state makes the partition reproducible.
split_parts = train_test_split(x, y, test_size=0.3, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Show the training labels produced by the split.
print(y_train)

708     1
4338    0
5029    0
4921    0
2592    0
       ..
3772    0
5191    0
5226    0
5390    0
860     0
Name: v1, Length: 3900, dtype: int64


In [None]:
# Turn the message text into a matrix of TF-IDF features, which reflects the
# importance of words in the documents. English stop words are removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# The vectorizer is fitted on the training messages only ...
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
# ... and the test messages are transformed with that already-fitted
# vectorizer, so no information from the test set enters the fit.
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [None]:
# Train a logistic-regression classifier (default settings) on the TF-IDF
# training matrix. fit() returns the estimator itself, so construction and
# training can be chained.
logistic_regression = LogisticRegression().fit(x_train_tfidf, y_train)
# Keep the fitted estimator as the cell's last expression so it is displayed.
logistic_regression

In [None]:
# Second TF-IDF vectorizer (stop words removed, text lower-cased).
# max_df is a document-frequency ceiling. The *integer* 1 is an absolute count
# and discards every term that appears in more than one document, leaving only
# words that occur in a single message. The *float* 1.0 means "100% of
# documents" and keeps the full vocabulary.
feature = TfidfVectorizer(max_df=1.0, stop_words='english', lowercase=True)
# Fit on the training messages only; reuse the fitted vocabulary for the test set.
x_train_tdfit = feature.fit_transform(x_train)
x_test_tdfit = feature.transform(x_test)

In [None]:
# Create a second, untrained logistic-regression model with default settings;
# it is fitted in a later cell.
# LogisticRegression is already imported in the notebook's import cell, so it
# is not re-imported here.
model = LogisticRegression()

In [None]:
# Fit `model` on the second TF-IDF training matrix and immediately predict
# labels for the test matrix (fit() returns the fitted estimator).
# NOTE(review): the following cell assigns y_pred again from
# `logistic_regression`, so these predictions are overwritten before evaluation.
y_pred = model.fit(x_train_tdfit, y_train).predict(x_test_tdfit)

In [None]:
# Predict test-set labels with the `logistic_regression` model. This rebinds
# y_pred, replacing any predictions previously stored under that name.
y_pred = logistic_regression.predict(x_test_tfidf)

In [None]:
# Overall accuracy: the fraction of test messages whose predicted label
# matches the true label.
accuracy = accuracy_score(y_test, y_pred)
# Per-class precision, recall and F1-score, plus the averaged figures.
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

Accuracy: 0.9527511961722488
Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1453
           1       0.98      0.65      0.78       219

    accuracy                           0.95      1672
   macro avg       0.96      0.83      0.88      1672
weighted avg       0.95      0.95      0.95      1672

