In [19]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report,f1_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


In [20]:
Data=pd.read_csv('spam.csv',encoding='latin-1')

In [21]:
Data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [22]:
#Dropping uninformative columns
Data.drop(Data.columns[Data.columns.str.contains('Unnamed', case=False)], axis=1, inplace=True)

In [23]:
Data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [24]:
#Renaming the columns for clarification
Data.columns=['label','text']

In [25]:
Data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [26]:
#Encoding the column label spam =1, ham=0
label_encoder=LabelEncoder()
Data['label']=label_encoder.fit_transform(Data['label'])
Data.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [27]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   5572 non-null   int64 
 1   text    5572 non-null   object
dtypes: int64(1), object(1)
memory usage: 87.2+ KB


In [28]:
#Splitting the data
X_train, X_test, y_train, y_test = train_test_split(Data['text'], Data['label'], test_size=0.2, random_state=50)

In [29]:
#Transforming the raw text data into a numerical representation
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


### **Logistic Regression Model**

In [30]:
LR_model=LogisticRegression()
LR_model.fit(X_train_tfidf,y_train)


In [31]:
LR_prediction = LR_model.predict(X_test_tfidf)

In [32]:
accuracy = accuracy_score(y_test, LR_prediction)
print("Accuracy for Logistic Regression:", accuracy)
f1 = f1_score(y_test, LR_prediction)
print("F1-score for Logistic Regression:", f1)

Accuracy for Logistic Regression: 0.9551569506726457
F1-score for Logistic Regression: 0.8175182481751825


In [33]:
print("Classification Report for Logistic Regression:")
print(classification_report(y_test, LR_prediction))

Classification Report for Logistic Regression:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       954
           1       0.99      0.70      0.82       161

    accuracy                           0.96      1115
   macro avg       0.97      0.85      0.90      1115
weighted avg       0.96      0.96      0.95      1115



### **Naive Bayes Model**

In [34]:
#converting the TF-vectors to dense data so we can apply GaussianNB
X_train_dense = X_train_tfidf.toarray()
X_test_dense=X_test_tfidf.toarray()

In [35]:
NB_model=GaussianNB()
NB_model.fit(X_train_dense,y_train)

In [36]:
NB_predict=NB_model.predict(X_test_dense)

In [37]:
accuracy = accuracy_score(y_test, NB_predict)
print("Accuracy for Naive Bayes:", accuracy)
f1 = f1_score(y_test, NB_predict)
print("F1-score for Naive Bayes:", f1)

Accuracy for Naive Bayes: 0.895067264573991
F1-score for Naive Bayes: 0.7022900763358779


In [38]:
print("Classification Report for Naive Bayes:")
print(classification_report(y_test, NB_predict))

Classification Report for Naive Bayes:
              precision    recall  f1-score   support

           0       0.97      0.90      0.94       954
           1       0.59      0.86      0.70       161

    accuracy                           0.90      1115
   macro avg       0.78      0.88      0.82      1115
weighted avg       0.92      0.90      0.90      1115



###Gradient Boost Model


In [39]:
GB_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
GB_model.fit(X_train_tfidf, y_train)
GB_predict = GB_model.predict(X_test_tfidf)

In [40]:
GB_accuracy = accuracy_score(y_test, GB_predict)
GB_f1 = f1_score(y_test, GB_predict, average='weighted')

In [41]:
print("Gradient Boosting - Accuracy:", GB_accuracy, "F1-score:", GB_f1)


Gradient Boosting - Accuracy: 0.95695067264574 F1-score: 0.9537081412494806
