In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE  


In [3]:
# Load the raw message dataset; the file is decoded with the latin1 codec.
df = pd.read_csv(
    'spam.csv',
    encoding='latin1',
)


In [4]:
# Give the label column (v1) and the text column (v2) readable names,
# then print the resulting frame.
df = df.rename(columns={'v1': 'Category', 'v2': 'Message'})
print(df)


     Category                                            Message Unnamed: 2  \
0         ham  Go until jurong point, crazy.. Available only ...        NaN   
1         ham                      Ok lar... Joking wif u oni...        NaN   
2        spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3         ham  U dun say so early hor... U c already then say...        NaN   
4         ham  Nah I don't think he goes to usf, he lives aro...        NaN   
...       ...                                                ...        ...   
5567     spam  This is the 2nd time we have tried 2 contact u...        NaN   
5568      ham              Will Ì_ b going to esplanade fr home?        NaN   
5569      ham  Pity, * was in mood for that. So...any other s...        NaN   
5570      ham  The guy did some bitching but I acted like i'd...        NaN   
5571      ham                         Rofl. Its true to its name        NaN   

     Unnamed: 3 Unnamed: 4  
0           NaN       

In [5]:
# Keep every non-missing cell as is and put an empty string where a value is missing.
data = df.where(df.notna(), other='')


In [6]:
# Encode the text labels as integers: spam -> 0, ham -> 1.
#
# The whole column is mapped in one step so the result is a genuine integer
# column, rather than writing ints one mask at a time into a text column
# (which leaves it object-typed and may be rejected outright for string-typed
# columns by newer pandas versions).
# The dtype guard makes the cell a no-op when it is re-run after encoding.
if not pd.api.types.is_integer_dtype(data['Category']):
    encoded_category = data['Category'].map({'spam': 0, 'ham': 1})

    # `map` yields NaN for any label that is neither 'spam' nor 'ham'; fail
    # here with a clear message instead of at a later integer cast.
    unmapped = encoded_category.isna()
    if unmapped.any():
        unexpected = data.loc[unmapped, 'Category'].unique().tolist()
        raise ValueError(f"Unexpected Category values: {unexpected}")

    data['Category'] = encoded_category.astype('int64')


In [7]:
# Print the frame after missing values were blanked and the Category labels
# were replaced with 0 (spam) / 1 (ham).
print(data)

     Category                                            Message Unnamed: 2  \
0           1  Go until jurong point, crazy.. Available only ...              
1           1                      Ok lar... Joking wif u oni...              
2           0  Free entry in 2 a wkly comp to win FA Cup fina...              
3           1  U dun say so early hor... U c already then say...              
4           1  Nah I don't think he goes to usf, he lives aro...              
...       ...                                                ...        ...   
5567        0  This is the 2nd time we have tried 2 contact u...              
5568        1              Will Ì_ b going to esplanade fr home?              
5569        1  Pity, * was in mood for that. So...any other s...              
5570        1  The guy did some bitching but I acted like i'd...              
5571        1                         Rofl. Its true to its name              

     Unnamed: 3 Unnamed: 4  
0                     

In [None]:
# Model input is the message text; the target is the encoded category.
X, Y = data['Message'], data['Category']


In [None]:
# Print the message-text series used as model input.
print(X)


In [None]:
# Print the encoded label series used as the target.
print(Y)

In [None]:
# Hold out 20% of the messages for evaluation, with a fixed seed for
# reproducibility. `stratify=Y` keeps the spam/ham proportion the same in the
# training and test portions, so the held-out score is not skewed by an
# unlucky draw of the rarer class (the notebook later applies SMOTE, which
# suggests the classes are imbalanced).
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
    stratify=Y,
)


In [None]:
# TF-IDF text vectorizer: lowercases input, drops English stop words, and
# keeps every term that appears in at least one document.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)

In [None]:
# Fit the vectorizer on the training messages only, then reuse that fitted
# vocabulary to transform the test messages.
X_train_features = feature_extraction.fit_transform(raw_documents=X_train)
X_test_features = feature_extraction.transform(raw_documents=X_test)

In [None]:
# Cast both label series to an integer dtype before they reach the resampler
# and the classifier.
Y_train, Y_test = Y_train.astype('int'), Y_test.astype('int')

In [None]:
# Resample the training features/labels with SMOTE (seeded for repeatability).
# Only the training portion is resampled; the test portion is left untouched.
smote = SMOTE(random_state=42)
X_train_resampled, Y_train_resampled = smote.fit_resample(
    X_train_features,
    Y_train,
)

In [None]:
# Logistic regression with default settings, trained on the resampled
# training set.
model = LogisticRegression()
model.fit(
    X_train_resampled,
    Y_train_resampled,
)

In [None]:
# Training accuracy. Note this is measured on the SMOTE-resampled training
# set (the same data the model was fitted on), not on the original split.
prediction_on_training_data = model.predict(X_train_resampled)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train_resampled,
    y_pred=prediction_on_training_data,
)

In [None]:
# Accuracy on the held-out test messages, which were not resampled.
prediction_on_test_data = model.predict(X_test_features)
accuracy_on_test_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)

In [None]:
# Report both accuracies as percentages with two decimal places.
# The training figure is computed on the resampled training set.
print(f"Accuracy on training data: {accuracy_on_training_data * 100:.2f}%")
print(f"Accuracy on test data: {accuracy_on_test_data * 100:.2f}%")

In [None]:
# A single hand-written sample message, wrapped in a list because the
# vectorizer expects an iterable of documents.
input_your_mail = [
    "Congratulations! You've won a prize! Click here to claim your $1000 gift card now: bit.ly/freereward",
]

# Reuse the already-fitted vectorizer so the features line up with the model.
input_data_features = feature_extraction.transform(raw_documents=input_your_mail)

In [None]:
# Classify the sample message and show the raw predicted label array.
prediction = model.predict(input_data_features)
print(prediction)

# Label encoding in this notebook: 1 means ham; any other value is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')