# Step 01 Importing Libraries

In [44]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import seaborn as sns

# Step 02 Data Collection & Pre-Processing

In [7]:
# Read the mail dataset from its CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="/content/mail_data (1).csv")

In [8]:
# Preview the first five rows to check that the columns loaded as expected
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# Show the DataFrame itself; the rendered output elides the middle rows with "..."
df


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [10]:
# Swap every missing/null cell for an empty string so that later text
# processing never encounters a null value

df = df.where(df.notnull(), other="")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [13]:
# Count the missing values remaining in each column.
# `.sum()` is called at assignment so that `missing_values` holds the
# per-column counts themselves rather than a reference to the bound `sum` method.
missing_values = df.isnull().sum()
missing_values

Category    0
Message     0
dtype: int64

In [16]:
# checking the rows and columns in the data frame:
# `shape` is a (number_of_rows, number_of_columns) tuple
df.shape

(5572, 2)

# Label Encoding

In [19]:
# Label encoding: replace the text labels in the 'Category' column with
# binary codes, spam -> 0 and ham -> 1

for category_name, category_code in (("spam", 0), ("ham", 1)):
    df.loc[df["Category"] == category_name, "Category"] = category_code



spam - 0

ham - 1

In [21]:
# Split the frame into model inputs (message text) and targets (encoded labels)
X, Y = df["Message"], df["Category"]



In [23]:
# Show all the messages ('X') first, then all the labels ('Y')
print(X, Y, sep="\n")

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object
0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object


# Splitting the data into training and testing data

In [24]:
# Hold out 20% of the rows for testing; random_state is fixed at 2
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)


(5572,)


In [29]:
# Report the shape of the full data followed by each train/test piece, one per line
print(*(part.shape for part in (X, X_train, Y_train, X_test, Y_test)), sep="\n")

(5572,)
(4457,)
(4457,)
(1115,)
(1115,)


## Feature Extraction

In [73]:
# TF-IDF vectorizer configured with min_df=1, English stop-word removal
# and lowercasing of the input text
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words="english",
    lowercase=True,
)

In [74]:
# Fit the vectorizer on the training messages only, then reuse the fitted
# vectorizer to encode the test messages
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The labels are still object dtype after encoding; cast them to integers
Y_train, Y_test = Y_train.astype("int"), Y_test.astype("int")

In [75]:
# Print the training feature matrix; each output line is a (row, column) pair followed by the stored weight
print(X_train_features)

  (0, 4334)	0.42941702167641554
  (0, 3958)	0.6161071828926097
  (0, 6586)	0.44333254982109394
  (0, 6927)	0.48935591439341625
  (1, 2121)	0.3573617143022146
  (1, 1428)	0.5869421390016223
  (1, 6971)	0.42812434651556874
  (1, 3168)	0.5869421390016223
  (2, 5115)	0.3408491178137899
  (2, 7353)	0.31988118061968496
  (2, 3852)	0.3408491178137899
  (2, 4884)	0.35749230587184955
  (2, 5695)	0.35749230587184955
  (2, 806)	0.26730249393705324
  (2, 5894)	0.35749230587184955
  (2, 1876)	0.28751725124107325
  (2, 6878)	0.35749230587184955
  (3, 197)	0.36522237107066735
  (3, 3723)	0.16297045459835785
  (3, 2435)	0.26698378141852
  (3, 1825)	0.26858331513730566
  (3, 5231)	0.2266831802864503
  (3, 300)	0.2915969875465198
  (3, 7248)	0.23571908490908416
  (3, 5005)	0.3169028431039865
  :	:
  (4454, 2244)	0.2526916142542512
  (4454, 666)	0.28653660324238944
  (4454, 1575)	0.20946314330145205
  (4454, 1094)	0.24862733340971144
  (4454, 5068)	0.22284357632450164
  (4454, 311)	0.19547195974237946
  

# Model Training

Using Logistic Regression

In [76]:
# Create the classifier; no arguments are passed, so the library defaults apply
model = LogisticRegression()

In [77]:
# Fit the classifier on the TF-IDF training features and their integer labels
model.fit(X=X_train_features, y=Y_train)

Evaluating the trained model

In [78]:
# Score the model against the same data it was trained on
prediction_on_training_data = model.predict(X=X_train_features)
accuracy_on_training_data = accuracy_score(
    y_true=Y_train,
    y_pred=prediction_on_training_data,
)


In [79]:
# Show the training accuracy as a percentage rounded to two decimal places
print("Accuracy On the training data = ", round(accuracy_on_training_data*100 , 2))

Accuracy On the training data =  96.84


In [80]:
# Score the model against the held-out test data
prediction_on_test_data = model.predict(X=X_test_features)
accuracy_on_testing_data = accuracy_score(
    y_true=Y_test,
    y_pred=prediction_on_test_data,
)


In [81]:
# Show the test accuracy as a percentage rounded to two decimal places
print("accuracy on the testing data = " , round(accuracy_on_testing_data*100 , 2))

accuracy on the testing data =  95.25


Building a predictive system

In [89]:
# Sample mail to classify, wrapped in a one-element list because the
# vectorizer's transform() is given a collection of documents
input_mail = ['''Exciting News! You are selected for virtual Internship! 🤩🚀
We are thrilled to announce that you have been successfully get internship for React.JS internship position.🤩

Your Email is your password
Visit Our task management system website : http://interneepk-portal.great-site.net/.

By completing virtual internships through Internee.pk, you'll gain a competitive edge in the job market. Employers value practical experience, and our program equips you with the necessary skills to launch a successful career in computer science. The knowledge, guidance, and mentorship you receive from industry experts will be invaluable in your professional journey.📈

Must follow us on social media platfroms:

Instagram : https://www.instagram.com/internee.pk/
Facebook : https://www.facebook.com/profile.php?id=100093222249320
Linkedin : https://www.linkedin.com/groups/9384003/

Your Joining Letter is mentioned below. Must share it on instagram, Linkedin :)''']

# Encode the mail with the already-fitted TF-IDF vectorizer
input_data_features = feature_extraction.transform(input_mail)

# Predict its label (0 = spam, 1 = ham) and show the raw prediction array
prediction = model.predict(input_data_features)
print(prediction)

# Translate the single prediction into a readable verdict
print("Ham Mail" if prediction[0] != 0 else "Spam Mail")

[1]
Ham Mail
