In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import accuracy_score

In [2]:
# Load the labelled e-mail corpus from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer='mail_data.csv')

In [3]:
# Display the loaded frame as the cell's output.
df

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
...,...,...,...,...
5166,1518,ham,Subject: put the 10 on the ft\r\nthe transport...,0
5167,404,ham,Subject: 3 / 4 / 2000 and following noms\r\nhp...,0
5168,2933,ham,Subject: calpine daily gas nomination\r\n>\r\n...,0
5169,1409,ham,Subject: industrial worksheets for august 2000...,0


In [4]:
# Replace missing values with empty strings so every `text` entry is a string
# by the time it reaches the vectorizer. The previous call omitted the
# replacement argument, so `where` used its default (NaN) and left nulls as
# they were.
data = df.where(pd.notnull(df), '')

In [5]:
# Preview the first 20 rows of the cleaned frame.
data.head(20)

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0
5,2949,ham,Subject: ehronline web address change\r\nthis ...,0
6,2793,ham,Subject: spring savings certificate - take 30 ...,0
7,4185,spam,Subject: looking for medication ? we ` re the ...,1
8,2641,ham,Subject: noms / actual flow for 2 / 26\r\nwe a...,0
9,1870,ham,"Subject: nominations for oct . 21 - 23 , 2000\...",0


In [6]:
# Summarise column dtypes and non-null counts before the labels are re-encoded.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [7]:
# (rows, columns) of the working frame.
data.shape

(5171, 4)

In [8]:
# Re-encode the string labels in place: spam -> 0, ham -> 1.
# Note this is the opposite polarity of the existing `label_num` column,
# which shows ham as 0 and spam as 1 in the displayed rows.
label_codes = {'spam': 0, 'ham': 1}
for label_name, code in label_codes.items():
    data.loc[data['label'] == label_name, 'label'] = code

In [9]:
# Re-inspect the frame after re-encoding `label`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


In [10]:
# Features are the raw message text; the target is the `label` column.
X, Y = data['text'], data['label']

In [11]:
# Display the feature series (message text).
X


0       Subject: enron methanol ; meter # : 988291\r\n...
1       Subject: hpl nom for january 9 , 2001\r\n( see...
2       Subject: neon retreat\r\nho ho ho , we ' re ar...
3       Subject: photoshop , windows , office . cheap ...
4       Subject: re : indian springs\r\nthis deal is t...
                              ...                        
5166    Subject: put the 10 on the ft\r\nthe transport...
5167    Subject: 3 / 4 / 2000 and following noms\r\nhp...
5168    Subject: calpine daily gas nomination\r\n>\r\n...
5169    Subject: industrial worksheets for august 2000...
5170    Subject: important online banking alert\r\ndea...
Name: text, Length: 5171, dtype: object

In [12]:
# Display the target series (`label` column).
Y

0       1
1       1
2       1
3       0
4       1
       ..
5166    1
5167    1
5168    1
5169    1
5170    0
Name: label, Length: 5171, dtype: object

In [13]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [14]:
# Sizes of the full text series and of each split, in that order.
for text_part in (X, X_train, X_test):
    print(text_part.shape)

(5171,)
(4136,)
(1035,)


In [15]:
# Sizes of the full label series and of each split, in that order.
for label_part in (Y, Y_train, Y_test):
    print(label_part.shape)

(5171,)
(4136,)
(1035,)


In [16]:
# Cast the labels to integers so the classifier receives numeric targets.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

# Fit the TF-IDF vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
feature_extraction = TfidfVectorizer(
    min_df=1,
    stop_words='english',
    lowercase=True,
)
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)


In [17]:
# Print the raw training text for a quick look at the split.
print(X_train)

2209    Subject: hplc to wellhead\r\ndaren here is the...
2000    Subject: mobil chemical - hpl meter # 1256 - e...
5030    Subject: revised nom 5 / 5 - eastrans ; revise...
1376    Subject: re : exxon company , usa global # 960...
1564    Subject: your pharmacy nx\r\nwant a cheap pain...
                              ...                        
789     Subject: incr ' ease yo ' ur man ' hood by 4 -...
968     Subject: subscribers receive first notice on r...
1667    Subject: neon for march 28\r\nhere is the neon...
3321    Subject: re : first delivery - pure resources ...
1688    Subject: enhance your chest size\r\nemail is l...
Name: text, Length: 4136, dtype: object


In [18]:
# Print the TF-IDF matrix produced from the training text.
print(X_train_features)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 270337 stored elements and shape (4136, 45076)>
  Coords	Values
  (0, 38946)	0.01685805739459834
  (0, 21882)	0.15822963156663866
  (0, 43279)	0.15279926161032253
  (0, 13656)	0.0442446131354994
  (0, 26040)	0.06237658334565303
  (0, 13860)	0.05933334362320758
  (0, 29166)	0.04619404683559966
  (0, 32538)	0.08215570069958765
  (0, 42165)	0.1301153463744466
  (0, 38303)	0.12444307809779734
  (0, 37119)	0.09423749412823572
  (0, 28593)	0.09000082107218554
  (0, 16156)	0.3284546097301849
  (0, 12663)	0.17173706521192092
  (0, 1179)	0.13387711316973605
  (0, 27743)	0.44166331773844575
  (0, 2645)	0.12444307809779734
  (0, 1177)	0.13387711316973605
  (0, 2844)	0.11843023142166303
  (0, 31384)	0.14556222812251965
  (0, 836)	0.14556222812251965
  (0, 3875)	0.14556222812251965
  (0, 16637)	0.24438399643390496
  (0, 19429)	0.14556222812251965
  (0, 517)	0.14556222812251965
  :	:
  (4135, 18858)	0.08197239345561386
  (4135, 41043)	0.0

In [23]:
# Logistic-regression classifier with all hyperparameters left at their defaults.
model = LogisticRegression()

In [24]:
# Train the classifier on the TF-IDF features of the training split.
model.fit(X_train_features, Y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [21]:
# Accuracy on the training data itself.
# NOTE(review): this cell's execution count (In [21]) is lower than that of the
# model cells above it (In [23]/[24]), so the recorded score may predate the
# current `model` — restart and run all cells in order to confirm.
pred = model.predict(X_train_features)
accu = accuracy_score(Y_train,pred)

In [22]:
# Report the training-set accuracy stored in `accu`.
print ("Accuracy :",accu)

Accuracy : 0.9968568665377177


In [25]:
# Accuracy on the held-out split. Despite its name, `test_data` holds the
# predicted labels for the test features, not input data.
test_data= model.predict(X_test_features)
accuracy=accuracy_score(Y_test,test_data)
print("Accuracy :",accuracy)

Accuracy : 0.9806763285024155


In [34]:
# Classify one hand-pasted e-mail with the trained model.
# The message body is stored in `a`.
a='''
Kaggle
Hi BabaTillu-001,

At Kaggle, we believe in the power of community-driven evaluation. That's why we're thrilled to introduce Kaggle Benchmarks, our new platform making it easier for anyone to test and understand model capabilities.

To showcase the platform, we are unveiling the ICML 2025 AI Experts Benchmark crowdsourced at the ICML conference in Vancouver. We evaluated over 30 tasks that reflect the real-world interests of AI researchers and industry experts on leading models, including Gemini 2.5 Pro, Claude 4 Sonnet, o4 mini, and DeepSeek-R1.

Kaggle Benchmarks Platform with metrics, leaderboards, and customization options

Want to create your own evals to test models at no cost or infrastructure setup? Join the waitlist for early access to custom task creation! Plus, check out kaggle.com/benchmarks featuring 70+ leaderboards like SciCode, GPQA, and Meta’s new MultiLoKo benchmark.'''
# Pass the variable `a`, not the string literal "a": the literal is a
# one-character document, so the e-mail text above would never be classified.
# The vectorizer expects a list of documents, hence the one-element list.
input_mail = [a]
input_data = feature_extraction.transform(input_mail)
ppre = model.predict(input_data)
print(ppre)


# Labels were encoded earlier in this notebook as ham -> 1, spam -> 0.
if ppre[0] == 1:
    print("Ham mail")
else:
    print("spam mail")


[0]
spam mail
