In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer  # for converting message to binary
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
data = pd.read_csv('mail_data.csv')

In [3]:
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# removing null with empty str
data1=data.where((pd.notnull(data)),"")

In [5]:
data1.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# rows n colum
data1.shape

(5572, 2)

In [7]:
#coverting str to bits(spam with 0 and ham with 1)
data1.loc[data1['Category'] == 'spam', 'Category',] =0
data1.loc[data1['Category'] == 'ham', 'Category',] =1

In [8]:
data1

Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,0,This is the 2nd time we have tried 2 contact u...
5568,1,Will ü b going to esplanade fr home?
5569,1,"Pity, * was in mood for that. So...any other s..."
5570,1,The guy did some bitching but I acted like i'd...


In [9]:
# dividing data
x=data1['Message']
y=data1['Category']

In [10]:
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [11]:
y

0       1
1       1
2       0
3       1
4       1
       ..
5567    0
5568    1
5569    1
5570    1
5571    1
Name: Category, Length: 5572, dtype: object

In [32]:
data1.to_csv('finaldata')#for django

In [12]:
# training Model
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2, random_state=3)

In [13]:
x.shape


(5572,)

In [14]:
x_train.shape


(4457,)

In [15]:
x_test.shape

(1115,)

In [16]:
# converting text data into numeric value


In [17]:
feature_extraction=TfidfVectorizer(min_df=1, stop_words='english' , lowercase=True)
x_train_feature=feature_extraction.fit_transform(x_train)
x_test_feature=feature_extraction.transform(x_test)
y_train=y_train.astype('int')
y_test=y_test.astype('int')

In [18]:
print(x_train_feature)
print(x_test_feature)

  (0, 5413)	0.6198254967574347
  (0, 4456)	0.4168658090846482
  (0, 2224)	0.413103377943378
  (0, 3811)	0.34780165336891333
  (0, 2329)	0.38783870336935383
  (1, 4080)	0.18880584110891163
  (1, 3185)	0.29694482957694585
  (1, 3325)	0.31610586766078863
  (1, 2957)	0.3398297002864083
  (1, 2746)	0.3398297002864083
  (1, 918)	0.22871581159877646
  (1, 1839)	0.2784903590561455
  (1, 2758)	0.3226407885943799
  (1, 2956)	0.33036995955537024
  (1, 1991)	0.33036995955537024
  (1, 3046)	0.2503712792613518
  (1, 3811)	0.17419952275504033
  (2, 407)	0.509272536051008
  (2, 3156)	0.4107239318312698
  (2, 2404)	0.45287711070606745
  (2, 6601)	0.6056811524587518
  (3, 2870)	0.5864269879324768
  (3, 7414)	0.8100020912469564
  (4, 50)	0.23633754072626942
  (4, 5497)	0.15743785051118356
  :	:
  (4454, 4602)	0.2669765732445391
  (4454, 3142)	0.32014451677763156
  (4455, 2247)	0.37052851863170466
  (4455, 2469)	0.35441545511837946
  (4455, 5646)	0.33545678464631296
  (4455, 6810)	0.29731757715898277
  (4

In [19]:
#training Model using logistic regression
model=LogisticRegression()

In [20]:
model.fit(x_train_feature,y_train)

In [21]:
#Evaluate
predicting=model.predict(x_train_feature)
accuracy=accuracy_score(y_train, predicting)


In [22]:
accuracy

0.9676912721561588

In [23]:
predicting=model.predict(x_test_feature)
accuracy2=accuracy_score(y_test, predicting)

In [24]:
accuracy2

0.9668161434977578

In [25]:
#predicting system
input=["WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."]
binarydata=feature_extraction.transform(input)
print(model.predict(binarydata))





[0]


In [26]:
model


In [27]:
import joblib
joblib.dump(model, "Logistic")

['Logistic']

In [28]:
new=joblib.load("Logistic")
new.predict(binarydata)


array([0])

NameError: name 'feature_extraction' is not defined