In [3]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [6]:
# Load the mail dataset into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks like a Colab upload location — confirm before running elsewhere.
raw_mail_data = pd.read_csv('/content/mail_data.csv')

In [7]:
raw_mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Replace every missing entry with an empty string (the raw frame is left untouched)
mail_data = raw_mail_data.mask(raw_mail_data.isna(), '')

In [9]:
mail_data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
mail_data.shape

(5572, 2)

In [11]:
# Label encoding: spam mail -> 0, ham mail -> 1
spam_rows = mail_data['Category'] == 'spam'
mail_data.loc[spam_rows, 'Category'] = 0
ham_rows = mail_data['Category'] == 'ham'
mail_data.loc[ham_rows, 'Category'] = 1

In [12]:
# Model input is the message text; the target is the (label-encoded) Category column.
x = mail_data['Message']
y = mail_data['Category']

In [13]:
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [14]:
# Hold out 20% of the rows as a test set; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=2)

In [16]:
print(x_test.shape, x.shape, x_train.shape)

(1115,) (5572,) (4457,)


In [19]:
# Feature extraction
# Convert the message text into TF-IDF feature vectors that can be fed to the LogisticRegression model.
# The vectorizer is fitted on the training split only and then reused, unchanged, on the test split.
feature_extraction = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=1,
)
x_train_feature = feature_extraction.fit_transform(x_train)
x_test_feature = feature_extraction.transform(x_test)

# Cast both label series to integers
y_train, y_test = y_train.astype('int'), y_test.astype('int')

In [21]:
print(x_train_feature)

  (0, 4334)	0.42941702167641554
  (0, 3958)	0.6161071828926097
  (0, 6586)	0.44333254982109394
  (0, 6927)	0.48935591439341625
  (1, 2121)	0.3573617143022146
  (1, 1428)	0.5869421390016223
  (1, 6971)	0.42812434651556874
  (1, 3168)	0.5869421390016223
  (2, 5115)	0.3408491178137899
  (2, 7353)	0.31988118061968496
  (2, 3852)	0.3408491178137899
  (2, 4884)	0.35749230587184955
  (2, 5695)	0.35749230587184955
  (2, 806)	0.26730249393705324
  (2, 5894)	0.35749230587184955
  (2, 1876)	0.28751725124107325
  (2, 6878)	0.35749230587184955
  (3, 197)	0.36522237107066735
  (3, 3723)	0.16297045459835785
  (3, 2435)	0.26698378141852
  (3, 1825)	0.26858331513730566
  (3, 5231)	0.2266831802864503
  (3, 300)	0.2915969875465198
  (3, 7248)	0.23571908490908416
  (3, 5005)	0.3169028431039865
  :	:
  (4454, 2244)	0.2526916142542512
  (4454, 666)	0.28653660324238944
  (4454, 1575)	0.20946314330145205
  (4454, 1094)	0.24862733340971144
  (4454, 5068)	0.22284357632450164
  (4454, 311)	0.19547195974237946
  

In [22]:
# Logistic regression classifier created with no arguments, i.e. the library defaults.
model = LogisticRegression()

In [23]:
# Train the classifier on the TF-IDF features and integer labels of the training split.
model.fit(x_train_feature, y_train)

In [24]:
# Predict labels for the held-out test messages.
prediction = model.predict(x_test_feature)

In [25]:
# Fraction of test messages classified correctly.
# Arguments follow the documented order accuracy_score(y_true, y_pred).
model_accuracy = accuracy_score(y_test, prediction)

In [26]:
model_accuracy

0.9524663677130045

In [27]:
# prompt: generate the code by which now user just input the data where input_data = (), and further model will predict , where user input the message or x part of the data and get output as y

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

# Load the raw dataset and replace missing values with empty strings.
raw_mail_data = pd.read_csv('/content/mail_data.csv')
mail_data = raw_mail_data.where(pd.notnull(raw_mail_data), '')

# Label encoding: spam mail = 0, ham mail = 1
mail_data.loc[mail_data['Category'] == 'spam', 'Category'] = 0
mail_data.loc[mail_data['Category'] == 'ham', 'Category'] = 1

# Message text is the model input; the encoded category is the target.
x = mail_data['Message']
y = mail_data['Category']

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=2)
print(x_test.shape, x.shape, x_train.shape)

# Feature extraction: transform the text into TF-IDF feature vectors that can be
# used as input to the LogisticRegression model. The vectorizer is fitted on the
# training split only and reused on the test split.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
x_train_feature = feature_extraction.fit_transform(x_train)
x_test_feature = feature_extraction.transform(x_test)

# Convert the y_train and y_test labels to integers
y_train = y_train.astype('int')
y_test = y_test.astype('int')
print(x_train_feature)

# Train the classifier on the vectorized training messages.
model = LogisticRegression()
model.fit(x_train_feature, y_train)
def model_prediction(input_data, vectorizer=None, classifier=None):
  """Predict the spam/ham label for one or more messages.

  Args:
    input_data: A single message string, or an iterable of message strings.
    vectorizer: Fitted text vectorizer exposing ``transform``. Defaults to the
      notebook-level ``feature_extraction`` that was fitted on the training split.
    classifier: Fitted classifier exposing ``predict``. Defaults to the
      notebook-level ``model``.

  Returns:
    Whatever ``classifier.predict`` returns for the transformed messages. With
    this notebook's label encoding, 0 means spam and 1 means ham.
  """
  # Reuse the vectorizer that was fitted on the training data. A freshly
  # constructed TfidfVectorizer has no vocabulary, so calling transform() on it
  # raises NotFittedError instead of producing features the model understands.
  if vectorizer is None:
    vectorizer = feature_extraction
  if classifier is None:
    classifier = model
  # A bare string is one message; wrap it so it is handled as a single document.
  if isinstance(input_data, str):
    input_data = [input_data]
  input_data_feature = vectorizer.transform(input_data)
  return classifier.predict(input_data_feature)
