## Importing the libarires

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
data = pd.read_csv('mail_data.csv')

## Getting some informations about the dataset

In [3]:
data.shape

(5572, 2)

In [4]:
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
data.describe()

Unnamed: 0,Category,Message
count,5572,5572
unique,2,5157
top,ham,"Sorry, I'll call later"
freq,4825,30


## Getting the independent and the dependent variable

In [7]:
X = data[['Message']].values
y = data['Category'].values

In [8]:
print(X.shape)
print(y.shape)

(5572, 1)
(5572,)


## Applying the label encoding to the dependent variable 

In [9]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [10]:
print(y)

[0 0 1 ... 0 0 0]


## Splitting the dataset into the training set and the test set

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4457, 1)
(1115, 1)
(4457,)
(1115,)


## Applying TfidfVectorizer to the independent variable

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=1, stop_words='english')
X_train = vectorizer.fit_transform([doc1.lower() for doc in X_train for doc1 in doc])
X_test = vectorizer.transform([doc1.lower() for doc in X_test for doc1 in doc])

In [14]:
print(X_train.shape)
print(X_test.shape)

(4457, 4021)
(1115, 4021)


## Training the logistic regression model on the training set

In [15]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train,y_train)

## Evaluating the model on the training set

In [16]:
from sklearn.metrics import accuracy_score
training_score = accuracy_score(y_train, classifier.predict(X_train))
print("The training score: ", training_score)

The training score:  0.8671752299753197


## Evaluating the model on the test set

In [17]:
test_score = accuracy_score(y_test, classifier.predict(X_test))
print("The test score: ", test_score)

The test score:  0.8609865470852018
