## CONSTANT

In [1]:
# Relative path to the mail dataset CSV that the loading cell reads.
DATA_PATH = "./Data/mail_data.csv"

# Import Important Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load Data 

In [3]:
# Read the mail dataset from DATA_PATH into a DataFrame.
spamData = pd.read_csv(DATA_PATH)

# Preview the first five rows as the cell's output.
spamData.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Summarise the frame (per-column non-null counts and dtypes) to check
# whether any column contains missing values.
spamData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5572 non-null   object
 1   Message   5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [7]:
# Separate the independent variable (the message text) from the dependent
# variable (the category label).
X, y = spamData['Message'], spamData['Category']


#### Apply Label Encoding to the Target Variable

In [8]:
# Encode the text labels of the Category column as integers.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Fit on the original column rather than on `y` itself. Re-running this cell
# then always encodes the raw label text; fitting on an already-encoded `y`
# would replace le.classes_ with integers and lose the mapping back to the
# original category names.
y = le.fit_transform(spamData['Category'])

array([0, 0, 1, ..., 0, 0, 0])

##### Split Data into Training Set and Testing Set

In [9]:
# Hold out 20% of the samples as the test set; the fixed random_state keeps
# the split reproducible between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Feature Extraction

In [11]:
# Convert the raw message text into numerical TF-IDF feature vectors.
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer configured with the built-in English stop-word list.
fe = TfidfVectorizer(stop_words='english')
# Fit the vectorizer on the training messages only and transform them...
xnew = fe.fit_transform(X_train)
# ...then transform the test messages with that same fitted vectorizer.
xtest = fe.transform(X_test)
# NOTE(review): this prints the stored (row, column) weights of the sparse
# matrix, which is a long listing; xnew.shape would be a more compact check.
print(xnew)

  (0, 5818)	0.22682143517864364
  (0, 2497)	0.2442158912653505
  (0, 694)	0.3171299579602537
  (0, 6264)	0.1898892037332199
  (0, 5800)	0.17558937755823417
  (0, 3262)	0.33791755486732394
  (0, 2049)	0.3034375179183143
  (0, 7300)	0.24288153842988894
  (0, 2724)	0.3544175987866074
  (0, 354)	0.3544175987866074
  (0, 7162)	0.2550284465664535
  (0, 258)	0.2379428657041507
  (0, 7222)	0.2173884735352799
  (0, 5512)	0.1898892037332199
  (1, 2555)	0.3840709491751004
  (1, 3804)	0.1902902346515268
  (1, 3932)	0.24325511357721427
  (1, 4509)	0.4028245991060671
  (1, 2440)	0.33870544648398715
  (1, 3333)	0.20665394084233096
  (1, 5650)	0.360444144470318
  (1, 2335)	0.2162321275166079
  (1, 6738)	0.28986069568918
  (1, 6109)	0.3239762634465801
  (1, 3267)	0.2678713077029217
  :	:
  (4452, 2438)	0.4574160733416501
  (4452, 7280)	0.3968991650168732
  (4452, 3978)	0.4574160733416501
  (4452, 3290)	0.26370969643076225
  (4452, 3084)	0.22948428918295163
  (4452, 2236)	0.2676662072392096
  (4453, 387

# Start Build Models

## Logistic Regression Model 

In [14]:
# Logistic regression with default settings, trained on the TF-IDF features.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr = LogisticRegression()
lr.fit(xnew, y_train)

# Score on the data the model was fitted on, then on the held-out split.
lr_train_accuracy = lr.score(xnew, y_train)
lr_test_accuracy = lr.score(xtest, y_test)
print("Accuracy Of Training Data = {}".format(lr_train_accuracy))
print("Accuracy Of Testset Data = {}".format(lr_test_accuracy))

Accuracy Of Training Data = 0.9661207089970832
Accuracy Of Testset Data = 0.967713004484305


## KNN Model

In [15]:
# k-nearest neighbours: 5 neighbours, Minkowski metric with p=2.
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)
knn.fit(xnew, y_train)

# Score on the data the model was fitted on, then on the held-out split.
knn_train_accuracy = knn.score(xnew, y_train)
knn_test_accuracy = knn.score(xtest, y_test)
print("Accuracy Of Training Data = {}".format(knn_train_accuracy))
print("Accuracy Of Testset Data = {}".format(knn_test_accuracy))

Accuracy Of Training Data = 0.9165357864034104
Accuracy Of Testset Data = 0.9130044843049328


## SVM Model

In [25]:
# Support vector classifier with an RBF kernel and a fixed random_state.
from sklearn.svm import SVC

svm = SVC(
    kernel='rbf',
    random_state=0,
)
svm.fit(xnew, y_train)

# Score on the data the model was fitted on, then on the held-out split.
svm_train_accuracy = svm.score(xnew, y_train)
svm_test_accuracy = svm.score(xtest, y_test)
print("Accuracy Of Training Data = {}".format(svm_train_accuracy))
print("Accuracy Of Testset Data = {}".format(svm_test_accuracy))

Accuracy Of Training Data = 0.9986538030065066
Accuracy Of Testset Data = 0.9847533632286996


## Random Forest Model

In [27]:
# Random forest: 100 trees, entropy split criterion, fixed random_state.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(
    n_estimators=100,
    criterion='entropy',
    random_state=0,
)
rf.fit(xnew, y_train)

# Score on the data the model was fitted on, then on the held-out split.
rf_train_accuracy = rf.score(xnew, y_train)
rf_test_accuracy = rf.score(xtest, y_test)
print("Accuracy Of Training Data = {}".format(rf_train_accuracy))
print("Accuracy Of Testset Data = {}".format(rf_test_accuracy))

Accuracy Of Training Data = 1.0
Accuracy Of Testset Data = 0.9802690582959641


### Notice that SVM Is the Best Model for This Problem

#### Try to find the best parameters for this model.

In [28]:
# Cross-validated grid search over SVC hyper-parameters on the training set.
from sklearn.model_selection import GridSearchCV

# Two sub-grids: 'rbf'/'linear' vary only C (10 candidates), while
# 'sigmoid'/'poly' vary both C and gamma (110 candidates).
paramterGrid = [
    {'C': [1, 10, 25, 50, 100], 'kernel': ['rbf', 'linear']},
    {
        'C': [1, 10, 25, 50, 100],
        'kernel': ['sigmoid', 'poly'],
        'gamma': [0.001, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
    },
]

# 120 candidates x 10 folds = 1200 fits. n_jobs=-1 runs them on all available
# cores; the scores and the selected parameters are unaffected by this.
grid = GridSearchCV(
    estimator=svm,
    param_grid=paramterGrid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)

grid.fit(xnew, y_train)

# Report the winning configuration and its mean cross-validated accuracy.
print("Best Estimator is  ", grid.best_estimator_)
print("Best Hyper Paramters is  ", grid.best_params_)
print("Best Score is  ", grid.best_score_)


Best Estimator is   SVC(C=10, gamma=0.4, kernel='sigmoid', random_state=0)
Best Hyper Paramters is   {'C': 10, 'gamma': 0.4, 'kernel': 'sigmoid'}
Best Score is   0.9811553383382879
