In [2]:
import pandas as pd

data = pd.read_csv('spam.csv')
data.head(15)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [3]:
data.shape

(5572, 2)

In [4]:
data.label.unique()

array(['ham', 'spam'], dtype=object)

In [5]:
from sklearn.preprocessing import LabelEncoder

label_encoding = LabelEncoder()
data['label'] = label_encoding.fit_transform(data['label'].astype(str))

In [6]:
data.label.value_counts()

0    4825
1     747
Name: label, dtype: int64

### Feature Engineering and Feature Extraction

In [7]:
X = data['text']
Y = data['label']

In [8]:
X.shape, Y.shape

((5572,), (5572,))

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

tfidf_vectorizer = TfidfVectorizer(max_features=500)

x_train_trans = tfidf_vectorizer.fit_transform(x_train)
x_test_trans = tfidf_vectorizer.fit_transform(x_test)

x_train_trans.shape 

(4457, 500)

In [10]:
print(x_train_trans[0:3])

  (0, 295)	0.4349538211595591
  (0, 405)	0.3234370804711356
  (0, 303)	0.2862760416619019
  (0, 381)	0.384118325726554
  (0, 494)	0.18882367737905875
  (0, 37)	0.2963703082323165
  (0, 210)	0.24649783878705073
  (0, 214)	0.3585278948841798
  (0, 197)	0.4048574953844759
  (1, 485)	0.3021100498712657
  (1, 206)	0.18483090560657267
  (1, 48)	0.22872806505716636
  (1, 426)	0.22359882786306426
  (1, 413)	0.10890318323299449
  (1, 187)	0.1662190249648485
  (1, 72)	0.1785683267554718
  (1, 62)	0.28787768163835264
  (1, 151)	0.15741502018257367
  (1, 416)	0.2549796023863451
  (1, 356)	0.22550920261231552
  (1, 79)	0.35071318139309793
  (1, 202)	0.24422363107380843
  (1, 140)	0.305917509500928
  (1, 495)	0.1601445358211685
  (1, 212)	0.1477113680013789
  (1, 205)	0.1974355556041455
  (1, 494)	0.11163339641977518
  (1, 214)	0.2119632832740917
  (1, 197)	0.2393535487874407
  (2, 385)	0.24511821314069013
  (2, 29)	0.21731924698742486
  (2, 94)	0.17695363241392142
  (2, 126)	0.5445621948826709
  (2

### First Model

In [11]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=15)
clf.fit(x_train_trans, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [12]:
y_pred = clf.predict(x_test_trans)
y_pred[0:4]

array([0, 0, 0, 0])

#### Evaluating First Model

In [13]:
from sklearn.metrics import accuracy_score
print("Accuracy Score : ", accuracy_score(y_test, y_pred))

Accuracy Score :  0.8511210762331839


In [14]:
df_y = pd.DataFrame({'y_test': y_test, 'y_prd': y_pred})
df_y.sample(10)

Unnamed: 0,y_test,y_prd
283,0,0
3502,0,0
3418,1,0
5203,0,0
1767,0,0
4264,0,0
2727,0,0
1373,1,0
2317,0,0
1680,0,0


## Second Model (Ensemble)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

In [16]:
estimator = []

estimator.append(('LR', LogisticRegression(C=1, solver='liblinear',max_iter = 200)))
estimator.append(('SVC', SVC(kernel='linear',gamma ='auto', probability = True)))
estimator.append(('DTC', DecisionTreeClassifier(max_depth=15)))

In [17]:
#Voting Classifier with hard voting
vot_hard = VotingClassifier(estimators = estimator, voting ='hard')

In [18]:
vot_hard_model = vot_hard.fit(x_train_trans, y_train)

In [19]:
y_pred = vot_hard_model.predict(x_test_trans)
y_pred.shape

(1115,)

In [20]:
from sklearn.metrics import accuracy_score
print("Accuracy Score : ", accuracy_score(y_test, y_pred))

Accuracy Score :  0.852914798206278


In [21]:
# Voting Classifier with soft voting
vot_soft = VotingClassifier(estimators = estimator, voting ='soft')
vot_soft_model = vot_soft.fit(x_train_trans, y_train)
y_pred = vot_soft_model.predict(x_test_trans)

In [22]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score : ", accuracy)

Accuracy Score :  0.8493273542600897


## Serializing 

In [23]:
import joblib
import sklearn

In [24]:
filename = 'models/voting_clf_soft_model.joblib'
scikit_learn_version = sklearn.__version__

model_params = {}

model_params['preprocessing'] = tfidf_vectorizer
model_params['model'] = vot_soft_model
model_params['sklearn_version'] = scikit_learn_version
model_params['accuracy'] = accuracy

joblib.dump(model_params, filename)

['models/voting_clf_soft_model.joblib']

### Load Joblib model and test it 

In [25]:
clf_checkpoint = joblib.load(filename)

In [37]:
reloaded_vect = clf_checkpoint['preprocessing']
reloaded_vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=500,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [27]:
clf_model = clf_checkpoint['model']
clf_model

VotingClassifier(estimators=[('LR',
                              LogisticRegression(C=1, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=200,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='liblinear', tol=0.0001,
                                                 verbose=0, warm_start=False)),
                             ('SVC',
                              SVC(C=1.0, cache_size=200, class_weight=None,
                                  coef0=0.0, decision_functio...
                                  verbose=False)),
                             ('DTC',
                      

In [28]:
y_pred = clf_model.predict(x_test_trans)
y_pred


array([0, 0, 0, ..., 0, 0, 0])

In [29]:
accuracy_score(y_test, y_pred)

0.8493273542600897

In [30]:
model_params['accuracy']

0.8493273542600897