# Model Improvement

## Feature selection

Importing the dataset

In [16]:
import pandas as pd
from nltk.corpus import stopwords
import gensim
import numpy as np


# Read the labelled SMS corpus into a DataFrame.
dataset = pd.read_csv("sms_spam.csv")

# Preview the first rows, then report the (rows, columns) shape.
preview = dataset.head()
print(preview)
print("Shape:", dataset.shape, "\n")

   type                                               text
0   ham  Hope you are having a good week. Just checking in
1   ham                            K..give back my thanks.
2   ham        Am also doing in cbe only. But have to pay.
3  spam  complimentary 4 STAR Ibiza Holiday or £10,000 ...
4  spam  okmail: Dear Dave this is your final notice to...
Shape: (5559, 2) 



Preprocessing function

In [17]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    return frozenset(stopwords.words("english"))


def transformText(text):
    """Normalise one raw message into a string of stemmed tokens.

    Steps, in order: lowercase, drop English stop words, strip punctuation,
    strip digits, drop tokens shorter than 3 characters, collapse repeated
    whitespace, stem.

    Args:
        text: The raw message. Anything that is not a ``str`` (for example the
            NaN that pandas uses for an empty CSV field) is treated as empty.

    Returns:
        The preprocessed text as a single string; ``""`` for non-string input.
    """
    # Missing values would otherwise fail on .lower() with an AttributeError.
    if not isinstance(text, str):
        return ""

    preprocessing = gensim.parsing.preprocessing
    # Cached: the stop-word list is identical for every row, so build it once
    # instead of once per call.
    stops = _english_stopwords()

    # Lowercase, then drop stop words. str.split() already splits on any run of
    # whitespace, so no whitespace normalisation is needed beforehand.
    # NOTE: stop words are matched on whitespace-delimited tokens *before*
    # punctuation is removed, so a token with attached punctuation
    # (e.g. "you.") is not filtered out here.
    kept_words = [word for word in text.lower().split() if word not in stops]
    text = " ".join(kept_words)

    # Remove the punctuation
    text = preprocessing.strip_punctuation(text)
    # Strip all the numerics
    text = preprocessing.strip_numeric(text)
    # Removing all the words with < 3 characters
    text = preprocessing.strip_short(text, minsize=3)
    # Collapse the whitespace runs left behind by the removals above
    text = preprocessing.strip_multiple_whitespaces(text)
    # Stemming
    return preprocessing.stem_text(text)

Preprocessing

In [18]:
# Run transformText over every message and store the result back in the
# 'text' column (the raw text is replaced).
cleaned_text = dataset["text"].map(transformText)
dataset["text"] = cleaned_text
print(cleaned_text.head())

0                                 hope good week check
1                                      give back thank
2                                    also cbe onli pai
3    complimentari star ibiza holidai cash need urg...
4    okmail dear dave final notic collect tenerif h...
Name: text, dtype: object


Creating training and test set

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the messages for evaluation; the fixed random_state keeps
# the split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    dataset["text"],
    dataset["type"],
    test_size=0.33,
    random_state=10,
)

print("Training Sample Size:", len(X_train), " ", "Test Sample Size:", len(X_test))

Training Sample Size: 3724   Test Sample Size: 1835


Creating a tf-idf model

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Term counts: the vocabulary is learned from the training messages only.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# TF-IDF weighting: fit the transformer on the training counts, then apply it
# to those same counts.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
print("Dimension of TF-IDF vector :", X_train_tfidf.shape)

Dimension of TF-IDF vector : (3724, 5056)


Performing feature selection

In [24]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 2000 features that score highest on the chi-squared test
# against the training labels.
selector = SelectKBest(score_func=chi2, k=2000)
X_new = selector.fit_transform(X_train_tfidf, y_train)
print("Dimension of TF-IDF vector :", X_new.shape)


Dimension of TF-IDF vector : (3724, 2000)


Select the top k% features

In [19]:
from sklearn.feature_selection import SelectPercentile, chi2

# Keep the top 40% of features ranked by chi-squared score against the
# training labels. This rebinds `selector` and `X_new`.
selector = SelectPercentile(score_func=chi2, percentile=40)
X_new = selector.fit_transform(X_train_tfidf, y_train)
print("Dimension of TF-IDF vector :", X_new.shape)

Dimension of TF-IDF vector : (3724, 2022)


In [20]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the selected features.
clf = MultinomialNB().fit(X_new, y_train)

# Push the test messages through the transformers fitted on the training set:
# term counts -> TF-IDF weights -> selected feature columns only.
X_new_counts = count_vect.transform(X_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
X_new_sel = selector.transform(X_new_tfidf)

# Predict a label for every test message.
predicted = clf.predict(X_new_sel)

print(predicted)
# Accuracy: fraction of test messages whose predicted label equals the true one.
print(np.mean(predicted == y_test))

['ham' 'ham' 'ham' ... 'spam' 'ham' 'spam']
0.9722070844686649


Printing metrics

In [21]:
from sklearn import metrics

# Per-class precision, recall and F1 for the predictions on the test set.
report = metrics.classification_report(y_test, predicted)
print(report)

              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1583
        spam       0.99      0.81      0.89       252

    accuracy                           0.97      1835
   macro avg       0.98      0.90      0.94      1835
weighted avg       0.97      0.97      0.97      1835



## Training set rebalancing

### Undersampling

In [13]:
from collections import Counter

from imblearn.under_sampling import RandomUnderSampler

# Take the original training labels from the dataset rather than from y_train:
# this cell rebinds y_train to the resampled labels, so once any resampling
# cell has run, y_train no longer matches X_train_tfidf row for row.
# X_train still carries the row index chosen by the split, so it selects the
# labels that belong to the rows of X_train_tfidf.
y_train_original = dataset.loc[X_train.index, 'type']

# instantiates the undersampler. "majority" means that
# the majority class will be undersampled to match the minority one.
# random_state makes the random choice of rows reproducible.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=10)
# undersamples the training set
X_new_train, y_train = undersample.fit_resample(X_train_tfidf, y_train_original)
# prints the dataset composition
counter = Counter(y_train)
print(counter)

Counter({'ham': 495, 'spam': 495})


### Random oversampling

In [21]:
from collections import Counter

from imblearn.over_sampling import RandomOverSampler

# Take the original training labels from the dataset rather than from y_train:
# this cell rebinds y_train to the resampled labels, so once any resampling
# cell has run, y_train no longer matches X_train_tfidf row for row.
# X_train still carries the row index chosen by the split, so it selects the
# labels that belong to the rows of X_train_tfidf.
y_train_original = dataset.loc[X_train.index, 'type']

# instantiate the random oversampler class. "minority" means that the
# minority class will be oversampled to match the majority class.
# random_state makes the random choice of rows reproducible.
oversample = RandomOverSampler(sampling_strategy='minority', random_state=10)

# Rebalances the training set by resampling
X_new_train, y_train = oversample.fit_resample(X_train_tfidf, y_train_original)
# prints the dataset composition
counter = Counter(y_train)
print(counter)


Counter({'spam': 3229, 'ham': 3229})


### SMOTE oversampling

In [25]:
from collections import Counter

from imblearn.over_sampling import SMOTE

# Take the original training labels from the dataset rather than from y_train:
# this cell rebinds y_train to the resampled labels, so once any resampling
# cell has run, y_train no longer matches X_train_tfidf row for row.
# X_train still carries the row index chosen by the split, so it selects the
# labels that belong to the rows of X_train_tfidf.
y_train_original = dataset.loc[X_train.index, 'type']

# instantiate the SMOTE oversampler; random_state makes the generated
# synthetic samples reproducible.
oversample = SMOTE(random_state=10)

# Rebalances the training set by creating artificial instances
# of the minority class.
X_new_train, y_train = oversample.fit_resample(X_train_tfidf, y_train_original)
# prints the dataset composition
counter = Counter(y_train)
print(counter)

Counter({'spam': 3229, 'ham': 3229})


In [26]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the rebalanced training set.
clf = MultinomialNB().fit(X_new_train, y_train)

# Vectorise the test messages with the transformers fitted on the training
# set (term counts first, then TF-IDF weights).
X_new_counts = count_vect.transform(X_test)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# Predict a label for every test message.
predicted = clf.predict(X_new_tfidf)

print(predicted)
# Accuracy, followed by the per-class precision / recall / F1 report.
print(np.mean(predicted == y_test))
print(metrics.classification_report(y_test, predicted))

['ham' 'ham' 'ham' ... 'spam' 'ham' 'spam']
0.9694822888283379
              precision    recall  f1-score   support

         ham       0.99      0.97      0.98      1583
        spam       0.84      0.97      0.90       252

    accuracy                           0.97      1835
   macro avg       0.92      0.97      0.94      1835
weighted avg       0.97      0.97      0.97      1835

