In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn import model_selection
from sklearn import feature_extraction
from sklearn import linear_model
from sklearn import svm
from sklearn import metrics

In [3]:
import nltk
import string
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [4]:
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
np.random.seed(7)

The TSV file <code>'data/SMSSpamCollection.tsv'</code> contains a collection of SMS messages with associated labels:
- 'spam' - if the message is spam
- 'ham' - if the message is not spam

The task is to apply the presented preprocessing techniques, perform feature extraction, and build a classifier that distinguishes spam from non-spam messages.

1) Load dataset.

In [5]:
# Load the tab-separated file. header=None tells pandas there is no header row,
# so the two columns are named 0 (label) and 1 (message text).
# NOTE(review): read_csv treats '"' as a quote character by default, so messages
# containing unbalanced double quotes could be merged into one row - confirm the
# row count against the raw file (quoting=csv.QUOTE_NONE disables quote handling).
data = pd.read_csv('data/SMSSpamCollection.tsv', sep='\t', header=None)

In [6]:
# Preview the first rows to check that the label and text columns were parsed.
data.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


2) Extract class labels and text corpus separately. 

In [7]:
# Column 1 holds the message text, column 0 the 'spam'/'ham' label.
x = data[1]
# Encode the labels as integers (1 = spam, 0 = ham) with a vectorized
# comparison rather than a per-row .apply call.
y = (data[0] == 'spam').astype(int)

3) Split the dataset into training and test subsets in a 2:1 ratio, with stratification (parameter <code>stratify</code>). Set the <code>random_state</code> parameter to 7.

In [8]:
# Stratified hold-out split: 33% of the messages go to the test subset and the
# class proportions of y are kept in both parts.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x,
    y,
    test_size=0.33,
    stratify=y,
    random_state=7,
)

4) Define a function which will be used for text tokenization and preprocessing (use all of the presented procedures).

In [9]:
def sms_stem_tokenizer(sms_text):
    """Tokenize an SMS message and return the Porter stems of the kept tokens.

    The text is split with ``nltk.tokenize.word_tokenize``. A token is dropped
    when it is found in ``string.punctuation``, when it is an English stop word
    (per ``nltk.corpus.stopwords``), or when ``str.isdigit`` is true for it.
    Every remaining token is stemmed with ``nltk.stem.PorterStemmer``.

    Parameters
    ----------
    sms_text : str
        Raw text of a single message.

    Returns
    -------
    list of str
        Stems of the retained tokens, in their original order.
    """
    # Build the stop-word collection once per message rather than once per
    # token: stopwords.words() returns a fresh list on every call, and a
    # frozenset gives constant-time membership checks.
    english_stopwords = frozenset(stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()

    sms_stems = []
    for token in nltk.tokenize.word_tokenize(sms_text):
        # NOTE: this is a substring test against string.punctuation, so a
        # multi-character token such as '...' is not treated as punctuation.
        # Kept unchanged so the extracted vocabulary stays the same.
        if token in string.punctuation:
            continue
        # The stop-word comparison is case-sensitive (token is used as given).
        if token in english_stopwords:
            continue
        if token.isdigit():
            continue

        sms_stems.append(stemmer.stem(token))

    return sms_stems

In [10]:
def get_wordnet_pos_tag(token):
    """Return the WordNet part-of-speech code for a single token.

    The token is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects the code: 'N' -> 'n', 'V' -> 'v', 'J' -> 'a',
    'R' -> 'r'. Any other first letter falls back to 'n'.
    """
    first_letter = nltk.pos_tag([token])[0][1][0]
    wordnet_codes = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}
    return wordnet_codes.get(first_letter, 'n')

In [11]:
def sms_lemma_tokenizer(sms_text):
    """Tokenize an SMS message and return the WordNet lemmas of the kept tokens.

    The text is split with ``nltk.tokenize.word_tokenize``. A token is dropped
    when it is found in ``string.punctuation``, when it is an English stop word
    (per ``nltk.corpus.stopwords``), or when ``str.isdigit`` is true for it.
    Every remaining token is lemmatized with ``nltk.stem.WordNetLemmatizer``,
    using the part-of-speech code returned by ``get_wordnet_pos_tag``.

    Parameters
    ----------
    sms_text : str
        Raw text of a single message.

    Returns
    -------
    list of str
        Lemmas of the retained tokens, in their original order.
    """
    # Build the stop-word collection once per message rather than once per
    # token: stopwords.words() returns a fresh list on every call, and a
    # frozenset gives constant-time membership checks.
    english_stopwords = frozenset(stopwords.words('english'))
    lemmatizer = nltk.stem.WordNetLemmatizer()

    sms_lemmas = []
    for token in nltk.tokenize.word_tokenize(sms_text):
        # NOTE: this is a substring test against string.punctuation, so a
        # multi-character token such as '...' is not treated as punctuation.
        # Kept unchanged so the extracted vocabulary stays the same.
        if token in string.punctuation:
            continue
        # The stop-word comparison is case-sensitive (token is used as given).
        if token in english_stopwords:
            continue
        if token.isdigit():
            continue

        # The lemmatizer needs a POS hint to reduce e.g. verbs correctly.
        pos_tag = get_wordnet_pos_tag(token)
        sms_lemmas.append(lemmatizer.lemmatize(token, pos_tag))

    return sms_lemmas

5) Perform feature extraction using some of the presented methods. When building the vocabulary, exclude words that appear in fewer than 2 messages.

In [12]:
# Bag-of-words counts over Porter stems; min_df=2 drops terms that occur in
# fewer than two documents.
count_vectorizer_1 = feature_extraction.text.CountVectorizer(
    tokenizer=sms_stem_tokenizer,
    min_df=2,
)

In [13]:
# Learn the stem vocabulary from the training messages.
count_vectorizer_1.fit(x_train)



CountVectorizer(min_df=2,
                tokenizer=<function sms_stem_tokenizer at 0x7f0cf55ac820>)

In [14]:
# Encode both subsets with the fitted stem-based vectorizer.
x_train_vectorized_1 = count_vectorizer_1.transform(x_train)
x_test_vectorized_1 = count_vectorizer_1.transform(x_test)

In [15]:
# Bag-of-words counts over WordNet lemmas; min_df=2 drops terms that occur in
# fewer than two documents.
count_vectorizer_2 = feature_extraction.text.CountVectorizer(
    tokenizer=sms_lemma_tokenizer,
    min_df=2,
)

In [16]:
# Learn the lemma vocabulary from the training messages.
count_vectorizer_2.fit(x_train)

CountVectorizer(min_df=2,
                tokenizer=<function sms_lemma_tokenizer at 0x7f0cf44d5c10>)

In [17]:
# Encode both subsets with the fitted lemma-based vectorizer.
x_train_vectorized_2 = count_vectorizer_2.transform(x_train)
x_test_vectorized_2 = count_vectorizer_2.transform(x_test)

6) Take a look at the size of the built vocabulary. Compare it to the size of the vocabulary obtained without text preprocessing.

In [18]:
# Vocabulary size of the stem-based vectorizer. The fitted `vocabulary_`
# mapping (term -> column index) is used because get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2.
len(count_vectorizer_1.vocabulary_)

2676

In [19]:
# Vocabulary size of the lemma-based vectorizer. The fitted `vocabulary_`
# mapping (term -> column index) is used because get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2.
len(count_vectorizer_2.vocabulary_)

2708

Note that we got a much smaller vocabulary than the one obtained without stemming/lemmatization, which had 3209 words. This can be a good thing, because reducing the number of features also reduces the capacity of the model to memorize the training data, i.e., it reduces the chance of overfitting. On the other hand, stemming as well as lemmatization may cause loss of information, or even change the meaning of sentences.

7) Train several types of classification models, evaluate them and compare with the previous ones.

In [20]:
# Logistic regression with scikit-learn's default hyperparameters, for the stem-based features.
model1 = linear_model.LogisticRegression()

In [21]:
# Train on the stem-based count vectors of the training subset.
model1.fit(x_train_vectorized_1, y_train)

LogisticRegression()

In [22]:
# Logistic regression with scikit-learn's default hyperparameters, for the lemma-based features.
model2 = linear_model.LogisticRegression()

In [23]:
# Train on the lemma-based count vectors of the training subset.
model2.fit(x_train_vectorized_2, y_train)

LogisticRegression()

Accuracy of the model on the training and test set:

In [24]:
# Training-set accuracy of the stem-based model.
model1.score(x_train_vectorized_1, y_train)

0.993302973479775

In [25]:
# Test-set accuracy of the stem-based model.
model1.score(x_test_vectorized_1, y_test)

0.9798803697661773

In [26]:
# Training-set accuracy of the lemma-based model.
model2.score(x_train_vectorized_2, y_train)

0.993570854540584

In [27]:
# Test-set accuracy of the lemma-based model.
model2.score(x_test_vectorized_2, y_test)

0.9798803697661773

Confusion matrix:

In [28]:
# Test-set predictions of the stem-based model.
# NOTE: y_test_predicted is rebound to model2's predictions in a later cell,
# so the cells must be run in order.
y_test_predicted = model1.predict(x_test_vectorized_1)

In [29]:
# Confusion matrix of the stem-based model on the test set (rows: true class,
# columns: predicted class). Predictions are computed here rather than read
# from the shared `y_test_predicted` name, which a later cell rebinds to
# model2's predictions, so this cell is correct in any execution order.
metrics.confusion_matrix(y_test, model1.predict(x_test_vectorized_1))

array([[1590,    2],
       [  35,  212]])

In [30]:
# Test-set predictions of the lemma-based model.
# NOTE: this rebinds y_test_predicted, which previously held model1's predictions.
y_test_predicted = model2.predict(x_test_vectorized_2)

In [31]:
# Confusion matrix of the lemma-based model on the test set (rows: true class,
# columns: predicted class). Predictions are computed here rather than read
# from the shared `y_test_predicted` name, which an earlier cell also binds to
# model1's predictions, so this cell is correct in any execution order.
metrics.confusion_matrix(y_test, model2.predict(x_test_vectorized_2))

array([[1589,    3],
       [  34,  213]])

Results are slightly worse than in the case of the corresponding models trained on representations obtained without preprocessing and stemming/lemmatization.