In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/textdata/Amazon_Unlocked_Mobile.csv
/kaggle/input/textdata/newsgroups
/kaggle/input/textdata/mygrammar.cfg
/kaggle/input/textdata/UNHDR.pdf
/kaggle/input/textdata/moby.txt
/kaggle/input/textdata/paraphrases.csv
/kaggle/input/textdata/dates.txt
/kaggle/input/textdata/UNHDR.txt
/kaggle/input/textdata/spam.csv


In [2]:
spam_data = pd.read_csv('/kaggle/input/textdata/spam.csv')
spam_data['target'] = np.where(spam_data['target']=='spam',1,0)
spam_data.head()

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [3]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(spam_data['text'],
                                                spam_data['target'],
                                                random_state=0)

Percentage of document that are spam

In [4]:
len(spam_data[spam_data['target']==1])/len(spam_data['target'])*100

13.406317300789663

Fiting the training data X_train using a Count Vectorizer

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
vocab = CountVectorizer()    
vocab = vocab.fit(X_train).vocabulary_
    
#we want only the keys i.e. words.
vocab = [words for words in vocab.keys()]
    
#store the length in the seperate list.
len_vocab = [len(words) for words in vocab]
    
#use the index of the longest token.
vocab[np.argmax(len_vocab)]

'com1win150ppmx3age16subscription'

Fitting a multinomial Naive Bayes classifier model with smoothing alpha=0.1

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score

cv = CountVectorizer().fit(X_train)
    
# Transform both X_train and X_test with the same CV object:
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)
    
# Classifier for prediction:
clf = MultinomialNB(alpha=0.1)
clf.fit(X_train_cv, y_train)
preds_test = clf.predict(X_test_cv)
    
roc_auc_score(y_test, preds_test)

0.9720812182741116

Fitting and transforming the training data X_train using a Tfidf Vectorizer with default parameters.

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer().fit(X_train)
feature_names = np.array(tfidf.get_feature_names())
    
X_train_tf = tfidf.transform(X_train)
    
max_tf_idfs = X_train_tf.max(0).toarray()[0] # Get largest tfidf values across all documents.
sorted_tf_idxs = max_tf_idfs.argsort() # Sorted indices
sorted_tf_idfs = max_tf_idfs[sorted_tf_idxs] # Sorted TFIDF values
    
# feature_names doesn't need to be sorted! You just access it with a list of sorted indices!
smallest_tf_idfs = pd.Series(sorted_tf_idfs[:20], index=feature_names[sorted_tf_idxs[:20]])                    
largest_tf_idfs = pd.Series(sorted_tf_idfs[-20:][::-1], index=feature_names[sorted_tf_idxs[-20:][::-1]])
    
(smallest_tf_idfs, largest_tf_idfs)

(sympathetic     0.074475
 healer          0.074475
 aaniye          0.074475
 dependable      0.074475
 companion       0.074475
 listener        0.074475
 athletic        0.074475
 exterminator    0.074475
 psychiatrist    0.074475
 pest            0.074475
 determined      0.074475
 chef            0.074475
 courageous      0.074475
 stylist         0.074475
 psychologist    0.074475
 organizer       0.074475
 pudunga         0.074475
 venaam          0.074475
 diwali          0.091250
 mornings        0.091250
 dtype: float64,
 146tf150p    1.000000
 havent       1.000000
 home         1.000000
 okie         1.000000
 thanx        1.000000
 er           1.000000
 anything     1.000000
 lei          1.000000
 nite         1.000000
 yup          1.000000
 thank        1.000000
 ok           1.000000
 where        1.000000
 beerage      1.000000
 anytime      1.000000
 too          1.000000
 done         1.000000
 645          1.000000
 tick         0.980166
 blank        0.932702
 dt

Fitting and transforming the training data X_train using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than 3.

Then fitting a multinomial Naive Bayes classifier model with smoothing alpha=0.1 and compute the area under the curve (AUC) score using the transformed test data.

In [8]:
tf = TfidfVectorizer(min_df=3).fit(X_train)
X_train_tf = tf.transform(X_train)
X_test_tf = tf.transform(X_test)
clf = MultinomialNB(alpha=0.1)
clf.fit(X_train_tf, y_train)
pred = clf.predict(X_test_tf)
roc_auc_score(y_test, pred)

0.9416243654822335

The average length of documents (number of characters) for not spam and spam documents

In [9]:
len_spam = [len(x) for x in spam_data.loc[spam_data['target']==1, 'text']]
len_not_spam = [len(x) for x in spam_data.loc[spam_data['target']==0, 'text']]
(np.mean(len_not_spam), np.mean(len_spam))

(71.02362694300518, 138.8661311914324)

Combine new features into the training data:

In [10]:
from scipy.sparse import csr_matrix, hstack
def add_feature(X,feature_to_add):
    return hstack([X, csr_matrix(feature_to_add).T], 'csr')

Fitting and transforming the training data X_train using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than 5.

Using this document-term matrix and an additional feature, the length of document (number of characters), fitting a Support Vector Classification model with regularization C=10000. Then computing the area under the curve (AUC) score using the transformed test data.

In [11]:
from sklearn.svm import SVC

len_train = [len(x) for x in X_train]
len_test = [len(x) for x in X_test]
    
tf = TfidfVectorizer(min_df=5).fit(X_train)
X_train_tf = tf.transform(X_train)
X_test_tf = tf.transform(X_test)
    
X_train_tf = add_feature(X_train_tf, len_train)
X_test_tf = add_feature(X_test_tf, len_test)
    
clf = SVC(C=10000)
clf.fit(X_train_tf, y_train)
pred = clf.predict(X_test_tf)
    
roc_auc_score(y_test, pred)

0.9661689557407943

The average number of digits per document for not spam and spam documents.

In [12]:
dig_spam = [sum(char.isnumeric() for char in x) for x in spam_data.loc[spam_data['target']==1,'text']]
dig_not_spam = [sum(char.isnumeric() for char in x) for x in spam_data.loc[spam_data['target']==0,'text']]
(np.mean(dig_not_spam), np.mean(dig_spam))

(0.2992746113989637, 15.76037483266399)

Fitting and transformig the training data X_train using a Tfidf Vectorizer ignoring terms that have a document frequency strictly lower than 5 and using word n-grams from n=1 to n=3 (unigrams, bigrams, and trigrams).

Using this document-term matrix and the following additional features:

the length of document (number of characters)
number of digits per document
fit a Logistic Regression model with regularization C=100. Then compute the area under the curve (AUC) score using the transformed test data.

In [13]:
from sklearn.linear_model import LogisticRegression

dig_train = [sum(char.isnumeric() for char in x) for x in X_train]
dig_test = [sum(char.isnumeric() for char in x) for x in X_test]
    
tf = TfidfVectorizer(min_df = 5, ngram_range = (1,3)).fit(X_train)
X_train_tf = tf.transform(X_train)
X_test_tf = tf.transform(X_test)
    
X_train_tf = add_feature(X_train_tf, dig_train)
X_test_tf = add_feature(X_test_tf, dig_test)
    
clf = LogisticRegression(C=100).fit(X_train_tf, y_train)
pred = clf.predict(X_test_tf)
    
roc_auc_score(y_test, pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9678709064054463

The average number of non-word characters (anything other than a letter, digit or underscore) per document for not spam and spam documents

In [14]:
(np.mean(spam_data.loc[spam_data['target']==0,'text'].str.count('\W')), 
np.mean(spam_data.loc[spam_data['target']==1,'text'].str.count('\W')))

(17.29181347150259, 29.041499330655956)

Fit and transform the training data X_train using a Count Vectorizer ignoring terms that have a document frequency strictly lower than 5 and using character n-grams from n=2 to n=5.

To tell Count Vectorizer to use character n-grams pass in analyzer='char_wb' which creates character n-grams only from text inside word boundaries. This should make the model more robust to spelling mistakes.

Using this document-term matrix and the following additional features:

the length of document (number of characters)
number of digits per document
number of non-word characters (anything other than a letter, digit or underscore.)
fit a Logistic Regression model with regularization C=100. Then compute the area under the curve (AUC) score using the transformed test data.

Also find the 10 smallest and 10 largest coefficients from the model and return them along with the AUC score in a tuple.

The list of 10 smallest coefficients should be sorted smallest first, the list of 10 largest coefficients should be sorted largest first.

The three features that were added to the document term matrix should have the following names should they appear in the list of coefficients: ['length_of_doc', 'digit_count', 'non_word_char_count']

In [15]:
len_train = [len(x) for x in X_train]
len_test = [len(x) for x in X_test]
dig_train = [sum(char.isnumeric() for char in x) for x in X_train]
dig_test = [sum(char.isnumeric() for char in x) for x in X_test]
    
# Not alpha numeric:
nan_train = X_train.str.count('\W')
nan_test = X_test.str.count('\W')
    
cv = CountVectorizer(min_df = 5, ngram_range=(2,5), analyzer='char_wb').fit(X_train)
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)
    
X_train_cv = add_feature(X_train_cv, [len_train, dig_train, nan_train])
X_test_cv = add_feature(X_test_cv, [len_test, dig_test, nan_test])
    
clf = LogisticRegression(C=100).fit(X_train_cv, y_train)
pred = clf.predict(X_test_cv)
    
score = roc_auc_score(y_test, pred)
    
feature_names = np.array(cv.get_feature_names() + ['length_of_doc', 'digit_count', 'non_word_char_count'])
sorted_coef_index = clf.coef_[0].argsort()
small_coeffs = list(feature_names[sorted_coef_index[:10]])
large_coeffs = list(feature_names[sorted_coef_index[:-11:-1]])
    
(score, small_coeffs, large_coeffs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


(0.9809793219360643,
 ['..', '. ', ' i', ' y', ' go', '? ', 'ok', 'pe', 'go', ' h'],
 ['digit_count', 'ne', 'ww', 'co', 'xt', 'ia', 'uk', 'ar', ' ch', 'mob'])