In [1]:
pip install tensorflow



In [2]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# English stop-word list: common function words and contractions, all lower-case.
# NOTE(review): no cell visible in this part of the notebook references `stopwords`;
# confirm whether it is meant to be passed to the vectoriser or can be removed.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]


In [4]:
# Load the labelled SMS messages: column `v1` holds the class label (ham/spam)
# and column `v2` holds the raw message text.
datasets = pd.read_csv('spam1.csv')
print("\nData :\n", datasets)

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called as a statement after the heading. Passing it to print() would emit the
# summary *before* the heading and then print a stray "None".
print("\nData statistics")
datasets.info()


Data :
        v1                                                 v2
0    spam  Free entry in 2 a wkly comp to win FA Cup fina...
1    spam  FreeMsg Hey there darling it's been 3 week's n...
2    spam  WINNER!! As a valued network customer you have...
3    spam  Had your mobile 11 months or more? U R entitle...
4    spam  SIX chances to win CASH! From 100 to 20,000 po...
..    ...                                                ...
508  spam  This is the 2nd time we have tried 2 contact u...
509   ham              Will �_ b going to esplanade fr home?
510   ham  Pity, * was in mood for that. So...any other s...
511   ham  The guy did some bitching but I acted like i'd...
512   ham                         Rofl. Its true to its name

[513 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 513 entries, 0 to 512
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   v1      513 non-null    object
 1   v2      513 non-nu

## Analysis

To analyze the text data, we have to turn the words into numbers.
There are several ways to accomplish this step:

1) Binary Term Frequency: records the presence (1) or absence (0) of each term in a document.

2) Bag of Words Frequency: captures how many times each term occurs in a document.

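3) Term Frequency: the count of a term in a document, normalized by the length of that document.

4) TF-IDF: the term frequency weighted by the inverse document frequency.

In this way, if a term appears frequently in a document, it’s important; if a term appears in many documents, it’s not a unique identifier.

5) Word2Vec: represents each word as a dense vector learned from the contexts in which the word appears.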

In [5]:
# Summarise the message column separately for each class label
# (count, number of unique texts, most frequent text and its frequency).
by_label = datasets.groupby('v1')
by_label.describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,341,335,"Sorry, I'll call later",4
spam,172,168,Congratulations ur awarded 500 of CD vouchers ...,2


# Next we use CountVectorizer:

More details and examples at:

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [8]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Convert the string labels into integer codes.
# The whole column is replaced instead of being written through
# `.iloc[:, 0] = ...`: on recent pandas versions a positional in-place write
# keeps the column's original object dtype, and scikit-learn classifiers reject
# object-dtype targets with an "Unknown label type" error.
label_column = datasets.columns[0]
datasets[label_column] = le.fit_transform(datasets[label_column])

# Hold out 30% of the messages for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(datasets.iloc[:,1],
                        datasets.iloc[:,0], test_size = 0.30, random_state = 133)

# Learn the vocabulary from the training messages only and build their
# document-term count matrix.
v = CountVectorizer()
X_train_c = v.fit_transform(X_train.values)
# From here on X_train is the dense count matrix (no longer the raw text);
# the model-fitting cells below rely on this name.
X_train = X_train_c.toarray()

In [9]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

**Naive Bayes**

In [10]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the dense training count matrix.
gnb = GaussianNB()
gnb.fit(X=X_train, y=Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [13]:
# Vectorise the held-out messages with the vocabulary already fitted on the
# training split. Despite its name, X_train2 holds the *test* features; the name
# is kept because a later cell reads it.
X_train2 = v.transform(X_test).toarray()
predicted = gnb.predict(X_train2)
print(f"Accuracy: {metrics.accuracy_score(Y_test, predicted)}")

# average=None yields one score per class instead of a single aggregate.
precision = precision_score(Y_test, predicted, average=None)
recall = recall_score(Y_test, predicted, average=None)

print(f'precision: {precision}')
print(f'recall: {recall}')

Accuracy: 0.9025974025974026
precision: [0.95061728 0.84931507]
recall: [0.875      0.93939394]


**Decision Tree**

In [14]:
from sklearn.tree import DecisionTreeClassifier

# Fit a depth-limited decision tree (Gini impurity) on the dense training count
# matrix; the fixed random_state keeps the fitted tree reproducible.
dt = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    random_state=133,
)
dt.fit(X=X_train, y=Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=10, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=133, splitter='best')

In [15]:
# Score the decision tree on the held-out features computed in the Naive Bayes
# evaluation cell (X_train2 holds the vectorised test messages).
predicted = dt.predict(X_train2)
print(f"Accuracy: {metrics.accuracy_score(Y_test, predicted)}")

# average=None yields one score per class instead of a single aggregate.
precision = precision_score(Y_test, predicted, average=None)
recall = recall_score(Y_test, predicted, average=None)

print(f'precision: {precision}')
print(f'recall: {recall}')

Accuracy: 0.9090909090909091
precision: [0.88541667 0.94827586]
recall: [0.96590909 0.83333333]


**Optional Exercise:**
Try this on the full spam.csv file, and use bigram matching instead of unigram matching (for example, `CountVectorizer(ngram_range=(2, 2))`).