In [1]:
import tensorflow as tf
#from tf.keras.models import Sequential
#from tf.keras.layers import Dense
import os
import io

tf.__version__

'2.12.0'

In [2]:
# Download the zip file
path_to_zip = tf.keras.utils.get_file("smsspamcollection.zip",
                  origin="https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip",
                  extract=True)

# Unzip the file into a folder
!unzip $path_to_zip -d data

Downloading data from https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
   8192/Unknown - 0s 0us/stepArchive:  /root/.keras/datasets/smsspamcollection.zip
  inflating: data/SMSSpamCollection  
  inflating: data/readme             


In [3]:
lines = io.open('data/SMSSpamCollection').read().strip().split('\n')
lines[0]

'ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

# Pre-Process Data

In [4]:
spam_dataset = []
count = 0
for line in lines:
  label, text = line.split('\t')
  if label.lower().strip() == 'spam':
    spam_dataset.append((1, text.strip()))
    count += 1
  else:
    spam_dataset.append(((0, text.strip())))

print(spam_dataset[0])
print("Spam: ", count) # number of spams

(0, 'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...')
Spam:  747


## Data Normalization

In [5]:
import pandas as pd

In [6]:
df = pd.DataFrame(spam_dataset,columns=['Spam','Message'])

In [7]:
df.head(5)

Unnamed: 0,Spam,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
import re

# Normalization functions

def message_length(x):
  # returns total number of characters
  return len(x)

def num_capitals(x):
  _, count = re.subn(r'[A-Z]', '', x) # only works in english
  return count

def num_punctuation(x):
  _, count = re.subn(r'\W', '', x)
  return count

In [9]:
df['Capitals'] = df['Message'].apply(num_capitals)
df['Punctuation'] = df['Message'].apply(num_punctuation)
df['Length'] = df['Message'].apply(message_length)

In [10]:
df.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,5574.0,5574.0,5574.0,5574.0
mean,0.134015,5.621636,18.942591,80.443488
std,0.340699,11.683233,14.825994,59.841746
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,36.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


In [11]:
train=df.sample(frac=0.8,random_state=42) #randomly select 80% of all data
test=df.drop(train.index)

In [12]:
train.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,4459.0,4459.0,4459.0,4459.0
mean,0.132765,5.519399,18.886522,80.316439
std,0.339359,11.405424,14.602023,59.346407
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,35.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,27.0,122.0
max,1.0,129.0,253.0,910.0


In [13]:
test.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length
count,1115.0,1115.0,1115.0,1115.0
mean,0.139013,6.030493,19.166816,80.95157
std,0.346116,12.731059,15.694599,61.807655
min,0.0,0.0,0.0,2.0
25%,0.0,1.0,8.0,36.0
50%,0.0,2.0,15.0,61.0
75%,0.0,4.0,28.0,123.0
max,1.0,127.0,195.0,790.0


# Model Building

In [14]:
# Basic 1-layer neural network model for evaluation
def make_model(input_dims=3, num_units=12):
  model = tf.keras.Sequential()

  # Adds a densely-connected layer with 12 units to the model:
  model.add(tf.keras.layers.Dense(num_units,
                                  input_dim=input_dims,
                                  activation='relu'))

  # Add a sigmoid layer with a binary output unit:
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
  model.compile(loss='binary_crossentropy', optimizer='adam',
                metrics=['accuracy'])
  return model

In [15]:
x_train = train[['Length', 'Punctuation', 'Capitals']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals']]
y_test = test[['Spam']]

In [16]:
x_train

Unnamed: 0,Length,Punctuation,Capitals
3690,25,4,1
3527,161,48,107
724,40,7,1
3370,69,17,3
468,37,8,1
...,...,...,...
3280,444,114,44
3186,65,14,50
3953,81,23,2
2768,38,8,2


In [17]:
model = make_model()
model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7a77ac4bf7c0>

In [18]:
model.evaluate(x_test, y_test)



[0.2675186097621918, 0.9058296084403992]

In [19]:
y_train_pred = model.predict(x_train)



In [20]:
# confusion matrix
tf.math.confusion_matrix(tf.constant(y_train.Spam),
                         y_train_pred)

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[3867,    0],
       [ 592,    0]], dtype=int32)>

In [21]:
sum(y_train_pred)

array([348.97226], dtype=float32)

In [22]:
y_test_pred = model.predict(x_test)
tf.math.confusion_matrix(tf.constant(y_test.Spam), y_test_pred)



<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[960,   0],
       [155,   0]], dtype=int32)>

# Tokenization and Stop Word Removal

In [23]:
sentence = 'Go until jurong point, crazy.. Available only in bugis n great world'
sentence.split()

['Go',
 'until',
 'jurong',
 'point,',
 'crazy..',
 'Available',
 'only',
 'in',
 'bugis',
 'n',
 'great',
 'world']

In [24]:
!pip install stanza  # StanfordNLP has become https://github.com/stanfordnlp/stanza/

Collecting stanza
  Downloading stanza-1.5.0-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.5/802.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting emoji (from stanza)
  Downloading emoji-2.7.0.tar.gz (361 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m361.8/361.8 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (pyproject.toml) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.7.0-py2.py3-none-any.whl size=356563 sha256=908051be1d2e87cd5af9a8baada649f61c57805aad063b5f2b512f14460cbd7c
  Stored in directory: /root/.cache/pip/wheels/41/11/48/5df0b9727d5669c9174a141134f10304d1d78a3b89a4676f3d
Successfully built emoji
Installing collected packages: emoj

In [25]:
import stanza
en = stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.


In [26]:
en = stanza.Pipeline(lang='en')

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| constituency | wsj       |
| depparse     | combined  |
| sentiment    | sstplus   |
| ner          | ontonotes |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [27]:
sentence

'Go until jurong point, crazy.. Available only in bugis n great world'

In [28]:
tokenized = en(sentence)
len(tokenized.sentences)

2

In [29]:
for snt in tokenized.sentences:
  for word in snt.tokens:
    print(word.text)
  print("<End of Sentence>")

Go
until
jurong
point
,
crazy
..
<End of Sentence>
Available
only
in
bugis
n
great
world
<End of Sentence>


# Dependency Parsing Example

In [30]:
en2 = stanza.Pipeline(lang='en')
pr2 = en2("Hari went to school")
for snt in pr2.sentences:
  for word in snt.tokens:
    print(word)
  print("<End of Sentence>")

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| constituency | wsj       |
| depparse     | combined  |
| sentiment    | sstplus   |
| ner          | ontonotes |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


[
  {
    "id": 1,
    "text": "Hari",
    "lemma": "Hari",
    "upos": "PROPN",
    "xpos": "NNP",
    "feats": "Number=Sing",
    "head": 2,
    "deprel": "nsubj",
    "start_char": 0,
    "end_char": 4,
    "ner": "S-PERSON",
    "multi_ner": [
      "S-PERSON"
    ]
  }
]
[
  {
    "id": 2,
    "text": "went",
    "lemma": "go",
    "upos": "VERB",
    "xpos": "VBD",
    "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin",
    "head": 0,
    "deprel": "root",
    "start_char": 5,
    "end_char": 9,
    "ner": "O",
    "multi_ner": [
      "O"
    ]
  }
]
[
  {
    "id": 3,
    "text": "to",
    "lemma": "to",
    "upos": "ADP",
    "xpos": "IN",
    "head": 4,
    "deprel": "case",
    "start_char": 10,
    "end_char": 12,
    "ner": "O",
    "multi_ner": [
      "O"
    ]
  }
]
[
  {
    "id": 4,
    "text": "school",
    "lemma": "school",
    "upos": "NOUN",
    "xpos": "NN",
    "feats": "Number=Sing",
    "head": 2,
    "deprel": "obl",
    "start_char": 13,
    "

# Adding Word Count Feature

In [31]:
def word_counts(x, pipeline=en):
  doc = pipeline(x)
  count = sum( [ len(sentence.tokens) for sentence in doc.sentences] )
  return count

#en = snlp.Pipeline(lang='en', processors='tokenize')
df['Words'] = df['Message'].apply(word_counts)

df.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words
count,5574.0,5574.0,5574.0,5574.0,5574.0
mean,0.134015,5.621636,18.942591,80.443488,18.691424
std,0.340699,11.683233,14.825994,59.841746,13.727627
min,0.0,0.0,0.0,2.0,1.0
25%,0.0,1.0,8.0,36.0,9.0
50%,0.0,2.0,15.0,61.0,15.0
75%,0.0,4.0,27.0,122.0,27.0
max,1.0,129.0,253.0,910.0,206.0


In [32]:
train['Words'] = train['Message'].apply(word_counts)
test['Words'] = test['Message'].apply(word_counts)
x_train = train[['Length', 'Punctuation', 'Capitals', 'Words']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals' , 'Words']]
y_test = test[['Spam']]

model = make_model(input_dims=4)
model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7a77b2586020>

In [33]:
model.evaluate(x_test, y_test)



[0.19846950471401215, 0.9354259967803955]

# Stop Word Removal

In [34]:
!pip install stopwordsiso

Collecting stopwordsiso
  Downloading stopwordsiso-0.6.1-py3-none-any.whl (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.5/73.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: stopwordsiso
Successfully installed stopwordsiso-0.6.1


In [35]:
import stopwordsiso as stopwords

stopwords.langs()

{'af',
 'ar',
 'bg',
 'bn',
 'br',
 'ca',
 'cs',
 'da',
 'de',
 'el',
 'en',
 'eo',
 'es',
 'et',
 'eu',
 'fa',
 'fi',
 'fr',
 'ga',
 'gl',
 'gu',
 'ha',
 'he',
 'hi',
 'hr',
 'hu',
 'hy',
 'id',
 'it',
 'ja',
 'ko',
 'ku',
 'la',
 'lt',
 'lv',
 'mr',
 'ms',
 'nl',
 'no',
 'pl',
 'pt',
 'ro',
 'ru',
 'sk',
 'sl',
 'so',
 'st',
 'sv',
 'sw',
 'th',
 'tl',
 'tr',
 'uk',
 'ur',
 'vi',
 'yo',
 'zh',
 'zu'}

In [36]:
sorted(stopwords.stopwords('en'))

["'ll",
 "'tis",
 "'twas",
 "'ve",
 '10',
 '39',
 'a',
 "a's",
 'able',
 'ableabout',
 'about',
 'above',
 'abroad',
 'abst',
 'accordance',
 'according',
 'accordingly',
 'across',
 'act',
 'actually',
 'ad',
 'added',
 'adj',
 'adopted',
 'ae',
 'af',
 'affected',
 'affecting',
 'affects',
 'after',
 'afterwards',
 'ag',
 'again',
 'against',
 'ago',
 'ah',
 'ahead',
 'ai',
 "ain't",
 'aint',
 'al',
 'all',
 'allow',
 'allows',
 'almost',
 'alone',
 'along',
 'alongside',
 'already',
 'also',
 'although',
 'always',
 'am',
 'amid',
 'amidst',
 'among',
 'amongst',
 'amoungst',
 'amount',
 'an',
 'and',
 'announce',
 'another',
 'any',
 'anybody',
 'anyhow',
 'anymore',
 'anyone',
 'anything',
 'anyway',
 'anyways',
 'anywhere',
 'ao',
 'apart',
 'apparently',
 'appear',
 'appreciate',
 'appropriate',
 'approximately',
 'aq',
 'ar',
 'are',
 'area',
 'areas',
 'aren',
 "aren't",
 'arent',
 'arise',
 'around',
 'arpa',
 'as',
 'aside',
 'ask',
 'asked',
 'asking',
 'asks',
 'associated

In [37]:
en_sw = stopwords.stopwords('en')

def word_counts(x, pipeline=en):
  doc = pipeline(x)
  count = 0
  for sentence in doc.sentences:
    for token in sentence.tokens:
        if token.text.lower() not in en_sw:
          count += 1
  return count

#Bu metod, metin belgesindeki İngilizce stop kelimeleri hariç olmak üzere toplam kelime sayısını döndürür.
#Bu sayede, metinlerdeki gerçek kelime içeriği analiz edilirken gereksiz veya sık kullanılan kelimeler dikkate alınmaz.

train['Words'] = train['Message'].apply(word_counts)
test['Words'] = test['Message'].apply(word_counts)
x_train = train[['Length', 'Punctuation', 'Capitals', 'Words']]
y_train = train[['Spam']]

x_test = test[['Length', 'Punctuation', 'Capitals' , 'Words']]
y_test = test[['Spam']]

model = make_model(input_dims=4)
#model = make_model(input_dims=3)

model.fit(x_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7a74a2dddb40>

# POS Based Features

In [38]:
en = stanza.Pipeline(lang='en')

txt = "Yo you around? A friend of mine's lookin."
pos = en(txt)

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| constituency | wsj       |
| depparse     | combined  |
| sentiment    | sstplus   |
| ner          | ontonotes |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: constituency
INFO:stanza:Loading: depparse
INFO:stanza:Loading: sentiment
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


In [39]:
def print_pos(doc):
    text = ""
    for sentence in doc.sentences:
        for token in sentence.tokens:
            text += token.words[0].text + "/" + \
                    token.words[0].upos + " "
        text += "\n"
    return text
print(print_pos(pos))

Yo/PRON you/PRON around/ADV ?/PUNCT 
A/DET friend/NOUN of/ADP mine/PRON 's/PART lookin/NOUN ./PUNCT 



In [40]:
en_sw = stopwords.stopwords('en')

def word_counts_v3(x, pipeline=en):
  doc = pipeline(x)
  count = 0
  for sentence in doc.sentences:
    for token in sentence.tokens:
        if token.text.lower() not in en_sw and \
        token.words[0].upos not in ['PUNCT', 'SYM']:
          count += 1
  return count
print(word_counts(txt), word_counts_v3(txt))

6 4


In [41]:
train['Test'] = 0
train.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,0.132765,5.519399,18.886522,80.316439,9.164611,0.0
std,0.339359,11.405424,14.602023,59.346407,7.902972,0.0
min,0.0,0.0,0.0,2.0,0.0,0.0
25%,0.0,1.0,8.0,35.0,4.0,0.0
50%,0.0,2.0,15.0,61.0,7.0,0.0
75%,0.0,4.0,27.0,122.0,13.0,0.0
max,1.0,129.0,253.0,910.0,147.0,0.0


In [42]:
def word_counts_v3(x, pipeline=en):
  doc = pipeline(x)
  totals = 0.
  count = 0.
  non_word = 0.
  for sentence in doc.sentences:
    totals += len(sentence.tokens)  # (1)
    for token in sentence.tokens:
        if token.text.lower() not in en_sw:
          if token.words[0].upos not in ['PUNCT', 'SYM']:
            count += 1.
          else:
            non_word += 1.
  non_word = non_word / totals
  return pd.Series([count, non_word], index=['Words_NoPunct', 'Punct'])
x = train[:10]
x.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.0,14.4,18.3,72.7,8.4,0.0
std,0.0,32.948445,14.772723,50.36103,10.046558,0.0
min,0.0,1.0,4.0,23.0,2.0,0.0
25%,0.0,1.0,7.25,37.75,3.0,0.0
50%,0.0,1.5,13.0,57.0,4.0,0.0
75%,0.0,9.0,23.75,88.0,10.5,0.0
max,0.0,107.0,48.0,161.0,35.0,0.0


In [43]:
train_tmp = train['Message'].apply(word_counts_v3)
train = pd.concat([train, train_tmp], axis=1)
train.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,0.132765,5.519399,18.886522,80.316439,9.164611,0.0,6.47791,0.144635
std,0.339359,11.405424,14.602023,59.346407,7.902972,0.0,5.694648,0.089125
min,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,8.0,35.0,4.0,0.0,3.0,0.090909
50%,0.0,2.0,15.0,61.0,7.0,0.0,5.0,0.142857
75%,0.0,4.0,27.0,122.0,13.0,0.0,9.0,0.2
max,1.0,129.0,253.0,910.0,147.0,0.0,72.0,0.777778


In [44]:
test_tmp = test['Message'].apply(word_counts_v3)
test = pd.concat([test, test_tmp], axis=1)
test.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Words_NoPunct,Punct
count,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0,1115.0
mean,0.139013,6.030493,19.166816,80.95157,9.418834,6.669955,0.147059
std,0.346116,12.731059,15.694599,61.807655,8.150151,5.864917,0.095655
min,0.0,0.0,0.0,2.0,0.0,0.0,0.0
25%,0.0,1.0,8.0,36.0,4.0,3.0,0.09375
50%,0.0,2.0,15.0,61.0,7.0,5.0,0.142857
75%,0.0,4.0,28.0,123.0,13.0,9.0,0.2
max,1.0,127.0,195.0,790.0,82.0,47.0,1.0


In [45]:
z = pd.concat([x, train_tmp], axis=1)
z.describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,10.0,10.0,10.0,10.0,10.0,10.0,4459.0,4459.0
mean,0.0,14.4,18.3,72.7,8.4,0.0,6.47791,0.144635
std,0.0,32.948445,14.772723,50.36103,10.046558,0.0,5.694648,0.089125
min,0.0,1.0,4.0,23.0,2.0,0.0,0.0,0.0
25%,0.0,1.0,7.25,37.75,3.0,0.0,3.0,0.090909
50%,0.0,1.5,13.0,57.0,4.0,0.0,5.0,0.142857
75%,0.0,9.0,23.75,88.0,10.5,0.0,9.0,0.2
max,0.0,107.0,48.0,161.0,35.0,0.0,72.0,0.777778


In [46]:
z.loc[z['Spam']==0].describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.0,14.4,18.3,72.7,8.4,0.0,5.4,0.151178
std,0.0,32.948445,14.772723,50.36103,10.046558,0.0,7.381659,0.062156
min,0.0,1.0,4.0,23.0,2.0,0.0,1.0,0.0
25%,0.0,1.0,7.25,37.75,3.0,0.0,2.0,0.135442
50%,0.0,1.5,13.0,57.0,4.0,0.0,2.0,0.166667
75%,0.0,9.0,23.75,88.0,10.5,0.0,6.0,0.196875
max,0.0,107.0,48.0,161.0,35.0,0.0,25.0,0.208333


In [47]:
z.loc[z['Spam']==1].describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,,,,,,,
std,,,,,,,,
min,,,,,,,,
25%,,,,,,,,
50%,,,,,,,,
75%,,,,,,,,
max,,,,,,,,


In [48]:
aa = [word_counts_v3(y) for y in x['Message']]
ab = pd.DataFrame(aa)
ab.describe()

Unnamed: 0,Words_NoPunct,Punct
count,10.0,10.0
mean,5.4,0.151178
std,7.381659,0.062156
min,1.0,0.0
25%,2.0,0.135442
50%,2.0,0.166667
75%,6.0,0.196875
max,25.0,0.208333


## TF-IDF Based Model

In [49]:
# if not installed already
!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.post7.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0.post7-py3-none-any.whl size=2951 sha256=0df6a14b0fed1d8213d9d42582f860ac9d7b15a61cf65e43855bb30cf6271f84
  Stored in directory: /root/.cache/pip/wheels/c8/9c/85/72901eb50bc4bc6e3b2629378d172384ea3dfd19759c77fd2c
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post7


In [50]:
corpus = [
          "I like fruits. Fruits like bananas",
          "I love bananas but eat an apple",
          "An apple a day keeps the doctor away"
]

## Count Vectorization

In [67]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

vectorizer.get_feature_names_out()

array(['an', 'apple', 'away', 'bananas', 'but', 'day', 'doctor', 'eat',
       'fruits', 'keeps', 'like', 'love', 'the'], dtype=object)

In [68]:
X.toarray()


array([[0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 2, 0, 0],
       [1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0],
       [1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1]])

In [69]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(X.toarray())

array([[1.        , 0.13608276, 0.        ],
       [0.13608276, 1.        , 0.3086067 ],
       [0.        , 0.3086067 , 1.        ]])

In [70]:
query = vectorizer.transform(["apple and bananas"])

cosine_similarity(X, query)

array([[0.23570226],
       [0.57735027],
       [0.26726124]])

## TF-IDF Vectorization


In [71]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X.toarray())

pd.DataFrame(tfidf.toarray(),
             columns=vectorizer.get_feature_names_out())

Unnamed: 0,an,apple,away,bananas,but,day,doctor,eat,fruits,keeps,like,love,the
0,0.0,0.0,0.0,0.230408,0.0,0.0,0.0,0.0,0.688081,0.0,0.688081,0.0,0.0
1,0.321267,0.321267,0.0,0.321267,0.479709,0.0,0.0,0.479709,0.0,0.0,0.0,0.479709,0.0
2,0.275785,0.275785,0.411797,0.0,0.0,0.411797,0.411797,0.0,0.0,0.411797,0.0,0.0,0.411797


In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

tfidf = TfidfVectorizer(binary=True)
X = tfidf.fit_transform(train['Message']).astype('float32')
X_test = tfidf.transform(test['Message']).astype('float32')

In [73]:
X.shape

(4459, 7741)

In [74]:
from keras.utils import np_utils

_, cols = X.shape
model2 = make_model(cols)  # to match tf-idf dimensions
lb = LabelEncoder()
y = lb.fit_transform(y_train)
dummy_y_train = np_utils.to_categorical(y)
model2.fit(X.toarray(), y_train, epochs=10, batch_size=10)

  y = column_or_1d(y, warn=True)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7a74a7082260>

In [75]:
model2.evaluate(X_test.toarray(), y_test)



[0.0575413778424263, 0.9838564991950989]

In [76]:
train.loc[train.Spam == 1].describe()

Unnamed: 0,Spam,Capitals,Punctuation,Length,Words,Test,Words_NoPunct,Punct
count,592.0,592.0,592.0,592.0,592.0,592.0,592.0,592.0
mean,1.0,15.320946,29.086149,138.856419,18.195946,0.0,13.95777,0.141268
std,0.0,11.635105,7.083572,28.07998,5.968228,0.0,4.552373,0.065293
min,1.0,0.0,2.0,13.0,2.0,0.0,2.0,0.0
25%,1.0,7.0,26.0,132.0,14.0,0.0,11.0,0.096774
50%,1.0,14.0,30.0,149.0,18.0,0.0,14.0,0.137931
75%,1.0,21.0,34.0,157.0,23.0,0.0,17.0,0.181818
max,1.0,128.0,49.0,197.0,31.0,0.0,25.0,0.342105


## Word Vectors

In [77]:
# memory limit may be exceeded. Try deleting some objects before running this next section
# or copy this section to a different notebook.
!pip install gensim



In [78]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

In [79]:
api.info()

{'corpora': {'semeval-2016-2017-task3-subtaskBC': {'num_records': -1,
   'record_format': 'dict',
   'file_size': 6344358,
   'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/semeval-2016-2017-task3-subtaskB-eng/__init__.py',
   'license': 'All files released for the task are free for general research use',
   'fields': {'2016-train': ['...'],
    '2016-dev': ['...'],
    '2017-test': ['...'],
    '2016-test': ['...']},
   'description': 'SemEval 2016 / 2017 Task 3 Subtask B and C datasets contain train+development (317 original questions, 3,169 related questions, and 31,690 comments), and test datasets in English. The description of the tasks and the collected data is given in sections 3 and 4.1 of the task paper http://alt.qcri.org/semeval2016/task3/data/uploads/semeval2016-task3-report.pdf linked in section “Papers” of https://github.com/RaRe-Technologies/gensim-data/issues/18.',
   'checksum': '701ea67acd82e75f95e1d8e62fb0ad29',
   'file_name': 'se

In [80]:
model_w2v = api.load("word2vec-google-news-300")



In [81]:
model_w2v.most_similar("cookies",topn=10)

[('cookie', 0.745154082775116),
 ('oatmeal_raisin_cookies', 0.6887780427932739),
 ('oatmeal_cookies', 0.6621399521827698),
 ('cookie_dough_ice_cream', 0.6520504951477051),
 ('brownies', 0.6479345560073853),
 ('homemade_cookies', 0.6476464867591858),
 ('gingerbread_cookies', 0.6461867690086365),
 ('Cookies', 0.6341644525527954),
 ('cookies_cupcakes', 0.6275069117546082),
 ('cupcakes', 0.6258295178413391)]

In [82]:
model_w2v.doesnt_match(["USA","Canada","India","Tokyo"])

'Tokyo'

In [83]:
king = model_w2v['king']
man = model_w2v['man']
woman = model_w2v['woman']

queen = king - man + woman
model_w2v.similar_by_vector(queen)

[('king', 0.8449392318725586),
 ('queen', 0.7300517559051514),
 ('monarch', 0.645466148853302),
 ('princess', 0.6156251430511475),
 ('crown_prince', 0.5818676352500916),
 ('prince', 0.5777117609977722),
 ('kings', 0.5613663792610168),
 ('sultan', 0.5376775860786438),
 ('Queen_Consort', 0.5344247817993164),
 ('queens', 0.5289887189865112)]