In [2]:
import pandas as pd
import numpy as np
dataset = pd.read_csv('Spam Email raw text for NLP.csv')

In [3]:
dataset

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216
3,1,IMPORTANT INFORMATION:\n\n\n\nThe new domain n...,00210.050ffd105bd4e006771ee63cabc59978
4,1,This is the bottom line. If you can GIVE AWAY...,00033.9babb58d9298daa2963d4f514193d7d6
...,...,...,...
5791,0,"I'm one of the 30,000 but it's not working ver...",00609.dd49926ce94a1ea328cce9b62825bc97
5792,0,Damien Morton quoted:\n\n>W3C approves HTML 4 ...,00957.e0b56b117f3ec5f85e432a9d2a47801f
5793,0,"On Mon, 2002-07-22 at 06:50, che wrote:\n\n\n\...",01127.841233b48eceb74a825417d8d918abf8
5794,0,"Once upon a time, Manfred wrote :\n\n\n\n> I w...",01178.5c977dff972cd6eef64d4173b90307f0


In [None]:
import matplotlib.pyplot as plt

In [4]:
dataset['CATEGORY'].value_counts()

0    3900
1    1896
Name: CATEGORY, dtype: int64

In [5]:
import nltk
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
from nltk.stem import WordNetLemmatizer
lematizer = WordNetLemmatizer()

In [7]:
from nltk.corpus import stopwords
stopWords = stopwords.words('english')

In [1]:
from sklearn.model_selection import train_test_split

In [17]:
def messageToTokenList(message):
  tokenizer = nltk.RegexpTokenizer(r"\w+")
  tokenizedMessage = tokenizer.tokenize(message)
  loweredcaseMessage = [i.lower() for i in tokenizedMessage]
  lemmatizedMessage = [lematizer.lemmatize(i) for i in loweredcaseMessage]
  messageWithUsefulWords = [i for i in lemmatizedMessage if i not in stopWords]
  return messageWithUsefulWords


In [18]:
messageToTokenList("DEar ahmed, tell me what did you do?")

['dear', 'ahmed', 'tell']

In [19]:
def countOfWords(messages):
  wordCounts = {}

  for message in messages:
    messageTokens = messageToTokenList(message)
    for token in messageTokens:
      if token not in wordCounts:
        wordCounts[token] = 1
      else:
        wordCounts[token]+=1
  return wordCounts

In [21]:
words_count = countOfWords(dataset['MESSAGE'])

In [22]:
def keepToken(token,threshold,word_count):
  if token not in word_count or word_count[token]<threshold:
    return False
  else:
    return True
keepToken('investor',200,words_count)

False

In [24]:
print(len(words_count))

98990


In [25]:
def assignImportantFeatures(tokens_count):
  features = set()
  for token in tokens_count:
    if keepToken(token,600,tokens_count):
      features.add(token)
  features = list(features)
  return features

In [30]:
features = assignImportantFeatures(words_count)
indexedFeatuers = {i:j for i,j in zip(features,range(len(features)))}
indexedFeatuers

{'0': 14,
 '00': 149,
 '000': 37,
 '000000': 129,
 '0000ff': 334,
 '01': 94,
 '02': 76,
 '09': 17,
 '0pt': 228,
 '1': 19,
 '10': 376,
 '100': 361,
 '11': 46,
 '12': 150,
 '13': 185,
 '14': 246,
 '15': 289,
 '18': 340,
 '19': 232,
 '2': 198,
 '20': 332,
 '2002': 9,
 '21': 99,
 '22': 196,
 '23': 308,
 '24': 12,
 '25': 331,
 '2e': 136,
 '2ffont': 381,
 '3': 50,
 '30': 42,
 '31': 163,
 '3b': 259,
 '3c': 352,
 '3cfont': 236,
 '3d': 190,
 '3d0': 73,
 '3d2': 36,
 '3d4': 359,
 '3darial': 216,
 '3dcenter': 280,
 '3e': 175,
 '4': 191,
 '5': 192,
 '50': 174,
 '6': 366,
 '7': 335,
 '8': 142,
 '81': 213,
 '8859': 329,
 '9': 350,
 '99': 377,
 '_': 71,
 '_______________________________________________': 102,
 'a3': 116,
 'access': 53,
 'ad': 279,
 'address': 91,
 'align': 165,
 'also': 225,
 'alt': 278,
 'another': 171,
 'anyone': 61,
 'application': 101,
 'arial': 248,
 'asp': 241,
 'available': 254,
 'b': 78,
 'b5': 209,
 'back': 226,
 'background': 355,
 'based': 330,
 'bb': 274,
 'best': 156,
 'b

In [33]:
def prepareFeatures(message,features):
  featuresVector = np.zeros(len(features))
  tokensList = messageToTokenList(message)
  for token in tokensList:
    if token not in features:
      continue
    index = indexedFeatuers[token]
    featuresVector[index]+=1
  return featuresVector


In [34]:
def returnCustomizedDataset(df):
  y = df['CATEGORY'].to_numpy().astype(int)
  featuresVector = []
  for i in df['MESSAGE']:
    smallVector = prepareFeatures(i,features)
    featuresVector.append(smallVector)
  X = np.array(featuresVector).astype(int)
  return X,y

In [41]:
X,Y = returnCustomizedDataset(dataset)

In [57]:
X.shape,Y.shape

((5796, 389), (5796,))

In [43]:
Y.shape

(5796,)

In [44]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 2, 0, ..., 4, 2, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [45]:
Y

array([1, 1, 1, ..., 0, 0, 0])

In [63]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size = 0.3)

In [47]:
y_test

array([1, 0, 0, ..., 1, 0, 0])

In [50]:
X_train.shape

(4057, 389)

In [60]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X)

X_train = scaler.transform(X)


In [None]:
X_train,y_train,X_test,y_test = 

In [61]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier().fit(X, Y)


In [62]:
rf.score(X,Y)

0.9998274672187716

In [64]:
rf2 = RandomForestClassifier().fit(X_train,y_train)


In [66]:
rf2.score(X_train,y_train)

1.0

In [67]:
rf2.score(X_test,y_test)

0.9810235767682576