### Importing The Dataset

In [88]:
import chardet

encoding_train = ""
encoding_test = ""

with open("./TrainingData.csv", "rb") as f:
    result = chardet.detect(f.read())
    encoding_train = result.get('encoding', 'UTF-8')

with open("./TestData.csv", "rb") as f:
    result = chardet.detect(f.read())
    encoding_test = result.get('encoding', 'UTF-8')

print('ENCODING')
print("Train: " + encoding_train)
print("Test: " + encoding_test)

ENCODING
Train: Windows-1252
Test: Windows-1252


In [89]:
import pandas as pd

df_train = pd.read_csv('./TrainingData.csv', encoding=encoding)
df_test = pd.read_csv('./TestData.csv', encoding=encoding)

#### Surveying the Data

In [90]:
df_train.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [91]:
df_train["label"].value_counts()

label
ham     3381
spam     519
Name: count, dtype: int64

In [92]:
df_test.head()

Unnamed: 0,message
0,That depends. How would you like to be treated...
1,"Right on brah, see you later"
2,Waiting in e car 4 my mum lor. U leh? Reach ho...
3,Your 2004 account for 07XXXXXXXXX shows 786 un...
4,Do you want a new video handset? 750 anytime a...


In [93]:
import nltk

nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /home/miguel/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/miguel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Practice

In [94]:
# Practice
tokenizer = nltk.RegexpTokenizer(r"\w+")
test_message = "Hey,, GGggGG feet it going? <HTML><bads> bads 'randoms' badly"

test_message_tokenized = tokenizer.tokenize(test_message)
test_message_tokenized

['Hey',
 'GGggGG',
 'feet',
 'it',
 'going',
 'HTML',
 'bads',
 'bads',
 'randoms',
 'badly']

In [95]:
test_message_lowercased = [t.lower() for t in test_message_tokenized]
test_message_lowercased

['hey',
 'gggggg',
 'feet',
 'it',
 'going',
 'html',
 'bads',
 'bads',
 'randoms',
 'badly']

In [96]:
# lemmatize - sort words by grouping inflected or variant forms of the same word
# group by meaning, not by string variants

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

test_message_lemmatized_tokens = [lemmatizer.lemmatize(t) for t in test_message_lowercased]
test_message_lemmatized_tokens

['hey',
 'gggggg',
 'foot',
 'it',
 'going',
 'html',
 'bad',
 'bad',
 'randoms',
 'badly']

In [97]:
def message_to_token_list(s):
  tokens = tokenizer.tokenize(s)
  lowercased_tokens = [t.lower() for t in tokens]
  lemmatized_tokens = [lemmatizer.lemmatize(t) for t in lowercased_tokens]
  useful_tokens = [t for t in lemmatized_tokens if t not in stopwords]

  return useful_tokens

message_to_token_list(test_message)

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [98]:
from nltk.corpus import stopwords # a set of commonly used words in any language / unimportant words

stopwords = stopwords.words('english')

test_message_useful_tokens = [t for t in test_message_lemmatized_tokens if t not in stopwords]
test_message_useful_tokens

['hey', 'gggggg', 'foot', 'going', 'html', 'bad', 'bad', 'randoms', 'badly']

In [99]:
df_train['label'] = df_train['label'].map({'ham': 0, 'spam': 1})
df_train

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
3895,1,tells u 2 call 09066358152 to claim å£5000 pri...
3896,0,No. Thank you. You've been wonderful
3897,0,Otherwise had part time job na-tuition..
3898,0,ÌÏ mean it's confirmed... I tot they juz say o...


In [100]:
df_train = df_train.sample(frac=1, random_state=1)
df_train = df_train.reset_index(drop=True)

split_index = int(len(df) * 0.8)
train_df, test_df = df[:split_index], df[split_index:]

train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

train_df, test_df

(     label                                            message
 0      ham  Go until jurong point, crazy.. Available only ...
 1      ham                      Ok lar... Joking wif u oni...
 2     spam  Free entry in 2 a wkly comp to win FA Cup fina...
 3      ham  U dun say so early hor... U c already then say...
 4      ham  Nah I don't think he goes to usf, he lives aro...
 ...    ...                                                ...
 3115   ham                Uncle Abbey! Happy New Year. Abiola
 3116   ham                            Now am free call me pa.
 3117   ham  R u saying i should re order the slippers cos ...
 3118   ham                           Stop knowing me so well!
 3119   ham          Good evening! this is roger. How are you?
 
 [3120 rows x 2 columns],
     label                                            message
 0     ham   Small problem in auction:)punj now asking tiwary
 1    spam  Free entry in 2 a weekly comp for a chance to ...
 2     ham  He telling not to 

In [101]:
token_counter = {}

for message in train_df['message']:
  message_as_token_lst = message_to_token_list(message)

  for token in message_as_token_lst:
    if token in token_counter:
      token_counter[token] += 1
    else:
      token_counter[token] = 1

len(token_counter)

5901

In [102]:
token_counter

{'go': 171,
 'jurong': 1,
 'point': 19,
 'crazy': 9,
 'available': 10,
 'bugis': 5,
 'n': 88,
 'great': 63,
 'world': 28,
 'la': 4,
 'e': 61,
 'buffet': 2,
 'cine': 5,
 'got': 133,
 'amore': 1,
 'wat': 54,
 'ok': 172,
 'lar': 23,
 'joking': 2,
 'wif': 13,
 'u': 684,
 'oni': 2,
 'free': 152,
 'entry': 18,
 '2': 293,
 'wkly': 9,
 'comp': 7,
 'win': 51,
 'fa': 4,
 'cup': 7,
 'final': 14,
 'tkts': 4,
 '21st': 3,
 'may': 32,
 '2005': 3,
 'text': 120,
 '87121': 3,
 'receive': 24,
 'question': 20,
 'std': 5,
 'txt': 112,
 'rate': 19,
 'c': 103,
 'apply': 16,
 '08452810075over18': 2,
 'dun': 32,
 'say': 69,
 'early': 24,
 'hor': 1,
 'already': 56,
 'nah': 8,
 'think': 91,
 'usf': 5,
 'life': 44,
 'around': 33,
 'though': 14,
 'freemsg': 9,
 'hey': 57,
 'darling': 4,
 '3': 59,
 'week': 82,
 'word': 34,
 'back': 82,
 'like': 141,
 'fun': 14,
 'still': 86,
 'tb': 2,
 'xxx': 20,
 'chgs': 1,
 'send': 121,
 'å': 179,
 '1': 100,
 '50': 30,
 'rcv': 2,
 'even': 37,
 'brother': 10,
 'speak': 23,
 'treat

In [103]:
def keep_token(proccessed_token, threshold):
  if proccessed_token not in token_counter:
    return False
  else:
    return token_counter[proccessed_token] > threshold

keep_token('pls', 10)

True

In [104]:
features = set()

for token in token_counter:
  if keep_token(token, 150):
    features.add(token)

features

{'2',
 '4',
 'call',
 'day',
 'free',
 'get',
 'go',
 'gt',
 'know',
 'lt',
 'ok',
 'u',
 'ur',
 'å'}

In [105]:
features = list(features)
features

['4',
 'call',
 'free',
 '2',
 'get',
 'lt',
 'day',
 'gt',
 'ok',
 'ur',
 'know',
 'å',
 'go',
 'u']

In [106]:
token_to_index_mapping = {t:i for t, i in zip(features, range(len(features)))}
token_to_index_mapping 

{'4': 0,
 'call': 1,
 'free': 2,
 '2': 3,
 'get': 4,
 'lt': 5,
 'day': 6,
 'gt': 7,
 'ok': 8,
 'ur': 9,
 'know': 10,
 'å': 11,
 'go': 12,
 'u': 13}

In [107]:
import numpy as np

def message_to_count_vector(message):
  count_vector = np.zeros(len(features))

  processed_list_of_tokens = message_to_token_list(message)

  for token in processed_list_of_tokens:
    if token not in features:
      continue
    index = token_to_index_mapping[token]
    count_vector[index] += 1
  
  return count_vector

message_to_count_vector('Go until jurong point, crazy.. Available only ')

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.])

In [108]:
message_to_count_vector(train_df['message'].iloc[3])

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 2.])

In [109]:
train_df.iloc[3]

label                                                    ham
message    U dun say so early hor... U c already then say...
Name: 3, dtype: object

In [110]:
def df_to_X_y(dff):
  y = dff['label'].to_numpy().astype(int)

  message_col = dff['message']
  count_vectors = []

  for message in message_col:
    count_vector = message_to_count_vector(message)
    count_vectors.append(count_vector)

  X = np.array(count_vectors).astype(int)

  return X, y

In [112]:
train_df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
3115,ham,Uncle Abbey! Happy New Year. Abiola
3116,ham,Now am free call me pa.
3117,ham,R u saying i should re order the slippers cos ...
3118,ham,Stop knowing me so well!


In [115]:
train_df['label'] = train_df['label'].map({'ham': 0, 'spam': 1})
X_train, y_train = df_to_X_y(train_df)

test_df['label'] = test_df['label'].map({'ham': 0, 'spam': 1})
X_test, y_test = df_to_X_y(test_df)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3120, 14), (3120,), (780, 14), (780,))

In [116]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

X_train

array([[0.        , 0.        , 0.        , ..., 0.        , 0.33333333,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.11111111],
       [0.        , 0.        , 0.33333333, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.11111111],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [117]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

lr = LogisticRegression().fit(X_train, y_train)
print(classification_report(y_test, lr.predict(X_test)))

              precision    recall  f1-score   support

           0       0.92      0.99      0.95       681
           1       0.84      0.41      0.55        99

    accuracy                           0.92       780
   macro avg       0.88      0.70      0.75       780
weighted avg       0.91      0.92      0.90       780

