# Vectorisation

I decided to split the files being that some steps that where covered to understand the under workings were more pythonically done in here, i.e. the use of lambda to apply tokenisation was done by a function the sklearn library
Taking the tokenised stemmed or lemmatised text row by row or say line by line and counting eah word occurence as they appear all through the document, counting each word per cell.
This words form columns and are uniquely occuring through the dataset

In [1]:
import pandas as pd
import nltk
import string
import re
import csv


In [2]:
# read dataset in
df = pd.read_csv('SMSSpamCollection.tsv', sep='\t', header=None)

# set headers
df.columns = ['label', 'body_text']
df.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### Load punctuations, stopwords and parser the stemming function into an object

In [3]:
pd.set_option('display.max_colwidth', 100)
punct = string.punctuation
stop_words = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
print(punct, "\n", stop_words)


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~ 
 ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'on

### Create function to remove punctuations, tokenise text, remove stopwords and preform stemming

In [4]:
def cleaning(text):
    text = "".join([word.lower() for word in text if word not in punct])
    tokens = re.split('\W+', text)
    text = [ps.stem(word) for word in tokens if word not in stop_words]
    return text

### Apply count vectorise and fit and transform it on the body text

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# store the vectoriser in an object ( this is called instantiation of object)
count_vect = CountVectorizer(analyzer=cleaning)

# fit transform data to document term matrix
X_counts = count_vect.fit_transform(df['body_text'])

print(type(X_counts))
print(X_counts.shape)
print(count_vect.get_feature_names_out())

<class 'scipy.sparse._csr.csr_matrix'>
(5568, 8107)
['' '0' '008704050406' ... 'ü' 'üll' '〨ud']


In [6]:
# get smaller data
df_sample = df[0:20] 
df_sample.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [7]:
# instantiate vectoriser into an object
count_vect_sample = CountVectorizer(analyzer=cleaning)

# fit and transform by applying vectoriser to the stemmed text to make document term matrix
X_counts_sample = count_vect_sample.fit_transform(df_sample['body_text'])

print(X_counts_sample.shape)
print(count_vect_sample.get_feature_names_out())


(20, 201)
['08002986030' '08452810075over18' '09061701461' '1' '100' '100000' '11'
 '12' '150pday' '16' '2' '20000' '2005' '21st' '3' '4' '4403ldnw1a7rw18'
 '4txtú120' '6day' '81010' '87077' '87121' '87575' '9' '900' 'aft' 'aid'
 'alreadi' 'anymor' 'appli' 'ard' 'around' 'b' 'bless' 'breather'
 'brother' 'call' 'caller' 'callertun' 'camera' 'cash' 'chanc' 'claim'
 'click' 'co' 'code' 'colour' 'comin' 'comp' 'copi' 'cost' 'credit' 'cri'
 'csh11' 'cup' 'custom' 'da' 'date' 'dont' 'eg' 'eh' 'england' 'enough'
 'entitl' 'entri' 'even' 'fa' 'feel' 'final' 'fine' 'finish' 'first'
 'free' 'friend' 'fulfil' 'go' 'goalsteam' 'goe' 'gonna' 'gota' 'grant'
 'ha' 'help' 'hl' 'home' 'hour' 'httpwap' 'im' 'info' 'ive' 'jackpot'
 'joke' 'k' 'kim' 'kl341' 'lar' 'latest' 'lccltd' 'like' 'link' 'live'
 'lor' 'lunch' 'macedonia' 'make' 'may' 'mell' 'membership' 'messag'
 'minnaminungint' 'miss' 'mobil' 'month' 'nah' 'name' 'nation' 'naughti'
 'network' 'news' 'next' 'nurungu' 'oh' 'oru' 'patent' 'pay' 'pe

#### When you have a matrix or data with most output of Zeros it is reffered to as sparse matrrices. It will be inefficient to store all the 0 values, instead we store the location and values of non-zero elements

In [8]:
# to view array we have to expand the matrix into a dataframe
X_count_df = pd.DataFrame(X_counts_sample.toarray())

X_count_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,200
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


In [9]:
X_count_df.columns = count_vect_sample.get_feature_names_out()
X_count_df

Unnamed: 0,08002986030,08452810075over18,09061701461,1,100,100000,11,12,150pday,16,...,winner,wkli,wonder,wont,word,wwwdbuknet,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,ye,ü
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


## N-GRAMs

#### Definition
create document term matrix still with the count, but this time instead of returning single words in columns, they represent combinations of adjacent words defined by length n in the body-text

In [10]:
df.head()

Unnamed: 0,label,body_text
0,ham,I've been searching for the right words to thank you for this breather. I promise i wont take yo...
1,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
2,ham,"Nah I don't think he goes to usf, he lives around here though"
3,ham,Even my brother is not like to speak with me. They treat me like aids patent.
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


In [17]:

# create funtion to remove punctuation, change to lowercase, tokens words by spliting them with re, remove stop words and preform steming or lematisation
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in punct])
    tokens = re.split('\W+', text)
    text = " ".join([ps.stem(word) for word in text if word not in stop_words])
    return tokens


data_ngrams = df['body_text'].apply(lambda x: clean_text(x))
data_ngrams.head()

0    [ive, been, searching, for, the, right, words, to, thank, you, for, this, breather, i, promise, ...
1    [free, entry, in, 2, a, wkly, comp, to, win, fa, cup, final, tkts, 21st, may, 2005, text, fa, to...
2                              [nah, i, dont, think, he, goes, to, usf, he, lives, around, here, though]
3           [even, my, brother, is, not, like, to, speak, with, me, they, treat, me, like, aids, patent]
4                                                             [i, have, a, date, on, sunday, with, will]
Name: body_text, dtype: object