# SPAM Text Classification

- N-Gram, Bi-Gram etc
- Bag of Words (BoW)
- Term Frequency Calculation (TF)
- Inverse Document Frequency (IDF)
- Term Frequency–Inverse Document Frequency (TF-IDF)
- Text Classification

In [11]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Toy corpus: two short documents fed to the vectorizer examples below.
text = [
    'Which book is this',
    'this is book and this is math',
]

In [5]:
# Learn the vocabulary from the corpus, then encode every document as a row of
# per-term counts (a sparse document-term matrix).
cv = CountVectorizer().fit(text)
count = cv.transform(text)
count

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [7]:
count.toarray()

array([[0, 1, 1, 0, 1, 1],
       [1, 1, 2, 1, 2, 0]], dtype=int64)

In [10]:
cv.get_feature_names_out()

array(['and', 'book', 'is', 'math', 'this', 'which'], dtype=object)

## Bag of Words

In [14]:
# Label the dense count matrix with the vocabulary so that each column is a term.
bow_df = pd.DataFrame(
    data=count.toarray(),
    columns=cv.get_feature_names_out(),
)

In [16]:
bow_df

Unnamed: 0,and,book,is,math,this,which
0,0,1,1,0,1,1
1,1,1,2,1,2,0


## Term Frequency
TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)

In [17]:
bow_df

Unnamed: 0,and,book,is,math,this,which
0,0,1,1,0,1,1
1,1,1,2,1,2,0


In [18]:
bow_df.shape

(2, 6)

In [19]:
# Start the TF table from a copy so the raw counts in bow_df stay untouched.
tf = bow_df.copy()

In [30]:
# Walk the table one document at a time: iterrows() yields (row label, row Series).
for doc_label, term_counts in tf.iterrows():
    print(doc_label, end='\n\n')
    print(term_counts, end='\n\n\n')

0

and      0
book     1
is       1
math     0
this     1
which    1
Name: 0, dtype: int64


1

and      1
book     1
is       2
math     1
this     2
which    0
Name: 1, dtype: int64




In [41]:
# Inspect what enumerate(tf.iterrows()) yields: a running position plus the
# (row label, row Series) pair produced by iterrows().
for position, labelled_row in enumerate(tf.iterrows()):
    term_counts = labelled_row[1]  # the Series half of the pair
    print(position, end='\n\n')
    print(labelled_row)
    print('-------')
    print(term_counts)
    print('----------------------')
    print(term_counts.index, end='\n\n\n')

0

(0, and      0
book     1
is       1
math     0
this     1
which    1
Name: 0, dtype: int64)
-------
and      0
book     1
is       1
math     0
this     1
which    1
Name: 0, dtype: int64
----------------------
Index(['and', 'book', 'is', 'math', 'this', 'which'], dtype='object')


1

(1, and      1
book     1
is       2
math     1
this     2
which    0
Name: 1, dtype: int64)
-------
and      1
book     1
is       2
math     1
this     2
which    0
Name: 1, dtype: int64
----------------------
Index(['and', 'book', 'is', 'math', 'this', 'which'], dtype='object')




In [42]:
# Term frequency: each count divided by the total number of terms in its document.
# Computed in one vectorised step from the raw counts, so the result is float from
# the start (no float-into-int cell assignment), rows are matched by index label
# rather than by enumerate() position, and re-running the cell gives the same table.
doc_lengths = bow_df.sum(axis=1)  # total terms per document (row totals)
tf = bow_df.div(doc_lengths, axis=0)
        
        

In [43]:
tf

Unnamed: 0,and,book,is,math,this,which
0,0.0,0.25,0.25,0.0,0.25,0.25
1,0.142857,0.142857,0.285714,0.142857,0.285714,0.0


## Inverse Document Frequency

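- IDF(t) = log_e(Total number of documents / Number of documents with term t in it). The code below uses the smoothed variant log_e((1 + N) / (1 + n)) + 1, which reproduces the `idf_` values reported by scikit-learn's `TfidfVectorizer` further down.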
- N - Total number of rows or documents
- n - number of rows or documents in which the word is present


In [44]:
# For n (the number of documents in which the word is present),
# we convert the counts to booleans: True where the word occurs in the document.

In [45]:
# Presence/absence table: every non-zero count becomes True.
bow_bol = bow_df.astype(bool)

In [46]:
bow_bol

Unnamed: 0,and,book,is,math,this,which
0,False,True,True,False,True,True
1,True,True,True,True,True,False


In [49]:
# Document frequency of 'is': the number of documents that contain the word
bow_bol['is'].sum()

2

In [50]:
# Vocabulary terms in column order; used below to visit each term in turn.
cols = bow_bol.columns

In [51]:
cols

Index(['and', 'book', 'is', 'math', 'this', 'which'], dtype='object')

In [52]:
# Document frequency per term: in how many documents each word appears.
non_zero = [bow_bol[term].sum() for term in cols]

In [53]:
non_zero

[1, 2, 2, 1, 2, 1]

In [55]:
# Smoothed inverse document frequency, one value per vocabulary term:
#   idf(t) = ln((1 + N) / (1 + df(t))) + 1
# The "+1" terms inside the log act as one extra document containing every term,
# and the trailing "+1" keeps terms that occur in every document from getting weight 0.
N = bow_bol.shape[0]  # number of documents, taken from the data instead of hard-coded
idf = [np.log((N + 1) / (doc_freq + 1)) + 1 for doc_freq in non_zero]

In [56]:
idf

[1.4054651081081644, 1.0, 1.0, 1.4054651081081644, 1.0, 1.4054651081081644]

## TFIDF = TF * IDF

In [57]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [58]:
text

['Which book is this', 'this is book and this is math']

In [59]:
# Fit the vocabulary and IDF weights on the corpus, then encode the same documents.
tfidf = TfidfVectorizer().fit(text)
X = tfidf.transform(text)

In [60]:
X

<2x6 sparse matrix of type '<class 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [64]:
# Dense form of the TF-IDF matrix: one row per document, one column per vocabulary term
print(X.toarray())

[[0.         0.44832087 0.44832087 0.         0.44832087 0.63009934]
 [0.39054766 0.27787788 0.55575576 0.39054766 0.55575576 0.        ]]


In [62]:
# Keep the dense ndarray form of the TF-IDF matrix for inspection
X_tfidf = X.toarray()

In [63]:
X_tfidf.shape

(2, 6)

In [65]:
# IDF weights learned by the fitted TfidfVectorizer, one value per vocabulary term
print(tfidf.idf_)

[1.40546511 1.         1.         1.40546511 1.         1.40546511]


In [67]:
# Manually calculated IDF list from the earlier cell, shown for comparison with tfidf.idf_
idf

[1.4054651081081644, 1.0, 1.0, 1.4054651081081644, 1.0, 1.4054651081081644]