<p style="color:#153462; 
          font-weight: bold; 
          font-size: 30px; 
          font-family: Gill Sans, sans-serif; 
          text-align: center;">
          Count Vectorization</p>

<p style="text-align: justify; text-justify: inter-word;">
   <font size=3>
       Count vectorization creates a document-term matrix in which each cell holds the number of times a given word occurs in a given document.
       scikit-learn CountVectorizer documentation: <a href="https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html">sklearn.feature_extraction.text.CountVectorizer.html</a>
   </font>
</p>


### Importing Packages

In [1]:
import pandas as pd
import nltk
import re
from typing import List
from sklearn.feature_extraction.text import CountVectorizer
import string

### Reading Raw Data

In [2]:
# Register the local NLTK data directory before any corpus is loaded from it.
nltk.data.path.append(r"D:\Artificial_Intelligence\nat_lang_proc\nltk_data")
stopwords = nltk.corpus.stopwords.words("english")
ps = nltk.PorterStemmer()

# The file is tab-separated and read with header=None, so the two columns
# are named explicitly right after loading.
data_df = pd.read_csv(
    r"D:/Artificial_Intelligence/nat_lang_proc/data/SMSSpamCollection.tsv",
    sep="\t",
    header=None,
)
data_df.columns = ["labels", "body_text"]
data_df.head()

Unnamed: 0,labels,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### Cleaning Data

In [3]:
def clean_data(text: str) -> List[str]:
    """Turn one raw message into a list of stemmed, stopword-free tokens.

    Processing order: remove every character found in ``string.punctuation``,
    lowercase what remains, split it into runs of word characters, drop
    tokens present in the module-level ``stopwords`` list, and stem the rest
    with the module-level Porter stemmer ``ps``.

    Args:
        text: Raw message text.

    Returns:
        The stemmed tokens in their original order; repeated words are kept
        so that downstream counting sees every occurrence.
    """
    without_punc = "".join(
        [char.lower() for char in text if char not in string.punctuation]
    )
    # Raw string: "\w" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    tokens = re.findall(r"\w+", without_punc)
    # NOTE(review): punctuation is stripped before the stopword check, so a
    # contraction such as "don't" arrives here as "dont" and will not match
    # any stopword entry that contains an apostrophe -- confirm this is intended.
    stemmed_tokens = [ps.stem(word) for word in tokens if word not in stopwords]
    return stemmed_tokens

### Count Vectorization


In [13]:
# Pass the callable `clean_data` as the `analyzer` parameter.
# When a callable is given, it is used to extract the sequence of features
# from each raw, unprocessed input document.
count_vect = CountVectorizer(analyzer=clean_data)
count_vect

In [5]:
# Fit the vectorizer on every message body and transform the same messages
# into a count matrix in one step.
x_counts = count_vect.fit_transform(data_df["body_text"])

In [6]:
# Shape of the count matrix: (number of documents, number of distinct tokens).
x_counts.shape

(5568, 8106)

In [7]:
# Feature names (tokens) learned by the fitted vectorizer.
count_vect.get_feature_names_out()

array(['0', '008704050406', '0089mi', ..., 'ü', 'üll', '〨ud'],
      dtype=object)

#### Applying on smaller data

In [8]:
# Keep only the first 20 messages so the resulting matrix is small enough to inspect.
small_data = data_df["body_text"].iloc[:20]
# NOTE: this re-fits the existing `count_vect`, so from here on its feature
# names come from these 20 messages rather than from the full dataset.
x_smaller_data = count_vect.fit_transform(small_data)

In [9]:
# Shape of the count matrix built from the 20-message subset.
x_smaller_data.shape

(20, 201)

In [10]:
# Feature names after fitting on the 20-message subset only.
count_vect.get_feature_names_out()

array(['08002986030', '08452810075over18', '09061701461', '1', '100',
       '100000', '11', '12', '150pday', '16', '2', '20000', '2005',
       '21st', '3', '4', '4403ldnw1a7rw18', '4txtú120', '6day', '81010',
       '87077', '87121', '87575', '9', '900', 'aft', 'aid', 'alreadi',
       'anymor', 'appli', 'ard', 'around', 'b', 'bless', 'breather',
       'brother', 'call', 'caller', 'callertun', 'camera', 'cash',
       'chanc', 'claim', 'click', 'co', 'code', 'colour', 'comin', 'comp',
       'copi', 'cost', 'credit', 'cri', 'csh11', 'cup', 'custom', 'da',
       'date', 'dont', 'eg', 'eh', 'england', 'enough', 'entitl', 'entri',
       'even', 'fa', 'feel', 'final', 'fine', 'finish', 'first', 'free',
       'friend', 'fulfil', 'go', 'goalsteam', 'goe', 'gonna', 'gota',
       'grant', 'ha', 'help', 'hl', 'home', 'hour', 'httpwap', 'im',
       'info', 'ive', 'jackpot', 'joke', 'k', 'kim', 'kl341', 'lar',
       'latest', 'lccltd', 'like', 'link', 'live', 'lor', 'lunch',
       'mace

In [11]:
# In a sparse matrix most entries are 0. In the interest of efficient storage,
# only the non-zero elements (their positions and values) are stored.
x_smaller_data

<20x201 sparse matrix of type '<class 'numpy.int64'>'
	with 228 stored elements in Compressed Sparse Row format>

In [12]:
# Expand the sparse counts into a dense table, labelling each column with
# the corresponding feature name from the vectorizer.
df = pd.DataFrame(
    x_smaller_data.toarray(),
    columns=count_vect.get_feature_names_out(),
)
df

Unnamed: 0,08002986030,08452810075over18,09061701461,1,100,100000,11,12,150pday,16,...,winner,wkli,wonder,wont,word,wwwdbuknet,xxxmobilemovieclub,xxxmobilemovieclubcomnqjkgighjjgcbl,ye,ü
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
