In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# IPython help shortcut: show the docstring of CountVectorizer.
?CountVectorizer

In [None]:
# Toy corpus: four short sentences that share most of their vocabulary but
# differ in capitalisation and trailing punctuation.
corpus = [
    "This is the first document",
    "This document is the second document",
    "And this is the third one.",
    "Is this the first document?",
]

In [None]:
# Learn the vocabulary from the corpus and build the document-term count
# matrix in one call, using CountVectorizer's default settings.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(raw_documents=corpus)

In [None]:
# Feature (token) names of the fitted vectorizer; one name per column of X.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [None]:
# Densify the count matrix so that every cell (including zeros) is printed.
print(X.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [None]:
import pandas as pd

# One row per document, one column per vocabulary term, cell = term count.
df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
df

Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0,1,1,1,0,0,1,0,1
1,0,2,0,1,0,1,1,0,1
2,1,0,0,1,1,0,1,1,1
3,0,1,1,1,0,0,1,0,1


Each text string in the corpus is represented by the counts of the words it contains, ignoring word order. This is called the 'Bag of Words' (BoW) representation.

In [None]:
# Experiment: Instantiate a CountVectorizer with 'lowercase=False' and observe the document-term matrix.
# With case folding disabled, differently-cased spellings such as 'This' and
# 'this' are counted as separate features.

vectorizer = CountVectorizer(lowercase=False)
X = vectorizer.fit_transform(corpus)

# Fetch the vocabulary once and reuse it for both the feature count and the
# DataFrame column labels.
feature_names = vectorizer.get_feature_names_out()
print(f'Number of features: {len(feature_names)}')

df = pd.DataFrame(X.toarray(), columns=feature_names)
df

Number of features: 11


Unnamed: 0,And,Is,This,document,first,is,one,second,the,third,this
0,0,0,1,1,1,1,0,0,1,0,0
1,0,0,1,2,0,1,0,1,1,0,0
2,1,0,0,0,0,1,1,0,1,1,1
3,0,1,0,1,1,0,0,0,1,0,1


In [None]:
# Case-fold the text and tokenise with a custom pattern: every maximal run of
# ASCII letters is a token; digits, whitespace and punctuation act as
# separators.
vectorizer = CountVectorizer(lowercase=True, token_pattern=r'[a-zA-Z]+')
X = vectorizer.fit_transform(corpus)

# Fetch the vocabulary once and reuse it for both the feature count and the
# DataFrame column labels.
feature_names = vectorizer.get_feature_names_out()
print(f'Number of features: {len(feature_names)}')

df = pd.DataFrame(X.toarray(), columns=feature_names)
df

Number of features: 9


Unnamed: 0,and,document,first,is,one,second,the,third,this
0,0,1,1,1,0,0,1,0,1
1,0,2,0,1,0,1,1,0,1
2,1,0,0,1,1,0,1,1,1
3,0,1,1,1,0,0,1,0,1


## Remove stop-words

In [None]:
# Same tokenisation as before, but drop every token that appears in
# scikit-learn's built-in English stop-word list before counting.
vectorizer = CountVectorizer(
    lowercase=True,
    token_pattern=r'[a-zA-Z]+',
    stop_words='english')

X = vectorizer.fit_transform(corpus)

# Fetch the vocabulary once and reuse it for both the feature count and the
# DataFrame column labels.
feature_names = vectorizer.get_feature_names_out()
print(f'Number of features: {len(feature_names)}')

df = pd.DataFrame(X.toarray(), columns=feature_names)
df

Number of features: 2


Unnamed: 0,document,second
0,1,0
1,2,1
2,0,0
3,1,0


In [None]:
# Re-declare the corpus so this cell can be run on its own.
corpus = [
    'This is the first document',
    'This document is the second document',
    'And this is the third one.',
    'Is this the first document?'
]

vectorizer = CountVectorizer(
    lowercase=True,
    ngram_range=(1,2), # (min_n, max_n): extract unigrams AND bigrams; use (2, 2) for bigrams only
    token_pattern=r'[a-zA-Z]+',
    # stop_words='english'
)

X = vectorizer.fit_transform(corpus)

# Fetch the vocabulary once and reuse it for both the feature count and the
# DataFrame column labels.
feature_names = vectorizer.get_feature_names_out()
print(f'Number of features: {len(feature_names)}')

df = pd.DataFrame(X.toarray(), columns=feature_names)
df

Number of features: 22


Unnamed: 0,and,and this,document,document is,first,first document,is,is the,is this,one,...,the,the first,the second,the third,third,third one,this,this document,this is,this the
0,0,0,1,0,1,1,1,1,0,0,...,1,1,0,0,0,0,1,0,1,0
1,0,0,2,1,0,0,1,1,0,0,...,1,0,1,0,0,0,1,1,0,0
2,1,1,0,0,0,0,1,1,0,1,...,1,0,0,1,1,1,1,0,1,0
3,0,0,1,0,1,1,1,0,1,0,...,1,1,0,0,0,0,1,0,0,1


# TfidfVectorizer

* Tf --> Term frequency
* Idf --> Inverse document frequency

$\text{tf-idf}(t, d) = \text{tf}(t, d) \times \text{idf}(t)$

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-declare the corpus so this cell can be run on its own.
corpus = [
    'This is the first document',
    'This document is the second document',
    'And this is the third one.',
    'Is this the first document?'
]

# Replace raw counts with tf-idf weights over unigrams and bigrams:
# smooth_idf=True selects the smoothed idf variant and norm='l2' rescales
# every document row to unit Euclidean length.
vectorizer = TfidfVectorizer(smooth_idf=True, norm='l2', ngram_range=(1, 2))

X = vectorizer.fit_transform(corpus)

# Fetch the vocabulary once and reuse it for both the feature count and the
# DataFrame column labels.
feature_names = vectorizer.get_feature_names_out()
print(f'Number of features: {len(feature_names)}')

df = pd.DataFrame(X.toarray(), columns=feature_names)
df

Number of features: 22


Unnamed: 0,and,and this,document,document is,first,first document,is,is the,is this,one,...,the,the first,the second,the third,third,third one,this,this document,this is,this the
0,0.0,0.0,0.314532,0.0,0.38851,0.38851,0.257151,0.314532,0.0,0.0,...,0.257151,0.38851,0.0,0.0,0.0,0.0,0.257151,0.0,0.38851,0.0
1,0.0,0.0,0.455513,0.356824,0.0,0.0,0.186206,0.227756,0.0,0.0,...,0.186206,0.0,0.356824,0.0,0.0,0.0,0.186206,0.356824,0.0,0.0
2,0.357007,0.357007,0.0,0.0,0.0,0.0,0.186301,0.227873,0.0,0.357007,...,0.186301,0.0,0.0,0.357007,0.357007,0.357007,0.186301,0.0,0.281469,0.0
3,0.0,0.0,0.28294,0.0,0.349487,0.349487,0.231322,0.0,0.443279,0.0,...,0.231322,0.349487,0.0,0.0,0.0,0.0,0.231322,0.0,0.0,0.443279


Term frequency (TF)

Document #1: this: 1, is: 1, the: 1, first: 1, document: 1

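Inverse document frequency (IDF), without smoothing: $\text{idf}(t) = \ln\frac{n}{\text{df}(t)} + 1$, where $n$ is the number of documents and $\text{df}(t)$ is the number of documents that contain term $t$.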

n = 4

| term     | df(t) |
|:---------|:------|
| this     | 4     |
| is       | 4     |
| the      | 4     |
| first    | 2     |
| document | 3     |




In [None]:
import numpy as np

# Number of documents in the toy corpus.
num_docs = 4
# NOTE: despite its name, `idf_stats` holds the *document frequency* df(t) of
# each term (how many of the documents contain it), not the idf value itself.
idf_stats = {'this': 4, 'is': 4, 'the': 4, 'first': 2, 'document': 3}
# Term frequencies of document #1 ('This is the first document').
tf_stats = {'this': 1, 'is': 1, 'the': 1, 'first': 1, 'document': 1}


def compute_idf(doc_freq, n_docs, smooth=False):
    """Return the inverse document frequency of a term.

    Args:
        doc_freq: Number of documents that contain the term, df(t).
        n_docs: Total number of documents, n.
        smooth: If True, add one to both n and df(t) before taking the ratio.

    Returns:
        ln(n / df(t)) + 1, or ln((n + 1) / (df(t) + 1)) + 1 when `smooth`
        is True (natural logarithm in both cases).
    """
    if smooth:
        return np.log((n_docs + 1) / (doc_freq + 1)) + 1
    return np.log(n_docs / doc_freq) + 1


# The values printed below are raw tf * idf products; no row normalisation
# (such as norm='l2') is applied here.
print ("Without smoothing: ")
for key in tf_stats:
    idf = compute_idf(idf_stats[key], num_docs)
    tf = tf_stats[key]
    print(key, idf*tf)

print ("\n\nWith smoothing: ")
for key in tf_stats:
    idf = compute_idf(idf_stats[key], num_docs, smooth=True)
    tf = tf_stats[key]
    print(key, idf*tf)

Without smoothing: 
this 1.0
is 1.0
the 1.0
first 1.6931471805599454
document 1.2876820724517808


With smoothing: 
this 1.0
is 1.0
the 1.0
first 1.5108256237659907
document 1.2231435513142097


In [None]:
# IPython help shortcut: show the docstring of TfidfVectorizer.
?TfidfVectorizer

# Load text data from a CSV (tab-separated) file

In [None]:
# Download the SMS Spam Collection zip archive from archive.ics.uci.edu into the working directory.
!wget https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip

--2025-03-08 11:10:25--  https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘sms+spam+collection.zip’

sms+spam+collection     [ <=>                ] 198.65K  --.-KB/s    in 0.1s    

2025-03-08 11:10:25 (1.82 MB/s) - ‘sms+spam+collection.zip’ saved [203415]



In [None]:
!unzip sms+spam+collection.zip

Archive:  sms+spam+collection.zip
  inflating: SMSSpamCollection       
  inflating: readme                  


In [None]:
import csv

# The file is tab-separated and has no header row, so the two columns are
# named explicitly here.
# quoting=csv.QUOTE_NONE: message texts can contain literal double-quote
# characters; with the default quoting pandas interprets them as field
# quotes and may silently merge several messages into a single row.
sms_data = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'text'],
    quoting=csv.QUOTE_NONE)
sms_data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
