### Import Libraries

In [1]:
import pandas as pd
import string
import re
import nltk
import numpy as np
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import _stop_words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


### Let's look at the dataset

In [9]:
# Mount Google Drive into the runtime at /content/drive so files stored there can be read below.
# NOTE(review): `google.colab` is only importable on Google Colab, so this cell ties the
# notebook to that environment; it is also an import outside the top import cell.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Read the SMS spam collection CSV from the mounted Drive folder into a DataFrame
# and preview its first rows.
# NOTE(review): the absolute Drive path is specific to one account's folder layout.
spam = pd.read_csv("/content/drive/MyDrive/IS723_2024S/SMSSpamCollection.csv")
spam.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# TF-IDF using Python's scikit-learn package

### Define the text-cleaning function

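Create a function that lowercases the text, strips punctuation and digits, and removes stop words. (Tokenization into n-grams is done afterwards by `TfidfVectorizer`; no lemmatization is applied.)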

In [11]:
# scikit-learn's built-in English stop-word list, used by clean() below
stopwords = _stop_words.ENGLISH_STOP_WORDS
def clean(doc):
    """Normalise one raw document before the vectorizer tokenises it.

    Steps, in order:
      1. Replace HTML line-break tags with a space.
      2. Delete every character in ``string.punctuation`` and every digit,
         lower-casing the characters that are kept.
      3. Split on whitespace and drop tokens found in ``stopwords``.

    Note that punctuation is deleted rather than replaced by a space, so an
    apostrophe simply disappears ("don't" becomes "dont").

    Args:
        doc: A string of text.

    Returns:
        The cleaned text as a single string of lower-case tokens separated
        by single spaces.
    """
    # Match <br>, <br/>, <br /> and the malformed </br> in any letter case.
    # Replacing only the literal "</br>" missed the real tag spellings, which
    # the punctuation strip below would then fuse into the neighbouring words
    # (e.g. "hello<br/>world" -> "hellobrworld").
    doc = re.sub(r"<\s*/?\s*br\s*/?\s*>", " ", doc, flags=re.IGNORECASE)
    # remove punctuation and numbers, lower-casing what remains
    doc = "".join(char.lower() for char in doc if char not in string.punctuation and not char.isdigit())
    # drop stop words; split()/join also collapses runs of whitespace
    doc = " ".join(token for token in doc.split() if token not in stopwords)
    return doc

Apply TF-IDF Vectorizer

1) Applying TF-IDF on unigrams using the parameter `ngram_range = (1, 1)` in TfidfVectorizer. **ngram_range = (minimal, maximal)** sets the smallest and largest n-gram size to extract.

In [13]:
# instantiate the TfidfVectorizer object, with our cleaning function passed as the
# `preprocessor` parameter (it is called on each raw document before tokenisation)
# ngram_range = (minimal, maximal); (1, 1) extracts unigrams only
tfidf_vec1 = TfidfVectorizer(preprocessor=clean,ngram_range = (1, 1))
# run fit_transform on the raw text column (clean() is applied inside the vectorizer)
# and store the vectorized data in X_Tfidf1
X_Tfidf1 = tfidf_vec1.fit_transform(spam['text'])

[('ok', 99.84362669616789),
 ('im', 99.77911331651866),
 ('ill', 74.08726454955367),
 ('just', 72.39437611370266),
 ('come', 66.34490049664204),
 ('ur', 66.29319498776388),
 ('dont', 61.780122587757305),
 ('ltgt', 61.150163230058155),
 ('know', 57.587047707708436),
 ('good', 56.82959421847413)]

In [28]:
# current text representation: one row per entry of spam['text'], one column per
# unigram in the fitted vocabulary, each cell holding that term's TF-IDF weight
# NOTE: toarray() builds a dense array of the whole matrix; presumably fine for this
# corpus, but it can be memory-hungry on larger ones.
df1 = pd.DataFrame(X_Tfidf1.toarray(), columns=tfidf_vec1.get_feature_names_out())
df1.head()

Unnamed: 0,aa,aah,aaniye,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,...,ís,íscool,ít,íve,îa,îbraindance,îleafcutter,îsounds,îæ,úll
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Sum each column of df1 to get every unigram's total TF-IDF weight over all rows,
# then show the 10 unigrams with the highest totals as (term, total) pairs.
tfidfwords_values = dict(df1.sum(axis = 0))
sorted(tfidfwords_values.items(), key=lambda x: x[1], reverse=True)[:10]

[('ok', 99.84362669616789),
 ('im', 99.77911331651866),
 ('ill', 74.08726454955367),
 ('just', 72.39437611370266),
 ('come', 66.34490049664204),
 ('ur', 66.29319498776388),
 ('dont', 61.780122587757305),
 ('ltgt', 61.150163230058155),
 ('know', 57.587047707708436),
 ('good', 56.82959421847413)]

2) Applying TF-IDF on bigrams using parameter - ngram_range = (2, 2) in TfidfVectorizer

In [30]:
# Bigram vectorizer: same cleaning function as `preprocessor`, but ngram_range = (2, 2)
# so only two-word sequences become features.
tfidf_vec2 = TfidfVectorizer(preprocessor=clean,ngram_range = (2, 2))
# Fit on the raw text column and keep the resulting TF-IDF matrix in X_Tfidf2.
X_Tfidf2 = tfidf_vec2.fit_transform(spam['text'])

In [31]:
# current text representation: one row per entry of spam['text'], one column per
# bigram in the fitted vocabulary, each cell holding that bigram's TF-IDF weight
# NOTE: toarray() builds a dense array of the whole matrix; presumably fine for this
# corpus, but it can be memory-hungry on larger ones.
df2 = pd.DataFrame(X_Tfidf2.toarray(), columns=tfidf_vec2.get_feature_names_out())
df2.head()

Unnamed: 0,aa exhaust,aah bless,aah cuddle,aah speak,aaniye pudunga,aaooooright work,aathilove lot,aathiwhere dear,ab sara,abbey happy,...,ít worry,ít worryc,íve spent,íve staring,îa compofstuff,îbraindance îa,îleafcutter john,îsounds like,îæ ud,úll forever
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Sum each column of df2 to get every bigram's total TF-IDF weight over all rows,
# then show the 10 bigrams with the highest totals as (bigram, total) pairs.
# Note: this rebinds tfidfwords_values, replacing whatever it held before.
tfidfwords_values = dict(df2.sum(axis = 0))
sorted(tfidfwords_values.items(), key=lambda x: x[1], reverse=True)[:10]

[('ill later', 26.867738727031625),
 ('sorry ill', 25.19325257135023),
 ('ok lor', 11.690698202527116),
 ('dont know', 9.918618007843966),
 ('let know', 9.842409414413943),
 ('im home', 9.466475380519402),
 ('good morning', 9.21958337276694),
 ('pls send', 8.284306196324547),
 ('im going', 7.742023230190854),
 ('wat doing', 7.182696311574098)]

3) Applying TF-IDF on trigrams using parameter - ngram_range = (3, 3) in TfidfVectorizer

In [33]:
# Trigram vectorizer: same cleaning function as `preprocessor`, but ngram_range = (3, 3)
# so only three-word sequences become features.
tfidf_vec3 = TfidfVectorizer(preprocessor=clean,ngram_range = (3, 3))
# Fit on the raw text column and keep the resulting TF-IDF matrix in X_Tfidf3.
X_Tfidf3 = tfidf_vec3.fit_transform(spam['text'])

In [34]:
# current text representation: one row per entry of spam['text'], one column per
# trigram in the fitted vocabulary, each cell holding that trigram's TF-IDF weight
# NOTE: toarray() builds a dense array of the whole matrix; presumably fine for this
# corpus, but it can be memory-hungry on larger ones.
df3 = pd.DataFrame(X_Tfidf3.toarray(), columns=tfidf_vec3.get_feature_names_out())
df3.head()

Unnamed: 0,aa exhaust hanging,aah bless hows,aah cuddle lush,aah speak tomo,aaniye pudunga venaam,ab sara jorgeshock,abbey happy new,abdomen gynae infections,abeg make profit,abelu havhear itc,...,ít wanna crazy,ít worryc ltr,íve spent geva,íve staring phone,îa compofstuff aphex,îbraindance îa compofstuff,îleafcutter john îsounds,îsounds like insects,îæ ud evening,úll forever come
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
# Sum each column of df3 to get every trigram's total TF-IDF weight over all rows,
# then show the 10 trigrams with the highest totals as (trigram, total) pairs.
# Note: this rebinds tfidfwords_values, replacing whatever it held before.
tfidfwords_values = dict(df3.sum(axis = 0))
sorted(tfidfwords_values.items(), key=lambda x: x[1], reverse=True)[:10]

[('sorry ill later', 34.026535629762634),
 ('happy new year', 6.448235500111434),
 ('pls send message', 6.182997106873429),
 ('phone right pls', 6.015722634981234),
 ('pick phone right', 6.015722634981234),
 ('right pls send', 6.015722634981234),
 ('private account statement', 5.9957195979828555),
 ('identifier code expires', 4.7805534702412045),
 ('account statement shows', 4.507412420493767),
 ('statement shows unredeemed', 4.507412420493767)]

Reference:

https://www.askpython.com/python/examples/tf-idf-model-from-scratch
