In [6]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.text import TextCollection
import pandas as pd

# Fetch the NLTK resources this notebook depends on. Each call is a no-op
# (apart from a log line) when the package is already installed locally.
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load its sentence-splitting
# models from 'punkt_tab' instead of the pickled 'punkt' package; without it
# the tokenization cell raises LookupError on a fresh environment.
nltk.download('punkt_tab')
# Kept last so the cell's displayed value is still the result of this call.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/hisl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/hisl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
# Three-sentence toy corpus used by every example in this notebook.
corpus = [
    "TF-IDF is a technique for extracting features from text.",
    "It is commonly used in natural language processing tasks.",
    "NLTK provides tools for text processing and analysis."
]

# English stopwords as a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Tokenize each sentence, keeping only purely alphabetic tokens that are not
# stopwords, in lowercase. Punctuation and tokens containing non-letters
# (e.g. a hyphenated token) fail the isalpha() check and are dropped.
tokenized_corpus = []
for sentence in corpus:
    kept_tokens = []
    for token in word_tokenize(sentence):
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            kept_tokens.append(lowered)
    tokenized_corpus.append(kept_tokens)


In [8]:
# Wrap the tokenized documents in an NLTK TextCollection, which provides
# corpus-level statistics such as tf_idf(term, document).
text_collection = TextCollection(tokenized_corpus)

# One list per document, each holding (term, tf-idf) pairs in token order.
tfidf_scores = [
    [(token, text_collection.tf_idf(token, tokens)) for token in tokens]
    for tokens in tokenized_corpus
]

# Tabulate the scores of the first document only, highest score first.
first_doc_table = pd.DataFrame(tfidf_scores[0], columns=['Term', 'TF-IDF'])
ranked_table = first_doc_table.sort_values('TF-IDF', ascending=False)
df = ranked_table.reset_index(drop=True)
print(df)


         Term    TF-IDF
0   technique  0.274653
1  extracting  0.274653
2    features  0.274653
3        text  0.101366


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the raw sentences with
# scikit-learn's default settings, and vectorize those same sentences.
# The result is a sparse matrix with one row per sentence in `corpus`.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(raw_documents=corpus)


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all documents. With a single argument
# the matrix is compared against itself, so entry [i, j] is the similarity
# of sentence i and sentence j (the diagonal is each sentence with itself).
cosine_sim = cosine_similarity(tfidf_matrix)
print(cosine_sim)


[[1.         0.07281691 0.16026805]
 [0.07281691 1.         0.0780356 ]
 [0.16026805 0.0780356  1.        ]]


In [11]:
# Echo the raw (untokenized) sentences for reference.
corpus

['TF-IDF is a technique for extracting features from text.',
 'It is commonly used in natural language processing tasks.',
 'NLTK provides tools for text processing and analysis.']

## Create TF-IDF and N-Grams

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Unigram TF-IDF with the vocabulary capped at the 100 most frequent terms.
# The toy corpus has fewer distinct terms than that, so the cap drops nothing.
tfidf = TfidfVectorizer(max_features=100)

# fit_transform returns a sparse matrix; convert it to a dense array so the
# individual weights can be inspected directly.
sparse_scores = tfidf.fit_transform(corpus)
X = sparse_scores.toarray()

In [14]:
import numpy as np
# Make array output compact for the displays that follow: show up to 30 items
# at each edge before numpy abbreviates, use a line width large enough that
# rows are not wrapped, and render floats with three significant digits.
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={"float": lambda value: f"{value:.3g}"},
)

In [15]:
# Dense TF-IDF matrix: one row per sentence in `corpus`, one column per vocabulary term.
X

array([[0, 0, 0, 0.36, 0.36, 0.273, 0.36, 0.36, 0, 0.273, 0, 0, 0, 0, 0, 0, 0, 0.36, 0.273, 0.36, 0, 0],
       [0, 0, 0.35, 0, 0, 0, 0, 0, 0.35, 0.266, 0.35, 0.35, 0.35, 0, 0.266, 0, 0.35, 0, 0, 0, 0, 0.35],
       [0.385, 0.385, 0, 0, 0, 0.293, 0, 0, 0, 0, 0, 0, 0, 0.385, 0.293, 0.385, 0, 0, 0.293, 0, 0.385, 0]])

### N-Grams

In [16]:
# Same vectorizer as before, but ngram_range=(2, 2) builds the vocabulary
# from bigrams (pairs of adjacent tokens) only, instead of single words.
# NOTE: this rebinds `tfidf` and `X`, replacing the unigram results.
tfidf = TfidfVectorizer(max_features=100, ngram_range=(2, 2))

# Densify the sparse result so the individual weights can be inspected.
bigram_scores = tfidf.fit_transform(corpus)
X = bigram_scores.toarray()

In [17]:
# Mapping from each learned n-gram to its column index in the TF-IDF matrix.
tfidf.vocabulary_

{'tf idf': np.int64(20),
 'idf is': np.int64(7),
 'is technique': np.int64(10),
 'technique for': np.int64(18),
 'for extracting': np.int64(4),
 'extracting features': np.int64(2),
 'features from': np.int64(3),
 'from text': np.int64(6),
 'it is': np.int64(11),
 'is commonly': np.int64(9),
 'commonly used': np.int64(1),
 'used in': np.int64(22),
 'in natural': np.int64(8),
 'natural language': np.int64(13),
 'language processing': np.int64(12),
 'processing tasks': np.int64(16),
 'nltk provides': np.int64(14),
 'provides tools': np.int64(17),
 'tools for': np.int64(21),
 'for text': np.int64(5),
 'text processing': np.int64(19),
 'processing and': np.int64(15),
 'and analysis': np.int64(0)}

In [18]:
# Dense bigram TF-IDF matrix: one row per sentence; column j belongs to the
# n-gram whose value in `tfidf.vocabulary_` is j.
X

array([[0, 0, 0.354, 0.354, 0.354, 0, 0.354, 0.354, 0, 0, 0.354, 0, 0, 0, 0, 0, 0, 0, 0.354, 0, 0.354, 0, 0],
       [0, 0.354, 0, 0, 0, 0, 0, 0, 0.354, 0.354, 0, 0.354, 0.354, 0.354, 0, 0, 0.354, 0, 0, 0, 0, 0, 0.354],
       [0.378, 0, 0, 0, 0, 0.378, 0, 0, 0, 0, 0, 0, 0, 0, 0.378, 0.378, 0, 0.378, 0, 0.378, 0, 0.378, 0]])