In [21]:
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# load data
data = fetch_20newsgroups(subset='train')
df = pd.DataFrame(data.data, columns=['text'])
df.head()

Unnamed: 0,text
0,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...


In [23]:
# remove punctuation and numbers
df['text'] = df['text'].str.replace('[^\w\s]','').str.replace('\d+', '')

In [24]:
# set up Tf-idf transformer
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words='english',
                             ngram_range=(1, 1),
                             min_df=0.05)

In [25]:
# learn words to be retained and their frequency

vectorizer.fit(df['text'])

In [26]:
X = vectorizer.transform(df['text'])

In [27]:
# create tf-idf dataframe

tfidf = pd.DataFrame(X.toarray(),
                          columns = vectorizer.get_feature_names_out())

tfidf.head()

Unnamed: 0,10,11,12,13,14,15,16,17,18,19,...,won,work,works,world,writes,wrong,wrote,year,years,yes
0,0.0,0.0,0.0,0.0,0.0,0.140625,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.134621,0.0
1,0.0,0.327538,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.18588,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.147961,0.089465,0.0,0.194042,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.338842,0.10244,0.0,0.0,0.0,0.0,0.224959


In [28]:
tfidf.shape

(11314, 227)

In [29]:
# with n grams
vectorizer = TfidfVectorizer(lowercase=True,
                             stop_words='english',
                             ngram_range=(1, 2),
                             min_df=0.1)

In [30]:
vectorizer.fit(df['text'])

In [31]:
X = vectorizer.transform(df['text'])

In [32]:
tfidf = pd.DataFrame(X.toarray(),
                          columns = vectorizer.get_feature_names_out())

tfidf.head()

Unnamed: 0,10,article,believe,better,ca,case,com,computer,cs,did,...,used,using,ve,want,way,work,world,writes,writes article,years
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.329186
1,0.0,0.224645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.189469,0.0,0.0,0.0,0.352761,0.0,0.0,...,0.0,0.0,0.170267,0.0,0.159581,0.0,0.0,0.0,0.0,0.0
3,0.0,0.142824,0.0,0.0,0.0,0.0,0.301176,0.485643,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.224518,0.135754,0.265315,0.0
4,0.0,0.256197,0.0,0.0,0.0,0.0,0.270123,0.0,0.221776,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.402738,0.121757,0.0,0.0


In [33]:
vectorizer.get_feature_names_out()

array(['10', 'article', 'believe', 'better', 'ca', 'case', 'com',
       'computer', 'cs', 'did', 'distribution', 'does', 'doesn', 'don',
       'edu', 'going', 'good', 'got', 'help', 'host', 'just', 'know',
       'let', 'like', 'lines', 'll', 'long', 'mail', 'make', 'need',
       'new', 'news', 'nntp', 'nntp posting', 'organization',
       'organization university', 'people', 'point', 'posting',
       'posting host', 'problem', 'question', 'read', 'really', 'reply',
       'right', 'said', 'say', 'state', 'subject', 'sure', 'thanks',
       'thing', 'things', 'think', 'time', 'university', 'usa', 'use',
       'used', 'using', 've', 'want', 'way', 'work', 'world', 'writes',
       'writes article', 'years'], dtype=object)