In [23]:
import pandas as pd

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

In [24]:
# load data
data = fetch_20newsgroups(subset='train')
df = pd.DataFrame(data.data, columns=['text'])
df.head()

Unnamed: 0,text
0,From: lerxst@wam.umd.edu (where's my thing)\nS...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...


In [25]:
# remove punctuation and numbers

df['text'] = df['text'].str.replace('[^\w\s]','').str.replace('\d+', '')

In [26]:
# set up a bag of words transformer

vectorizer = CountVectorizer(lowercase=True,
                             stop_words='english',
                             ngram_range=(1, 1),
                             min_df=0.05)

In [27]:
# transformer finds the words to be retained

vectorizer.fit(df['text'])

In [28]:
X = vectorizer.transform(df['text'])

In [32]:
# create bago of words dataframe

bagofwords = pd.DataFrame(X.toarray(),
                          columns = vectorizer.get_feature_names_out())
pd.set_option('display.max_columns', None)
bagofwords.head()

Unnamed: 0,10,11,12,13,14,15,16,17,18,19,1993,20,21,22,23,24,25,30,50,93,able,ac,access,actually,ago,apr,article,ask,available,away,bad,based,believe,best,better,big,bit,buy,ca,called,car,card,case,cc,center,com,come,computer,control,course,cs,data,david,day,department,dept,did,didn,different,distribution,does,doesn,doing,don,drive,edu,email,end,example,fact,far,fax,following,free,general,getting,given,god,going,good,got,gov,government,great,group,hand,hard,having,heard,help,high,home,hope,host,idea,info,information,institute,interested,internet,isn,john,just,keywords,kind,know,large,law,left,let,life,like,line,lines,list,little,ll,local,long,look,looking,lot,mail,make,makes,man,maybe,mean,means,message,na,national,need,net,new,news,newsreader,nntp,non,note,number,oh,old,opinions,order,org,organization,people,person,phone,place,point,possible,post,posting,power,pretty,probably,problem,problems,program,public,question,questions,quite,read,real,really,reason,remember,reply,research,right,run,said,say,says,science,second,seen,send,set,small,software,space,start,state,stuff,subject,support,sure,systems,technology,tell,thanks,thing,things,think,thought,time,times,today,true,try,trying,uk,university,usa,use,used,using,uucp,ve,version,want,way,windows,won,work,works,world,writes,wrong,wrote,year,years,yes
0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,5,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,2,0,0,0,0,0,1,2,0,0,2,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,3,0,0,0,0,0,0,2,0,0,0,0,0,0,2,0,0,1,0,0,0,0,1,1,1,1,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,2,0,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,2,0,0,1,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,2,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,1


In [31]:
bagofwords.shape

(11314, 227)

In [33]:
# with n grams
vectorizer = CountVectorizer(lowercase=True,
                             stop_words='english',
                             ngram_range=(1, 2),
                             min_df=0.1)

In [34]:
vectorizer.fit(df['text'])

In [35]:
X = vectorizer.transform(df['text'])

In [36]:
bagofwords = pd.DataFrame(X.toarray(),
                          columns = vectorizer.get_feature_names_out())

bagofwords.head()

Unnamed: 0,10,article,believe,better,ca,case,com,computer,cs,did,distribution,does,doesn,don,edu,going,good,got,help,host,just,know,let,like,lines,ll,long,mail,make,need,new,news,nntp,nntp posting,organization,organization university,people,point,posting,posting host,problem,question,read,really,reply,right,said,say,state,subject,sure,thanks,thing,things,think,time,university,usa,use,used,using,ve,want,way,work,world,writes,writes article,years
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,2,0,0,1,2,0,2,2,0,1,1,0,0,2,1,0,1,1,1,0,0,1,0,1,1,0,0,1,0,1,0,0,0,0,1,0,2,0,0,0,0,0,1,0,1,0,0,0,1,1,1,1,0,0,1,0,1,0,0,0,0,0
3,0,1,0,0,0,0,2,2,0,0,1,0,0,0,2,0,0,1,0,1,1,1,0,1,1,0,0,0,0,0,0,0,1,1,1,0,0,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0
4,0,2,0,0,0,0,2,0,1,0,1,0,0,1,3,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,2,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,2,1,0,0


In [37]:
vectorizer.get_feature_names_out()

array(['10', 'article', 'believe', 'better', 'ca', 'case', 'com',
       'computer', 'cs', 'did', 'distribution', 'does', 'doesn', 'don',
       'edu', 'going', 'good', 'got', 'help', 'host', 'just', 'know',
       'let', 'like', 'lines', 'll', 'long', 'mail', 'make', 'need',
       'new', 'news', 'nntp', 'nntp posting', 'organization',
       'organization university', 'people', 'point', 'posting',
       'posting host', 'problem', 'question', 'read', 'really', 'reply',
       'right', 'said', 'say', 'state', 'subject', 'sure', 'thanks',
       'thing', 'things', 'think', 'time', 'university', 'usa', 'use',
       'used', 'using', 've', 'want', 'way', 'work', 'world', 'writes',
       'writes article', 'years'], dtype=object)