In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
from nltk.corpus import stopwords

In [4]:
# Ensure the NLTK stopword corpus is present locally (the recorded log shows
# it reports "already up-to-date" when cached), then load the English list.
nltk.download('stopwords')
stop_words = stopwords.words('english')
# Bare expression: echoes the whole list as the cell output.
stop_words

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/phuongdang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [5]:
# Load the 20 Newsgroups corpus, train and test splits combined (subset='all').
# No `remove=` filter is passed, so each document still carries its
# From:/Subject:/Organization: header lines, as the printed sample shows.
newsgroups = fetch_20newsgroups(subset='all')

In [8]:
# Show the raw post texts; the returned Bunch exposes its keys as attributes.
newsgroups.data

["From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>\nSubject: Pens fans reactions\nOrganization: Post Office, Carnegie Mellon, Pittsburgh, PA\nLines: 12\nNNTP-Posting-Host: po4.andrew.cmu.edu\n\n\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n",
 'From: mblawson@midway.ecn.uoknor.edu (Matthew B Lawson)\nSubject: Which h

In [9]:
def preprocess_text(text, exclude=None):
    """Lowercase ``text`` and drop stopwords, returning a space-joined string.

    Tokens are produced by ``str.split()``, i.e. split on runs of whitespace
    only, so punctuation stays attached to its word ("devils." is a different
    token from "devils") and such tokens are never matched against the
    stopword collection.

    Parameters
    ----------
    text : str
        Raw document text.
    exclude : iterable of str, optional
        Lowercase tokens to remove. When omitted, the notebook-level
        ``stop_words`` collection is used, which matches the original
        single-argument behaviour.

    Returns
    -------
    str
        The surviving lowercase tokens joined by single spaces; an empty
        string when nothing survives.
    """
    # Build a set once per call: the original tested every word against a
    # list, which is a linear scan per token.
    excluded = frozenset(stop_words if exclude is None else exclude)
    # Lowercase each token exactly once instead of twice per word.
    lowered = (word.lower() for word in text.split())
    return ' '.join(word for word in lowered if word not in excluded)

In [11]:
# Run the stopword filter over every post, keeping the corpus order, then
# echo the resulting list as the cell output.
news_data = [preprocess_text(post) for post in newsgroups['data']]
news_data

["from: mamatha devineni ratnam <mr47+@andrew.cmu.edu> subject: pens fans reactions organization: post office, carnegie mellon, pittsburgh, pa lines: 12 nntp-posting-host: po4.andrew.cmu.edu sure bashers pens fans pretty confused lack kind posts recent pens massacre devils. actually, bit puzzled bit relieved. however, going put end non-pittsburghers' relief bit praise pens. man, killing devils worse thought. jagr showed much better regular season stats. also lot fo fun watch playoffs. bowman let jagr lot fun next couple games since pens going beat pulp jersey anyway. disappointed see islanders lose final regular season game. pens rule!!!",
 'from: mblawson@midway.ecn.uoknor.edu (matthew b lawson) subject: high-performance vlb video card? summary: seek recommendations vlb video card nntp-posting-host: midway.ecn.uoknor.edu organization: engineering computer network, university oklahoma, norman, ok, usa keywords: orchid, stealth, vlb lines: 21 brother market high-performance video card s

In [12]:
# TF-IDF over unigrams and bigrams, capped at 1000 features.
# max_df=0.01 is a proportion: terms present in more than 1% of documents
# are discarded.
# NOTE(review): the vocabulary printed by the following cell is dominated by
# tokens such as '0d 0d' and '75u 75u' -- confirm that max_df=0.01 (rather
# than min_df, or a larger cap) is what was intended.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_df=0.01,
    max_features=1000,
)
term_doc_matrix = vectorizer.fit_transform(news_data)

In [13]:
# Vocabulary learned by the fitted vectorizer (the recorded output shows a
# NumPy array of unigram and bigram strings).
terms = vectorizer.get_feature_names_out()
# Bare expression: echoes the vocabulary as the cell output.
terms

array(['00 00', '06', '0d', '0d 0d', '0t', '0t 0t', '120', '128', '129',
       '130', '145', '145 145', '1920', '1983', '1989', '1993apr14',
       '1993apr5', '1d9', '1d9 1d9', '1t', '24 bit', '256', '286', '2di',
       '2di 2di', '2tm', '32 bit', '34u', '3t', '415', '5u', '600', '617',
       '62', '63', '67', '68', '69', '6ei', '6um', '73', '74', '75u',
       '75u 75u', '76', '77', '78', '79', '7ey', '7u', '84', '87', '94',
       '96', '9v', '__ __', '___ ___', '____', '_____', '_o', 'a86',
       'a86 a86', 'aaron', 'absolute', 'abuse', 'academic', 'acceptable',
       'adapter', 'addresses', 'adl', 'adobe', 'agencies', 'agreed',
       'agreement', 'ah', 'aid', 'aids', 'alaska', 'alaska edu', 'ames',
       'amiga', 'amp', 'anderson', 'animals', 'ann', 'announced',
       'annual', 'anonymous ftp', 'ap', 'apart', 'apartment', 'apollo',
       'app', 'appeared', 'arabs', 'arc', 'argic', 'arizona edu', 'arm',
       'armenian', 'armenians', 'article may', 'assault', 'assumption'

In [14]:
# Number of latent dimensions kept when reducing the TF-IDF matrix.
num_components = 100

# TruncatedSVD uses a randomized solver by default, so without a fixed seed
# every run yields a slightly different projection. Pin random_state so that
# Restart & Run All reproduces the same svd_matrix.
svd_model = TruncatedSVD(n_components=num_components, random_state=42)
svd_matrix = svd_model.fit_transform(term_doc_matrix)
