In [None]:
from sklearn.datasets import fetch_20newsgroups

cats = ['alt.atheism', 'sci.space', 'rec.sport.hockey', 'rec.autos', 'talk.politics.guns']
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)


array([1, 3, 3, 1, 0, 0, 4, 4, 3, 2])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
vectorizer = TfidfVectorizer(max_df = 0.5, min_df = 20, stop_words = 'english')

In [None]:
X_tfidf = vectorizer.fit_transform(newsgroups_train.data)

In [None]:
X_tfidf.shape

(2813, 3209)

In [None]:
from sklearn.cluster import KMeans

In [None]:
kmeans = KMeans(n_clusters = 5, max_iter = 100, n_init = 123)

In [None]:
kmeans.fit(X_tfidf)

KMeans(max_iter=100, n_clusters=5, n_init=123)

In [None]:
kmeans.labels_

array([4, 4, 0, ..., 1, 2, 2], dtype=int32)

In [None]:
newsgroups_train.target

array([1, 3, 3, ..., 4, 2, 2])

In [None]:
import pandas as pd
data = pd.DataFrame([newsgroups_train.data, newsgroups_train.target]).T

In [None]:
data.columns = ['news', 'target']

In [None]:
data['pred'] = kmeans.labels_ 

In [None]:
data

Unnamed: 0,news,target,pred
0,From: mchaffee@dcl-nxt07 (Michael T Chaffee)\n...,1,4
1,From: dietz@cs.rochester.edu (Paul Dietz)\nSub...,3,4
2,From: hathaway@stsci.edu\nSubject: Re: Vandali...,3,0
3,From: keys@starchild.ncsl.nist.gov (Lawrence B...,1,4
4,From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSu...,0,4
...,...,...,...
2808,From: raman@translab.its.uci.edu (Balaji V. Ra...,1,4
2809,From: jbrown@batman.bmd.trw.com\nSubject: Re: ...,0,4
2810,From: cdt@sw.stratus.com (C. D. Tavares)\nSubj...,4,1
2811,From: Young-Soo Che <yc25+@andrew.cmu.edu>\nSu...,2,2


In [None]:
cats = ['alt.atheism', 'sci.space', 'rec.sport.hockey', 'rec.autos', 'talk.politics.guns']


In [None]:
data['target'].replace({0:'atheism',1 : 'auto',2:'hockey', 3: 'space',4:'guns'}, inplace = True)

In [None]:
data['target']

0          auto
1         space
2         space
3          auto
4       atheism
         ...   
2808       auto
2809    atheism
2810       guns
2811     hockey
2812     hockey
Name: target, Length: 2813, dtype: object

In [None]:
from collections import Counter
for cluster_no in range(0,5):
  
  content = data[data['pred']==cluster_no].news.values
  print(cluster_no, "&&&&&&&&&&&&******************&&&&&&&&&&&&&&&&&&&&&&")
  vectorizer = CountVectorizer(max_df = 0.5, min_df = 20, stop_words = 'english')
  tfidf = vectorizer.fit_transform(content)
  print(vectorizer.get_feature_names())
  df = pd.DataFrame(tfidf.toarray(),columns = vectorizer.get_feature_names())
  print(df.sum().sort_values(ascending = False).index[:25])

0 &&&&&&&&&&&&******************&&&&&&&&&&&&&&&&&&&&&&
['000', '04', '10', '100', '11', '12', '13', '14', '15', '16', '17', '18', '19', '1993', '1993apr20', '1993apr21', '20', '21', '22', '23', '24', '25', '26', '28', '30', '300', '32', '3539', '3684', '40', '41', '50', '525', '575', '60', '91109', '93', '__', '___', '____', '_____', 'able', 'acad3', 'access', 'activities', 'actually', 'adams', 'add', 'added', 'address', 'aerospace', 'age', 'agency', 'ago', 'air', 'alaska', 'allen', 'allow', 'altitude', 'american', 'ames', 'answer', 'apollo', 'apr', 'april', 'area', 'areas', 'asked', 'assume', 'astro', 'astronaut', 'astronomy', 'atmosphere', 'attempt', 'au', 'aurora', 'available', 'away', 'aws', 'baalke', 'base', 'based', 'basic', 'basically', 'beam', 'believe', 'better', 'big', 'billion', 'bit', 'bitnet', 'black', 'body', 'book', 'box', 'budget', 'build', 'building', 'built', 'ca', 'california', 'called', 'care', 'carry', 'case', 'center', 'change', 'cheaper', 'close', 'cmu', 'college

In [None]:
data['pred'].replace({3:'atheism',4 : 'auto',2:'hockey', 0: 'space',1:'guns'}, inplace = True)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
confusion_matrix(data['target'],data['pred'])

array([[115, 360,   4,   0,   1],
       [  2, 586,   2,   2,   2],
       [  2, 100, 439,   0,   5],
       [  0,  52,   0, 547,   1],
       [  2, 140,   2,   4, 445]])

In [None]:
accuracy_score(data['target'],data['pred'])

0.7579097049413438

In [None]:
import gensim

In [None]:
import nltk
stp_words = nltk.corpus.stopwords.words('english')
def normalize_corpus(news):
  norm_news = []
  for news_ in news:
    news_ = news_.lower()
    news_tokens = [da.strip() for da in news_.split(' ')]
    news_tokens = [token for token in news_tokens if token not in stp_words]
    if news_tokens:
      norm_news.append(news_tokens)
  return norm_news
  

In [None]:
news___ = list(data['news'].values)
norm_news = normalize_corpus(news___)

In [None]:
norm_news

[['from:',
  'mchaffee@dcl-nxt07',
  '(michael',
  'chaffee)\nsubject:',
  're:',
  'chryslers',
  'compact',
  'lh',
  'sedans?\norganization:',
  'university',
  'illinois',
  'urbana\nlines:',
  '33\n\ncka52397@uxa.cso.uiuc.edu',
  '(carolinafan@uiuc)',
  'writes:\n\n>shoppa@almach.caltech.edu',
  '(tim',
  'shoppa)',
  'writes:\n\n>>i',
  'thought',
  'v-10',
  'originally',
  'designed',
  'truck',
  '(not',
  'necessarily\n>>a',
  'pickup!)',
  'sort',
  'dropped',
  "viper's",
  'frame',
  'because\n>>it',
  'fit',
  'available.',
  '',
  'friend',
  'mine',
  'saw',
  '(and',
  'heard)',
  'viper,\n>>and',
  "friend's",
  'first',
  'response',
  'sounded',
  'like',
  'truck!',
  '',
  'sounded\n>>fine',
  'me,',
  'again,',
  'like',
  'whiny',
  'noise',
  'modern\n>>sports',
  'car',
  'engines',
  'make.',
  '',
  'btw,',
  'viper',
  'saw',
  'moving',
  '10mph,\n>>just',
  'like',
  'cars',
  '10',
  'freeway',
  'heading',
  'east',
  'la\n>>on',
  'friday',
  'afternoo

In [None]:
ngram = gensim.models.Phrases(norm_news, min_count= 100)
ngram_model = gensim.models.phrases.Phraser(ngram)

In [None]:
ngram_model[norm_news[0]]

['from:',
 'mchaffee@dcl-nxt07',
 '(michael',
 'chaffee)\nsubject:',
 're:',
 'chryslers',
 'compact',
 'lh',
 'sedans?\norganization:',
 'university',
 'illinois',
 'urbana\nlines:',
 '33\n\ncka52397@uxa.cso.uiuc.edu',
 '(carolinafan@uiuc)',
 'writes:\n\n>shoppa@almach.caltech.edu',
 '(tim',
 'shoppa)',
 'writes:\n\n>>i',
 'thought',
 'v-10',
 'originally',
 'designed',
 'truck',
 '(not',
 'necessarily\n>>a',
 'pickup!)',
 'sort',
 'dropped',
 "viper's",
 'frame',
 'because\n>>it',
 'fit',
 'available.',
 'friend',
 'mine',
 'saw',
 '(and',
 'heard)',
 'viper,\n>>and',
 "friend's",
 'first',
 'response',
 'sounded',
 'like',
 'truck!',
 'sounded\n>>fine',
 'me,',
 'again,',
 'like',
 'whiny',
 'noise',
 'modern\n>>sports',
 'car',
 'engines',
 'make.',
 'btw,',
 'viper',
 'saw',
 'moving',
 '10mph,\n>>just',
 'like',
 'cars',
 '10',
 'freeway',
 'heading',
 'east',
 'la\n>>on',
 'friday',
 'afternoon.',
 'looked',
 'really',
 'nice,',
 'though.\n\n>\tactually,',
 'impression',
 'v-10'

In [None]:
norm_corpus_gram = [ngram_model[doc] for doc in norm_news]

In [None]:
dictionary = gensim.corpora.Dictionary(norm_corpus_gram)

In [None]:
len(dictionary)

121543

In [None]:
dictionary.filter_extremes(no_below = 20, no_above=0.6)

In [None]:
len(dictionary)

2766

In [None]:
bow_corpus = [dictionary.doc2bow(text) for text in norm_corpus_gram]

In [None]:
lda_model = gensim.models.LdaModel(corpus = bow_corpus, id2word = dictionary, num_topics=5)



In [1]:
lda_model.print_topics(5,20)

NameError: ignored

In [5]:
from __future__ import print_function
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

  from collections import Iterable
  from collections import Mapping


In [3]:
!pip install pyLDAvis

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 4.1 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Collecting sklearn
  Downloading sklearn-0.0.post1.tar.gz (3.6 kB)
Building wheels for collected packages: pyLDAvis, sklearn
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=f90395b9a98693289413345610b4a6d1748989b15458cf2b4341c91515ef5065
  Stored in directory: /root/.cache/pip/wheels/90/61/ec/9dbe9efc3acf9c4e37ba70fbbcc3f3a0ebd121060aa593181a
  Building wheel for sklearn (

In [6]:
newsgroups = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))
docs_raw = newsgroups.data
print(len(docs_raw))


11314


In [7]:
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5, 
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)



In [8]:
# for TF DTM
lda_tf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tf.fit(dtm_tf)
# for TFIDF DTM
lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tfidf.fit(dtm_tfidf)

LatentDirichletAllocation(n_components=20, random_state=0)

In [9]:
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)


  default_term_info = default_term_info.sort_values(
