In [1]:
from sklearn.datasets import fetch_20newsgroups
from pprint import pprint
import pandas as pd

In [2]:
dataset = fetch_20newsgroups(subset='all', shuffle=False, random_state=32, remove=('headers', 'footers', 'quotes'))
dataset.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
## Get data  from dictionary and list of target names
#dataset['data']
pprint(dataset.target_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [6]:
len(dataset.target_names)

20

In [7]:

dataset.target.shape

(18846,)

In [8]:
df = pd.DataFrame({'News':dataset.data, 'Label': dataset.target})
df.head()

Unnamed: 0,News,Label
0,\nmorgan and guzman will have era's 1 run high...,9
1,"Well, I just got my Centris 610 yesterday. It...",4
2,Archive-name: cryptography-faq/part10\nLast-mo...,11
3,To the best of my knowledge there aren't any p...,4
4,\n\nI think that domestication will change beh...,0


In [9]:
#Assign Label names based on target names in dataset
df['Label_name'] = df['Label'].apply(lambda x: dataset.target_names[x])
df.head()

Unnamed: 0,News,Label,Label_name
0,\nmorgan and guzman will have era's 1 run high...,9,rec.sport.baseball
1,"Well, I just got my Centris 610 yesterday. It...",4,comp.sys.mac.hardware
2,Archive-name: cryptography-faq/part10\nLast-mo...,11,sci.crypt
3,To the best of my knowledge there aren't any p...,4,comp.sys.mac.hardware
4,\n\nI think that domestication will change beh...,0,alt.atheism


#### Data preprocessing

In [15]:
#pip install -U gensim
#import required libraries to ease data cleaning
from gensim.utils import tokenize
from gensim.parsing.preprocessing import preprocess_string,strip_tags, strip_punctuation, strip_numeric, remove_stopwords, strip_short
from gensim.corpora.dictionary import Dictionary
from gensim import models

In [14]:
#help(preprocess_string) # to understand what cleaning dunctions are available
df['Clean_news'] = df['News'].apply(preprocess_string)
df.head()

Unnamed: 0,News,Label,Label_name,Clean_news
0,\nmorgan and guzman will have era's 1 run high...,9,rec.sport.baseball,"[morgan, guzman, era, run, higher, year, cub, ..."
1,"Well, I just got my Centris 610 yesterday. It...",4,comp.sys.mac.hardware,"[got, centri, yesterdai, took, week, place, or..."
2,Archive-name: cryptography-faq/part10\nLast-mo...,11,sci.crypt,"[archiv, cryptographi, faq, modifi, faq, sci, ..."
3,To the best of my knowledge there aren't any p...,4,comp.sys.mac.hardware,"[best, knowledg, aren, problem, quadra, blind,..."
4,\n\nI think that domestication will change beh...,0,alt.atheism,"[think, domest, chang, behavior, larg, degre, ..."


In [16]:
filters = [lambda x:x.lower(), strip_punctuation, strip_tags, strip_numeric, remove_stopwords, strip_short]
df['Clean_news1'] = df['News'].apply(lambda x: preprocess_string(x, filters))
df.head()

Unnamed: 0,News,Label,Label_name,Clean_news,Clean_news1
0,\nmorgan and guzman will have era's 1 run high...,9,rec.sport.baseball,"[morgan, guzman, era, run, higher, year, cub, ...","[morgan, guzman, era, run, higher, year, cubs,..."
1,"Well, I just got my Centris 610 yesterday. It...",4,comp.sys.mac.hardware,"[got, centri, yesterdai, took, week, place, or...","[got, centris, yesterday, took, weeks, placing..."
2,Archive-name: cryptography-faq/part10\nLast-mo...,11,sci.crypt,"[archiv, cryptographi, faq, modifi, faq, sci, ...","[archive, cryptography, faq, modified, faq, sc..."
3,To the best of my knowledge there aren't any p...,4,comp.sys.mac.hardware,"[best, knowledg, aren, problem, quadra, blind,...","[best, knowledge, aren, problems, quadras, bli..."
4,\n\nI think that domestication will change beh...,0,alt.atheism,"[think, domest, chang, behavior, larg, degre, ...","[think, domestication, change, behavior, large..."


In [17]:
dataset_dictionary = Dictionary(df['Clean_news1'])
len(dataset_dictionary)

99291

In [18]:
print(dataset_dictionary.token2id)



In [20]:
dataset_corpus_bow = [dataset_dictionary.doc2bow(text) for text in df['Clean_news1']]
len(dataset_corpus_bow)


18846

In [21]:
print(dataset_corpus_bow[1])

[(17, 1), (18, 1), (19, 2), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 2), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 2), (39, 3), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 2), (47, 1), (48, 2), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 2), (61, 1), (62, 1), (63, 2), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 2), (70, 1), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 2), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1)]
