In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
newsgroups_train = fetch_20newsgroups(subset='train')
print(list(newsgroups_train.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [10]:
len(newsgroups_train.target)

11314

In [11]:
newsgroups_test = fetch_20newsgroups(subset='test')

In [12]:
len(newsgroups_test.target)

7532

In [13]:
newsgroups = fetch_20newsgroups(subset='all')

In [15]:
print(list(newsgroups.target_names))

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [63]:
vectorizer = TfidfVectorizer(stop_words='english')
vectors = vectorizer.fit_transform(newsgroups.data)
vectors.shape

(18846, 173451)

In [64]:
classes = newsgroups.target
classes.shape

(18846,)

In [65]:
newsgroups.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR', 'description'])

In [66]:
print('total number of zeros:', vectors.nnz)
print('{} average zero per data sample'.format(vectors.nnz / float(vectors.shape[0])))

total number of zeros: 2142075
113.66205030245145 average zero per data sample


In [67]:
ratio = 0.1
indices = np.random.permutation(vectors.shape[0])
split1 = int(np.floor(ratio*vectors.shape[0]))
split2 = int(np.floor(ratio*vectors.shape[0]))*2
test_vectors = vectors[indices[:split1]]
valid_vectors = vectors[indices[split1:split2]]
train_vectors = vectors[indices[split2:]]

In [68]:
test_classes = classes[indices[:split1]]
valid_classes = classes[indices[split1:split2]]
train_classes = classes[indices[split2:]]

In [123]:
print(vectorizer.get_feature_names()[9000])

1994


In [85]:
tokenizer = vectorizer.build_tokenizer()

In [124]:
'1994' in tokenizer(newsgroups.data[indices[0]])

True

In [122]:
print(test_vectors.getrow(0))

  (0, 111509)	0.115033277103
  (0, 142795)	0.118532289583
  (0, 152659)	0.100697560918
  (0, 73484)	0.0922669673306
  (0, 136847)	0.100697560918
  (0, 58329)	0.0954580449945
  (0, 89293)	0.203342204814
  (0, 38627)	0.0897414963392
  (0, 103069)	0.0867271207017
  (0, 142362)	0.0870274514067
  (0, 103423)	0.0886515637655
  (0, 9000)	0.080753731646
  (0, 172977)	0.0974347262584
  (0, 100335)	0.0627250593986
  (0, 66584)	0.194869452517
  (0, 42440)	0.106602683515
  (0, 35545)	0.21320536703
  (0, 59596)	0.27157939166
  (0, 130528)	0.525850574437
  (0, 55153)	0.0848099152315
  (0, 117814)	0.0974347262584
  (0, 146480)	0.151765625773
  (0, 61870)	0.0718775653184
  (0, 80106)	0.142228985924
  (0, 159785)	0.250132039812
  :	:
  (0, 71842)	0.046293993626
  (0, 92244)	0.0499831374235
  (0, 166254)	0.0490374398542
  (0, 65020)	0.0300319712981
  (0, 122822)	0.140948398094
  (0, 13613)	0.0449986432282
  (0, 131520)	0.0386177944337
  (0, 65690)	0.0447233911447
  (0, 152674)	0.0295051005409
  (0, 1061