In [1]:
import sklearn

import numpy as np
import pandas as pd
# import jtplot submodule from jupyterthemes
from jupyterthemes import jtplot
# currently installed theme will be used to
# set plot style if no arguments provided
jtplot.style()

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
newsgroups_data = fetch_20newsgroups()

In [4]:
newsgroups_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
print(newsgroups_data.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features       

In [6]:
newsgroups_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [7]:
print(newsgroups_data.data[4])

From: jcm@head-cfa.harvard.edu (Jonathan McDowell)
Subject: Re: Shuttle Launch Question
Organization: Smithsonian Astrophysical Observatory, Cambridge, MA,  USA
Distribution: sci
Lines: 23

From article <C5owCB.n3p@world.std.com>, by tombaker@world.std.com (Tom A Baker):
>>In article <C5JLwx.4H9.1@cs.cmu.edu>, ETRAT@ttacs1.ttu.edu (Pack Rat) writes...
>>>errors. ...".  I am wondering what an "expected error" might
>>>be.  Sorry if this is a really dumb question, but
> 
> Parity errors in memory or previously known conditions that were waivered.
>    "Yes that is an error, but we already knew about it"
> I'd be curious as to what the real meaning of the quote is.
> 
> tom


My understanding is that the 'expected errors' are basically
that don't have the right values in yet because they aren't
set till after launch, and suchlike. Rather than fix the code
and possibly introduce new bugs, they just tell the crew

 - Jonathan





In [8]:
print(newsgroups_data.target[4])

14


In [9]:
np.unique(newsgroups_data.target)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [10]:
len(newsgroups_data.data), len(newsgroups_data.target)

(11314, 11314)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf_vect = TfidfVectorizer(stop_words='english')

newsgroups_data_transformed = tfidf_vect.fit_transform(newsgroups_data.data)

In [13]:
newsgroups_data_transformed.shape

(11314, 129796)

In [14]:
len(tfidf_vect.get_feature_names())

129796

In [15]:
import random

random.sample(tfidf_vect.vocabulary_.items(), 10)

[('3imrmc8rchz', 12910),
 ('lyourk', 77386),
 ('firstborn', 55432),
 ('cs225a82', 43690),
 ('c528he', 36373),
 ('13255', 3912),
 ('wums', 125131),
 ('deck', 45608),
 ('b9rnkjz', 31270),
 ('futserv', 57285)]

In [16]:
print(newsgroups_data_transformed[0])

  (0, 86416)	0.14330464297977982
  (0, 35135)	0.10188109676312235
  (0, 65968)	0.10658183340971177
  (0, 114195)	0.06002582888934523
  (0, 78809)	0.06524029473980168
  (0, 76578)	0.0752490171119318
  (0, 57203)	0.16977226500364592
  (0, 67023)	0.07965653370342658
  (0, 63238)	0.09086750717799585
  (0, 95944)	0.11792442679286105
  (0, 127721)	0.0660283455431985
  (0, 109044)	0.11811852219269026
  (0, 51651)	0.10581100308545811
  (0, 83103)	0.09633120317294654
  (0, 113755)	0.1926949257821117
  (0, 73061)	0.04662587301170703
  (0, 34131)	0.09493746671845804
  (0, 101175)	0.08899924936054199
  (0, 105907)	0.10749912859686628
  (0, 35560)	0.1446512460011004
  (0, 26070)	0.10385185139503332
  (0, 108033)	0.08197182211166716
  (0, 99619)	0.06171903092868097
  (0, 48552)	0.1263844988551673
  (0, 34943)	0.18203649549572573
  :	:
  (0, 76574)	0.09842306773884467
  (0, 109354)	0.11773212031617089
  (0, 48550)	0.10908149802523066
  (0, 45232)	0.07212208178051426
  (0, 104609)	0.09217540920934716


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
x_train, x_test, y_train, y_test = train_test_split(newsgroups_data_transformed, 
                                                    newsgroups_data.target, 
                                                    shuffle=True,
                                                    test_size = 0.2)

In [19]:
x_train.shape, y_train.shape

((9051, 129796), (9051,))

In [20]:
x_test.shape, y_test.shape

((2263, 129796), (2263,))

In [21]:
from sklearn.neural_network import MLPClassifier

In [22]:
mlp_clf = MLPClassifier(activation = 'relu',
                    hidden_layer_sizes= (32,), 
                    solver='adam', 
                    verbose=True,
                    max_iter=50)

In [23]:
mlp_clf.fit(x_train, y_train)

Iteration 1, loss = 2.91979691
Iteration 2, loss = 2.54876744
Iteration 3, loss = 2.05052709
Iteration 4, loss = 1.50769230
Iteration 5, loss = 1.03353860
Iteration 6, loss = 0.69317933
Iteration 7, loss = 0.47376976
Iteration 8, loss = 0.33521953
Iteration 9, loss = 0.24568105
Iteration 10, loss = 0.18590373
Iteration 11, loss = 0.14458157
Iteration 12, loss = 0.11527316
Iteration 13, loss = 0.09384652
Iteration 14, loss = 0.07792543
Iteration 15, loss = 0.06573139
Iteration 16, loss = 0.05627018
Iteration 17, loss = 0.04889857
Iteration 18, loss = 0.04291646
Iteration 19, loss = 0.03820970
Iteration 20, loss = 0.03424047
Iteration 21, loss = 0.03096366
Iteration 22, loss = 0.02829845
Iteration 23, loss = 0.02601416
Iteration 24, loss = 0.02399142
Iteration 25, loss = 0.02234809
Iteration 26, loss = 0.02091053
Iteration 27, loss = 0.01955650
Iteration 28, loss = 0.01848034
Iteration 29, loss = 0.01753514
Iteration 30, loss = 0.01662508
Iteration 31, loss = 0.01585798
Iteration 32, los



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(32,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=50,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=True,
              warm_start=False)

In [24]:
y_pred = mlp_clf.predict(x_test)

In [25]:
pred_results = pd.DataFrame({'y_test': y_test,
                             'y_pred': y_pred})

pred_results.sample(10)

Unnamed: 0,y_test,y_pred
371,14,14
70,0,0
1408,13,13
2004,10,10
1905,7,7
2113,0,0
2189,10,10
177,12,3
24,18,18
1929,11,11


In [26]:
newsgroups_data.target_names[11]

'sci.crypt'

In [27]:
newsgroups_data_crosstab = pd.crosstab(pred_results.y_test, pred_results.y_pred)

newsgroups_data_crosstab

y_pred,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
y_test,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,96,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2
1,0,99,5,4,1,1,3,1,0,0,0,0,0,0,2,0,0,0,0,0
2,0,4,94,6,0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,6,2,98,5,0,2,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,2,0,8,108,0,1,0,0,0,0,0,2,0,0,0,0,0,0,0
5,0,1,2,3,0,118,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,1,3,1,0,93,0,1,1,1,0,0,0,0,0,0,0,0,0
7,0,0,0,3,0,0,3,101,2,0,0,0,1,0,0,0,0,0,0,0
8,0,1,0,0,0,1,0,2,113,0,0,0,0,0,0,0,0,0,0,0
9,0,1,0,0,0,0,1,0,0,121,0,1,0,0,0,0,0,0,0,0


In [28]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.9288555015466196