In [None]:
from sklearn.datasets import fetch_20newsgroups

In [None]:
# Fetch the 'train' and 'test' subsets of the 20 Newsgroups corpus
# (train first, then test).
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)
# Documents (.data) and their target labels (.target) for each subset.
x_train, x_test = newsgroups_train.data, newsgroups_test.data
y_train, y_test = newsgroups_train.target, newsgroups_test.target

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [None]:
# Show every category name, followed by the two blank lines the original
# layout produced.
print("List of all 20 categories:", newsgroups_train.target_names, "\n", sep="\n")
# Show the first training document, its numeric label, and the category name
# that label indexes in target_names.
print(
    "Sample Email:",
    x_train[0],
    "Sample Target Category:",
    y_train[0],
    newsgroups_train.target_names[y_train[0]],
    sep="\n",
)

List of all 20 categories:
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


Sample Email:
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is

In [None]:
# Imports used for pre-processing the text data

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import pandas as pd
from nltk import pos_tag
from nltk.stem import PorterStemmer

In [None]:
def preprocessing(text):
  """Normalize one raw document into a space-separated string of lemmas.

  Steps, in order: replace punctuation with spaces and collapse whitespace,
  tokenize, lowercase, drop English stop words, drop tokens shorter than three
  characters, Porter-stem, POS-tag, and lemmatize each token.

  Args:
    text: the raw document as a string.

  Returns:
    The processed tokens joined by single spaces (an empty string when no
    token survives the filters).
  """
  # Replace every punctuation character with a space, then collapse runs of
  # whitespace into single spaces.
  text2 = " ".join("".join([" " if ch in string.punctuation else ch for ch in text]).split())
  tokens = [word for sent in nltk.sent_tokenize(text2) for word in nltk.word_tokenize(sent)]
  tokens = [word.lower() for word in tokens]
  # A set gives constant-time membership tests for the stop-word filter.
  stopwds = set(stopwords.words('english'))
  tokens = [token for token in tokens if token not in stopwds]
  tokens = [word for word in tokens if len(word)>=3]
  stemmer = PorterStemmer()
  tokens = [stemmer.stem(word) for word in tokens]
  tagged_corpus = pos_tag(tokens)
  # One lemmatizer instance is shared by every token of the document.
  lemmatizer = WordNetLemmatizer()
  pre_proc_text = " ".join([prat_lemmatize(token,tag,lemmatizer) for token,tag in tagged_corpus])
  return pre_proc_text

def prat_lemmatize(token,tag,lemmatizer=None):
  """Lemmatize a single POS-tagged token.

  Args:
    token: the word to lemmatize.
    tag: the POS tag paired with the token by `pos_tag`.
    lemmatizer: object exposing `lemmatize(word, pos)`; a new
      `WordNetLemmatizer` is created when omitted.

  Returns:
    The lemma of `token`, computed with the verb POS ('v') for verb tags and
    the noun POS ('n') for every other tag.
  """
  Verb_tags = ('VB','VBD','VBG','VBN','VBP','VBZ')
  if lemmatizer is None:
    lemmatizer = WordNetLemmatizer()
  # Noun tags and unrecognized tags are both lemmatized as nouns, so only
  # verb tags need a distinct branch.
  pos = 'v' if tag in Verb_tags else 'n'
  return lemmatizer.lemmatize(token,pos)

In [None]:
import nltk
# NLTK resources used by the pre-processing code: 'punkt' for
# sent_tokenize/word_tokenize, 'stopwords' for the English stop-word list,
# 'wordnet' for WordNetLemmatizer, and 'averaged_perceptron_tagger' for pos_tag.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Run every training and test document through preprocessing(), keeping the
# results in the same order as the inputs.
x_train_preprocessed = [preprocessing(document) for document in x_train]
x_test_preprocessed = [preprocessing(document) for document in x_test]

In [None]:
# Build the TF-IDF vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams; terms must occur in at least two
# documents, and the vocabulary is capped at 10000 features.
vectorizer = TfidfVectorizer(
    min_df=2,
    ngram_range=(1, 2),
    stop_words='english',
    max_features=10000,
    strip_accents='unicode',
    norm='l2',
)


In [None]:
# Preview only the first few processed documents; printing the full list
# floods the notebook output.
print(x_train_preprocessed[:5])

[None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, Non

In [None]:
# Replace any None entry with an empty string, element by element, so the
# training corpus stays a list instead of being overwritten by a single ''.
x_train_preprocessed = ["" if value is None else value for value in x_train_preprocessed]
# Display a small slice rather than the whole corpus.
x_train_preprocessed[:5]

''

In [None]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import Adadelta,Adam,RMSprop
from keras.utils import np_utils

In [None]:
# Seed NumPy's global random generator.
np.random.seed(1337)

# Number of target classes, mini-batch size, and number of training epochs.
nb_classes, batch_size, nb_epochs = 20, 64, 20

# Convert the training labels to categorical form with nb_classes classes.
Y_train = np_utils.to_categorical(y_train, nb_classes)

In [None]:
# Fully connected classifier: 10000 inputs -> 1000 -> 500 -> 50 -> nb_classes.
# NOTE(review): the input width of 10000 presumably has to match the number of
# features produced by the TF-IDF vectorizer (max_features=10000) - confirm.
model = Sequential()
model.add(Dense(1000,input_shape= (10000,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(500))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# Output layer: one unit per class, with softmax activation.
model.add(Dense(nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" line.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              10001000  
_________________________________________________________________
activation (Activation)      (None, 1000)              0         
_________________________________________________________________
dropout (Dropout)            (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 500)               500500    
_________________________________________________________________
activation_1 (Activation)    (None, 500)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 50)                2