## Sentiment analysis of sci-kit learn's 20 newsgroups



In [1]:
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.datasets import fetch_20newsgroups
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


# Choose a few categories or all categories
categories=['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
#categories=None

newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)



In [2]:
# The training data comes as a list of Strings. This shows an example of a such String.
documents = newsgroups_train.data

documents[0]



In [3]:
# Names of the targets (news groups)
target_names = newsgroups_train.target_names
target_names

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']

In [4]:
# Prepare the documents by setting them to lowercase and tokenize them. 
documents = [d.lower() for d in documents]
documents = [word_tokenize(d) for d in documents]

documents[0][:20]

['from',
 ':',
 'rych',
 '@',
 'festival.ed.ac.uk',
 '(',
 'r',
 'hawkes',
 ')',
 'subject',
 ':',
 '3ds',
 ':',
 'where',
 'did',
 'all',
 'the',
 'texture',
 'rules',
 'go']

In [5]:
# Use the gensim Word2Vec class to train an embedding.

vector_dim = 200

model = Word2Vec(
    documents,
    size=vector_dim,
    window=3,
    min_count=2,
    workers=5)
model.train(documents, total_examples=len(documents), epochs=10)

(5480217, 7873910)

In [6]:
# Grab the word vectors. These will work as the embeddings in the neural network.
WordVectors = model[model.wv.vocab]

# vocab is a dictionary with the vocabulary defined by the Word2Vec model as keys.
vocab = model.wv.vocab

  from ipykernel import kernelapp as app


In [7]:
# Change the documents into lists of integers where each integer is the index of the word in the embedding.
# The lists are sequences must all have the same length, MAX_SEQUENCE_LENGTH, so those that are too long
# are truncated while those that are too short are padded with zeros.
MAX_SEQUENCE_LENGTH = 200

word_index = {t: i for i,t in enumerate(list(vocab))}
sequences = [[word_index.get(t,0) for t in document] for document in documents]
X_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")

# Prepare the targets by giving them a one hot encoding.
y_train = newsgroups_train.target
#y_train = to_categorical(y_train)

In [8]:
# Prepare the test data

test_docs = newsgroups_test.data
test_label = newsgroups_test.target

test_docs = [d.lower() for d in test_docs]
test_docs = [word_tokenize(d) for d in test_docs]
    
test_sequences = [[word_index.get(t,0) for t in document] for document in test_docs]
X_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")

y_test = test_label
#y_test = to_categorical(test_label)

In [9]:
X_train[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         1,   2,   3,   4,   5,   6,   7,   8,   9,   1,  10,   1,  11,
        12,  13,  14,  15,  16,  17,  18,  19,   1,  20,  21,  22,  23,
        24,  25,  26,  27,  28,  29,  30,  31,  32,   5,  33,  13,  34,
        35,  36,  37,  38,   8,  39,  31,  40,  41,  26,  42,  28,  43,
        44,  45,   0,  10,  22,  46,  47,  48,  31,  49,  50,  51,  52,
        53,  54,  27,  28,  30,  39,  31,  55,  41,  56,   0,  47,  57,
        53,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  14,  40,
        41,  18,  68,  64,  69,  70,  67,  14,  71,  72,  73,  15,  16,
        67,  14,  55,  41,  53,  23,  74,  75,  39,  76,  77,  39,  78,
        14,  15,  79,  63,  22,  58,  59,  80,  14,  81,  82,  14,  55,
        41,  18,  64,  14,   0,  41,  81,  83,   0,  84,  18,   2,  85,
         0,   7,  86,   1,   2,   3,   4,  87,  88,  89,  90,  5

In [10]:
test_label[0:10]

array([2, 1, 1, 1, 1, 1, 2, 2, 0, 2], dtype=int64)

In [11]:
documents[3]

['from',
 ':',
 'dpw',
 '@',
 'sei.cmu.edu',
 '(',
 'david',
 'wood',
 ')',
 'subject',
 ':',
 'request',
 'for',
 'support',
 'organization',
 ':',
 'software',
 'engineering',
 'institute',
 'lines',
 ':',
 '35',
 'i',
 'have',
 'a',
 'request',
 'for',
 'those',
 'who',
 'would',
 'like',
 'to',
 'see',
 'charley',
 'wingate',
 'respond',
 'to',
 'the',
 '``',
 'charley',
 'challenges',
 "''",
 '(',
 'and',
 'judging',
 'from',
 'my',
 'e-mail',
 ',',
 'there',
 'appear',
 'to',
 'be',
 'quite',
 'a',
 'few',
 'of',
 'you',
 '.',
 ')',
 'it',
 'is',
 'clear',
 'that',
 'mr.',
 'wingate',
 'intends',
 'to',
 'continue',
 'to',
 'post',
 'tangential',
 'or',
 'unrelated',
 'articles',
 'while',
 'ingoring',
 'the',
 'challenges',
 'themselves',
 '.',
 'between',
 'the',
 'last',
 'two',
 're-postings',
 'of',
 'the',
 'challenges',
 ',',
 'i',
 'noted',
 'perhaps',
 'a',
 'dozen',
 'or',
 'more',
 'posts',
 'by',
 'mr.',
 'wingate',
 ',',
 'none',
 'of',
 'which',
 'answered',
 'a',
 

In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, LSTM, Dropout

k_model = Sequential()

k_model.add(Embedding(len(vocab), vector_dim, weights=[WordVectors], 
                      input_length=MAX_SEQUENCE_LENGTH, trainable=False))

k_model.add(LSTM(vector_dim, return_sequences=True))
#k_model.add(Dropout(0.2))

k_model.add(LSTM(vector_dim, return_sequences=True))
#k_model.add(Dropout(0.2))

k_model.add(Flatten())

k_model.add(Dense(512, activation=tf.nn.relu))

#k_model.add(Dense(512, activation=tf.nn.relu))
k_model.add(Dense(len(target_names), activation=tf.nn.softmax))

In [15]:
k_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

k_model.fit(X_train, y_train, epochs=10, batch_size=32, verbose = 1, validation_data=[X_test, y_test])

Train on 2034 samples, validate on 1353 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1a6fabe6c88>