<a href="https://colab.research.google.com/github/bostelma/ATiML-Project/blob/master/TopicModellingClass.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from sklearn.model_selection import StratifiedShuffleSplit
import gensim
from gensim import corpora
import numpy as np
import random

In [12]:
class TopicModeller():
  """Turns tokenised books into fixed-length LDA topic-weight feature vectors.

  Typical use: call ``loadData`` once, optionally tune the model with the
  setters, then call ``getFeatures`` with the train/test indices of a split.

  Attributes:
    genres: genre label per book (array loaded by ``loadData``).
    books: token list per book (array loaded by ``loadData``).
    NUM_TOPICS: number of LDA topics, i.e. the length of every feature vector.
    PERCENTAGE: probability with which a ``DOWNSAMPLED_GENRE`` training book
      is kept when the topics are fitted.
    SEED: optional seed for the sub-sampling and the LDA fit; ``None`` (the
      default) leaves both unseeded.
  """

  # Genre that is randomly thinned out before the topics are fitted
  # (presumably because it dominates the data set -- TODO confirm).
  DOWNSAMPLED_GENRE = 'Literary'
  # Number of words shown per topic in the printed topic summary.
  NUM_WORDS = 4
  # Class-level default so that ``self.SEED`` always resolves.
  SEED = None

  def __init__( self ):
    """Create an empty modeller with the default hyper-parameters."""
    self.genres = []
    self.books = []
    self.NUM_TOPICS = 10
    self.PERCENTAGE = 0.2
    self.SEED = None

  def setNumberOfTopics( self, value ):
    """Set the number of LDA topics (and therefore the feature vector length)."""
    self.NUM_TOPICS = value

  def setPercentage( self, value ):
    """Set the probability of keeping a ``DOWNSAMPLED_GENRE`` training book."""
    self.PERCENTAGE = value

  def setSeed( self, value ):
    """Set the seed used for sub-sampling and LDA training.

    Pass ``None`` to restore the unseeded behaviour.
    """
    self.SEED = value

  def loadData( self, path ):
    """Load the genres array followed by the books array from one file.

    The file must contain two consecutive ``np.save`` records: first the
    genres, then the books.

    Args:
      path: location of the file to read.
    """
    # NOTE(security): allow_pickle=True unpickles arbitrary Python objects,
    # which can execute code. Only load files from a trusted source.
    with open(path, 'rb') as f:
      self.genres = np.load(f, allow_pickle=True)
      self.books = np.load(f, allow_pickle=True)

  def getFeatures( self, trainIndices, testIndices ):
    """Fit LDA topics on the training books and featurise both partitions.

    The topics are fitted on the training books only, after thinning out the
    ``DOWNSAMPLED_GENRE`` books. Every training and test book (including the
    ones skipped during fitting) is then converted into a vector of
    ``NUM_TOPICS`` topic weights. The generated topics are printed.

    Args:
      trainIndices: indices into ``books``/``genres`` forming the training set.
      testIndices: indices into ``books``/``genres`` forming the test set.

    Returns:
      ``(X_test, X_train, Y_test, Y_train)`` -- note the test-first order.
      The X values are lists of topic-weight vectors; the Y values are the
      genre labels of the corresponding partition, returned unprocessed.
    """
    books_train, books_test = self.books[trainIndices], self.books[testIndices]
    genres_train, genres_test = self.genres[trainIndices], self.genres[testIndices]

    fit_docs = self._subsampleForTopics( books_train, genres_train )

    # Create the topics
    dictionary = corpora.Dictionary( fit_docs )
    dictionary.filter_extremes(no_below=20, no_above=0.5)
    corpus = [ dictionary.doc2bow( text ) for text in fit_docs ]

    # Set training parameters.
    chunksize = 2000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make a index to word dictionary. The lookup is only there to "load"
    # the dictionary; its result is not needed.
    dictionary[0]
    id2word = dictionary.id2token

    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        chunksize=chunksize,
        alpha='auto',
        eta='auto',
        iterations=iterations,
        num_topics=self.NUM_TOPICS,
        passes=passes,
        eval_every=eval_every,
        random_state=self.SEED
    )

    # Print out the topics
    print("The following topics were generated:")
    for topic in ldamodel.print_topics( num_words=self.NUM_WORDS ):
      print( topic )

    # Both partitions go through exactly the same conversion.
    X_train = self._topicFeatures( books_train, dictionary, ldamodel )
    X_test = self._topicFeatures( books_test, dictionary, ldamodel )

    # TODO decide whether the labels have to be preprocessed as well.
    Y_train = genres_train
    Y_test = genres_test

    return X_test, X_train, Y_test, Y_train

  def _subsampleForTopics( self, books, genres ):
    """Return the books the topics are fitted on, thinning DOWNSAMPLED_GENRE."""
    # With a seed, a private generator makes every call repeatable and leaves
    # the global ``random`` state alone; without one, the module-level
    # generator is used.
    rng = random.Random( self.SEED ) if self.SEED is not None else random

    kept = []
    for book, genre in zip( books, genres ):
      # A random number is only drawn for books of the thinned-out genre.
      if genre == self.DOWNSAMPLED_GENRE and rng.random() > self.PERCENTAGE:
        continue
      kept.append( book )
    return kept

  def _topicFeatures( self, books, dictionary, ldamodel ):
    """Convert each book into a dense vector of NUM_TOPICS topic weights."""
    return [
      self._toDenseVector(
        ldamodel.get_document_topics( dictionary.doc2bow( book ) ),
        self.NUM_TOPICS
      )
      for book in books
    ]

  @staticmethod
  def _toDenseVector( sparse_topics, size ):
    """Expand (topic id, weight) pairs into a list of ``size`` weights."""
    # Topics missing from the pairs keep a weight of 0.0.
    dense = [0.0] * size
    for topic_id, weight in sparse_topics:
      dense[topic_id] = weight
    return dense

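Example usage: load the prepared tokens, generate stratified train/test splits, and build the topic features for the last split generated.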

In [14]:
# Build the modeller and read the prepared genres/books arrays.
modeller = TopicModeller()
modeller.loadData( 'prepared_tokens.npy' )

NUMBER_OF_SPLITS = 5
TEST_SIZE = 1 / 3

# Stratified shuffle splitter with a fixed random_state.
sss = StratifiedShuffleSplit( n_splits=NUMBER_OF_SPLITS, test_size=TEST_SIZE, random_state=0 )
splits = sss.split( modeller.books, modeller.genres )

# Walk through every split; only the indices of the last one are kept.
train_index, test_index = [], []
for tr, te in splits:
  train_index, test_index = tr, te

# getFeatures returns the test partition first.
X_test, X_train, Y_test, Y_train = modeller.getFeatures( train_index, test_index )

The following topics were generated:
(0, '0.005*"judge" + 0.004*"james" + 0.003*"hill" + 0.003*"flower"')
(1, '0.004*"uncle" + 0.004*"london" + 0.003*"england" + 0.002*"fortune"')
(2, '0.015*"ship" + 0.013*"captain" + 0.008*"deck" + 0.006*"boat"')
(3, '0.005*"aunt" + 0.004*"desert" + 0.003*"uncle" + 0.003*"lovely"')
(4, '0.006*"horse" + 0.004*"indian" + 0.002*"shadow" + 0.002*"spot"')
(5, '0.005*"captain" + 0.005*"detective" + 0.003*"count" + 0.003*"jewel"')
(6, '0.004*"grace" + 0.003*"hall" + 0.003*"american" + 0.003*"squire"')
(7, '0.003*"train" + 0.003*"detective" + 0.003*"towards" + 0.003*"shook"')
(8, '0.005*"mary" + 0.004*"lord" + 0.004*"robert" + 0.003*"towards"')
(9, '0.011*"john" + 0.008*"temple" + 0.006*"hall" + 0.004*"uncle"')
