In [1]:
import warnings
# Ignore warnings whose category is DeprecationWarning.
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
from gensim import corpora
import pickle
from pathlib import Path
from io import FileIO
import pyLDAvis.gensim
from gensim.models import CoherenceModel, LdaModel, LdaMulticore
import pandas as pd
from youbemom import create_connection
import csv
import random
import numpy as np
import math
import os

## Functions

Helpers for loading the input data and for saving models and coherence results

In [3]:
def load_data():
    """Load the lemmatized text, the corpus and the dictionary from disk.

    The file locations come from the module-level templates
    ``path_lemma_pkl``, ``path_corpus_pkl`` and ``path_dictionary_gensim``,
    each formatted with the module-level ``forum`` and ``group`` values.

    :returns: A tuple ``(lemmatized_text, corpus, dictionary)``.
    """
    # NOTE: pickle.load can execute arbitrary code, so only point these
    # paths at files produced by a trusted pipeline.
    # The context managers make sure both file handles are closed promptly.
    with open(path_lemma_pkl.format(forum, group), 'rb') as f:
        lemmatized_text = pickle.load(f)
    with open(path_corpus_pkl.format(forum, group), 'rb') as f:
        corpus = pickle.load(f)
    dictionary = corpora.Dictionary.load(path_dictionary_gensim.format(forum, group))
    return lemmatized_text, corpus, dictionary

  and should_run_async(code)


In [4]:
def save_model(model):
    """Save ``model`` to the file named by the ``path_saved_model`` template.

    :param model: An object exposing ``save(path)``; in this notebook it is
                  the model returned by ``get_cost``.
    """
    # The template contains three placeholders (forum, group and round), so
    # formatting it with only two values raises IndexError. str.format
    # ignores surplus positional arguments, so passing ``r`` is also safe
    # with a template that only uses {0} and {1}.
    model.save(path_saved_model.format(forum, group, r))

  and should_run_async(code)


In [5]:
def save_coherence(alpha, beta, topics, iterations, coherence):
    """Append one evaluated parameter set and its coherence to the CSV log.

    :param alpha: The alpha value used for the model.
    :param beta: The beta (gensim ``eta``) value used for the model.
    :param topics: The number of topics used for the model.
    :param iterations: The number of passes used for the model.
    :param coherence: The coherence score obtained with these parameters.
    """
    # The template contains three placeholders (forum, group and round), so
    # all three values are supplied; surplus positional arguments are ignored
    # by str.format if the template uses fewer.
    log_path = path_saved_parameters.format(forum, group, r)
    # newline='' is what the csv module expects for files it writes to; it
    # prevents extra blank rows on platforms that translate line endings.
    with open(log_path, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([alpha, beta, topics, iterations, coherence])

  and should_run_async(code)


Helpers for simulated annealing (SA): state initialization, neighbor generation and cost evaluation

In [6]:
# initial state
def initialize_state(lower_bounds, upper_bounds):
    """Draw a random state with each value inside its own bounds.

    One random fraction in [0, 1) is drawn per entry of ``lower_bounds``;
    each value is then placed that fraction of the way from its lower bound
    to its upper bound.

    :param lower_bounds: A list with the lower bound of every dimension.
    :param upper_bounds: A list with the upper bound of every dimension.
    :returns: A list with one value per (lower, upper) pair.
    """
    fractions = [random.random() for _ in lower_bounds]
    state = []
    for lo, hi, frac in zip(lower_bounds, upper_bounds, fractions):
        state.append(lo + frac * (hi - lo))
    return state

  and should_run_async(code)


In [7]:
# get neighbors
def get_neighbor(state, eta, indpb, low, up):
    """Polynomial mutation as implemented in original NSGA-II algorithm in
    C by Deb.
    :param state: a list of values to be mutated. It is not modified; a
                  mutated copy is returned.
    :param eta: Crowding degree of the mutation. A high eta will produce
                a mutant resembling its parent, while a small eta will
                produce a solution much more different.
    :param indpb: Probability of the mutation for each gene. A high p
                  will make genes more likely to mutate and a low p
                  will make genes less likely to mutate.
    :param low: A list of values that is the lower bound of the search
                space.
    :param up: A list of values that is the upper bound of the
               search space.
    :returns: A list of the mutated state within the given bounds.
    adapted from:
    https://github.com/DEAP/deap/blob/master/deap/tools/mutation.py
    """
    new_state = state.copy()
    # The exponent depends only on eta, so it is computed once.
    mut_pow = 1.0 / (eta + 1.)

    for i, xl, xu in zip(range(len(new_state)), low, up):
        # Each gene mutates independently with probability indpb.
        if random.random() > indpb:
            continue

        x = new_state[i]
        if xu == xl:
            # A zero-width range leaves nothing to mutate and would divide
            # by zero below; pin the gene to its only admissible value.
            new_state[i] = min(max(x, xl), xu)
            continue

        # Normalized distances of x from the lower and the upper bound.
        delta_1 = (x - xl) / (xu - xl)
        delta_2 = (xu - x) / (xu - xl)
        rand = random.random()

        if rand < 0.5:
            # Perturb towards the lower bound.
            xy = 1.0 - delta_1
            val = 2.0 * rand + (1.0 - 2.0 * rand) * xy ** (eta + 1)
            delta_q = val ** mut_pow - 1.0
        else:
            # Perturb towards the upper bound.
            xy = 1.0 - delta_2
            val = 2.0 * (1.0 - rand) + 2.0 * (rand - 0.5) * xy ** (eta + 1)
            delta_q = 1.0 - val ** mut_pow

        x = x + delta_q * (xu - xl)
        # Clamp so the mutated value never leaves the search space.
        new_state[i] = min(max(x, xl), xu)

    return new_state

  and should_run_async(code)


In [8]:
# get cost
def get_cost(state):
    """Train an LDA model for ``state`` and score it with c_v coherence.

    The state is printed, the model is trained on the module-level
    ``corpus`` / ``dictionary`` with ``w`` workers, and the parameters and
    the resulting coherence are appended to the log via ``save_coherence``.

    :param state: A sequence ``[number of topics, alpha, beta]``; the number
                  of topics is rounded to the nearest integer.
    :returns: A tuple ``(model, coherence)``.
    """
    print(state)
    topic_count = int(round(state[0]))
    passes = 50  # fixed; not taken from the state
    alpha_value = state[1]
    beta_value = state[2]

    # Fit the topic model with the candidate hyperparameters.
    lda = LdaMulticore(
        corpus,
        num_topics=topic_count,
        id2word=dictionary,
        passes=passes,
        alpha=alpha_value,
        eta=beta_value,
        workers=w,
    )

    # Score the fitted model.
    score = CoherenceModel(
        model=lda,
        texts=lemmatized_text,
        dictionary=dictionary,
        coherence='c_v',
    ).get_coherence()

    save_coherence(alpha_value, beta_value, topic_count, passes, score)
    return lda, score

  and should_run_async(code)


## Path

In [9]:
# All data paths are built relative to the parent of the current working
# directory.
p = Path.cwd()
path_parent = p.parents[0]

  and should_run_async(code)


In [22]:
# Path templates, filled in later with str.format.
# The three input templates use two placeholders ({0} and {1}); the saved
# model and saved parameter templates also contain a third one, "round_{2}",
# so their .format() calls must supply three values.
path_lemma_pkl = str(path_parent / "clean_data" / "lemmatized_text_{0}_{1}.pkl")
path_corpus_pkl = str(path_parent / "clean_data" / "corpus_{0}_{1}.pkl")
path_dictionary_gensim = str(path_parent / "clean_data" / "dictionary_{0}_{1}.gensim")
path_saved_model = str(path_parent / "clean_data" / "ldamodels" / "model_sa_tune_{0}_{1}_round_{2}.gensim")
path_saved_parameters = str(path_parent / "clean_data" / "params_sa_tune_{0}_{1}_round_{2}.csv")

  and should_run_async(code)


## Load Data

In [23]:
# Values substituted into the path templates: {0} is the forum, {1} the group.
forum = "special-needs"
group = "parent"
# NOTE(review): presumably the tuning round number meant for the "round_{2}"
# slot of the saved model / saved parameter templates — confirm.
r = 2

  and should_run_async(code)


In [12]:
# Read the lemmatized text, the corpus and the dictionary for the selected
# forum and group; get_cost reads these as module-level names.
lemmatized_text, corpus, dictionary = load_data()

  and should_run_async(code)


## Initiate Values

In [13]:
# Seed Python's `random` module, which initialize_state, get_neighbor and the
# acceptance test in the annealing loop draw from.
# NOTE(review): get_cost calls LdaMulticore without a random_state, so this
# seed presumably does not make model training repeatable — confirm.
random.seed(384)

  and should_run_async(code)


In [14]:
# multicore workers: passed as `workers=w` to LdaMulticore in get_cost
w = 9

  and should_run_async(code)


In [15]:
# temperature
# The annealing loop runs while current_temp > final_temp and subtracts
# `delta` from current_temp after every iteration.
initial_temp = 100
final_temp = 0
delta = 1
current_temp = initial_temp

  and should_run_async(code)


In [16]:
# bounds
# The position of each entry matches how get_cost reads a state:
# 1. number of topics (int)  -> state[0], rounded to an integer in get_cost
# 2. alpha (float)           -> state[1]
# 3. beta (float)            -> state[2]
lower_bounds = [2, 0.001, 0.001]
upper_bounds = [20, 1.0, 1.0]

  and should_run_async(code)


In [17]:
# Draw a random starting point inside the bounds; current_state is the copy
# that the annealing loop replaces as it accepts neighbors.
initial_state = initialize_state(lower_bounds, upper_bounds)
current_state = initial_state.copy()
# NOTE(review): `solution` is not updated by any cell visible here — confirm
# whether it is meant to hold the best state found.
solution = current_state.copy()

  and should_run_async(code)


In [18]:
# Display the starting state.
# NOTE(review): the recorded output lists four values while the bounds cell
# defines three, so the output looks stale — re-run to confirm.
current_state

  and should_run_async(code)


[13.601302036499723, 85.98219375217231, 0.7828901238418017, 0.7412604389008841]

In [19]:
# Start this round with an empty parameter log: save_coherence opens the file
# in append mode, so rows from an earlier run would otherwise be kept.
# The template contains three placeholders (forum, group and round), so all
# three values are supplied; formatting it with two raises IndexError.
params_path = path_saved_parameters.format(forum, group, r)
if os.path.exists(params_path):
    os.remove(params_path)

  and should_run_async(code)


## Run SA

In [20]:
# Score the starting state: current_cost is the baseline the annealing loop
# compares every neighbor against, and the corresponding model is saved.
model, current_cost = get_cost(current_state)
save_model(model)

[13.601302036499723, 85.98219375217231, 0.7828901238418017, 0.7412604389008841]


  and should_run_async(code)


In [21]:
# Simulated annealing over [number of topics, alpha, beta].
# Mutation settings handed to get_neighbor.
eta = 20      # crowding degree: a high value keeps neighbors close to the parent
indpb = .25   # probability that each individual value is mutated

# The loop can accept worse states, so remember the best one seen.
best_state, best_cost = current_state.copy(), current_cost

while current_temp > final_temp:
    print("current temp: {}".format(current_temp))
    # Get a new neighbor that is different from current state
    neighbor = get_neighbor(current_state, eta, indpb, lower_bounds, upper_bounds)
    while neighbor == current_state:
        neighbor = get_neighbor(current_state, eta, indpb, lower_bounds, upper_bounds)
    print("using neighbor:")
    print(neighbor)

    # Evaluate the neighbor; a negative cost_diff means higher coherence
    model, neighbor_cost = get_cost(neighbor)
    cost_diff = current_cost - neighbor_cost

    # if the new solution is better (higher coherence), accept it
    if cost_diff < 0:
        current_state = neighbor
        current_cost = neighbor_cost
        save_model(model)
        print("cost diff: {}, new solution better".format(cost_diff))

    # if the new solution is not better (lower coherence), accept it with a
    # probability of e^(-cost/temp)
    else:
        print("cost diff: {}, new solution worse".format(cost_diff))
        # NOTE(review): the recorded cost differences are a few hundredths
        # while the temperature runs from 100 down to 1, so this probability
        # stays close to 1 and nearly every worse neighbor is accepted.
        # Consider rescaling the temperature to the size of cost_diff.
        if random.uniform(0, 1) < math.exp(-cost_diff / current_temp):
            current_state = neighbor
            current_cost = neighbor_cost
            save_model(model)
            print("accepting worse solution")

    # keep track of the highest-coherence state accepted so far
    if current_cost > best_cost:
        best_state, best_cost = current_state.copy(), current_cost

    # decrement the temperature (uses a linear cooling schedule)
    current_temp -= delta

current temp: 100
using neighbor:
[13.707305184806438, 85.98219375217231, 0.7828901238418017, 0.7359402896519851]
[13.707305184806438, 85.98219375217231, 0.7828901238418017, 0.7359402896519851]


  and should_run_async(code)


cost diff: -0.05583267626339983, new solution better
current temp: 99
using neighbor:
[14.236435092350439, 113.51477806831308, 0.7828901238418017, 0.7359402896519851]
[14.236435092350439, 113.51477806831308, 0.7828901238418017, 0.7359402896519851]
cost diff: 0.03125066997744386, new solution worse
accepting worse solution
current temp: 98
using neighbor:
[14.236435092350439, 115.78176593341166, 0.7828901238418017, 0.7359402896519851]
[14.236435092350439, 115.78176593341166, 0.7828901238418017, 0.7359402896519851]
cost diff: 0.029244135252064374, new solution worse
accepting worse solution
current temp: 97
using neighbor:
[14.236435092350439, 115.78176593341166, 0.7467759171514603, 0.7359402896519851]
[14.236435092350439, 115.78176593341166, 0.7467759171514603, 0.7359402896519851]
cost diff: 0.0020515486034466957, new solution worse
accepting worse solution
current temp: 96
using neighbor:
[14.236435092350439, 125.9892623776846, 0.7467759171514603, 0.7359402896519851]
[14.23643509235043

cost diff: 0.007438973583903774, new solution worse
accepting worse solution
current temp: 67
using neighbor:
[10.238442395199288, 145.27554597819534, 0.7943921156686315, 0.3747623769375453]
[10.238442395199288, 145.27554597819534, 0.7943921156686315, 0.3747623769375453]
cost diff: -0.02449566268156045, new solution better
current temp: 66
using neighbor:
[10.238442395199288, 142.13586837839716, 0.7943921156686315, 0.40628813348587745]
[10.238442395199288, 142.13586837839716, 0.7943921156686315, 0.40628813348587745]
cost diff: 0.006940224855874755, new solution worse
accepting worse solution
current temp: 65
using neighbor:
[11.142941001069884, 142.13586837839716, 0.7943921156686315, 0.40628813348587745]
[11.142941001069884, 142.13586837839716, 0.7943921156686315, 0.40628813348587745]
cost diff: 0.044091266220126824, new solution worse
accepting worse solution
current temp: 64
using neighbor:
[11.142941001069884, 128.2780072835411, 0.9166235912760725, 0.40628813348587745]
[11.142941001

cost diff: 0.038415640584732624, new solution worse
accepting worse solution
current temp: 35
using neighbor:
[15.36639504504538, 173.92391481224263, 0.8937067260118944, 0.4241589446654645]
[15.36639504504538, 173.92391481224263, 0.8937067260118944, 0.4241589446654645]
cost diff: -0.031175061936595194, new solution better
current temp: 34
using neighbor:
[15.44086584487334, 173.92391481224263, 0.8864782226334268, 0.4241589446654645]
[15.44086584487334, 173.92391481224263, 0.8864782226334268, 0.4241589446654645]
cost diff: -0.0256395072066542, new solution better
current temp: 33
using neighbor:
[15.44086584487334, 173.92391481224263, 0.8864782226334268, 0.49923310335304755]
[15.44086584487334, 173.92391481224263, 0.8864782226334268, 0.49923310335304755]
cost diff: -0.015515102856621166, new solution better
current temp: 32
using neighbor:
[16.729791040952968, 173.92391481224263, 0.8864782226334268, 0.5146309556857434]
[16.729791040952968, 173.92391481224263, 0.8864782226334268, 0.51463

cost diff: 0.012140024273350258, new solution worse
accepting worse solution
current temp: 3
using neighbor:
[17.003993801101036, 160.51506008882347, 0.8515303573546389, 0.3852957123336081]
[17.003993801101036, 160.51506008882347, 0.8515303573546389, 0.3852957123336081]
cost diff: -0.010445703090866343, new solution better
current temp: 2
using neighbor:
[17.003993801101036, 160.51506008882347, 0.8515303573546389, 0.3401930270597509]
[17.003993801101036, 160.51506008882347, 0.8515303573546389, 0.3401930270597509]
cost diff: 0.012675962763059156, new solution worse
accepting worse solution
current temp: 1
using neighbor:
[17.003993801101036, 160.51506008882347, 0.8887005651235037, 0.3401930270597509]
[17.003993801101036, 160.51506008882347, 0.8887005651235037, 0.3401930270597509]
cost diff: -0.03519908319058618, new solution better
