In [1]:
from torchtext import data, datasets
from gensim import models, corpora
from sklearn.metrics import average_precision_score, f1_score
import numpy as np
import pandas as pd
from tqdm import tqdm
from datasets_loader import dataset_loader
import pickle

DATASET = 'agn'
VERSION = 'v2'

In [2]:
data_loader = dataset_loader(None, None)

In [4]:
# Get dataset (can be obtained from https://github.com/mhjabreel/CharCNN/tree/master/data/ag_news_csv or 
# from https://www.di.unipi.it/~gulli/AG_corpus_of_news_articles.html)

train, test, text_field, label_field = data_loader.get_dataset(DATASET)

print('Train length:',str(len(train)))
print('Test length:',str(len(test)))

Train length: 120000
Test length: 7600


### Classes correspondence in AG News
* 1 - World
* 2 - Sports
* 3 - Business
* 4 - Science and technology

In [None]:
def get_tokenized_data(dataset):
    return [example.text for example in dataset.examples]

tokenized_data_train = get_tokenized_data(train)
tokenized_data_test = get_tokenized_data(test)

In [None]:
def get_labels(dataset):
    return np.array( [example.label for example in dataset.examples] )
labels_test = get_labels(test)

In [None]:
num_topics = len(label_field.vocab)

# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data_train)
 
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data_train]

# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
corpus_test = [dictionary.doc2bow(text) for text in tokenized_data_test]

In [None]:
# Get topics probabilities
def get_topics_probabilities(lda_model):
    topic_dist_test = lda_model[corpus_test]
    results = []
    for lda_topic in range(num_topics):
         for real_topic in label_field.vocab.stoi.keys():
                y_true = (labels_test == real_topic)
                support = sum(y_true)
                y_score = [dict(probs)[lda_topic] if lda_topic in dict(probs).keys() else 0. 
                           for probs in np.array(topic_dist_test) ]
                pr_auc = average_precision_score(y_true=y_true, y_score=y_score)          
                results.append([lda_topic, real_topic, pr_auc, support])

    results_df = pd.DataFrame(results, columns=['lda_topic','real_topic','pr_auc','support']).sort_values('pr_auc', ascending=False)
    return results_df

results_df = get_topics_probabilities(lda_model)

In [None]:
def get_best_model_and_topics(results_df):
    classes = dict(label_field.vocab.stoi)
    model_perf_data = []
    for row in results_df.values:
        class_ = row[1]
        if class_ in classes.keys():
            lda_topic = row[0]
            pr_auc = row[2]
            support = row[3]
            model_perf_data.append([class_, lda_topic, pr_auc, support])
            classes.pop(class_)
    model_perf_df = pd.DataFrame( model_perf_data )
    mean_perf = model_perf_df[2].mean()
    return model_perf_df, mean_perf
    
model_perf_df, mean_perf = get_best_model_and_topics(results_df)
print(model_perf_df)
print('Mean performance:', mean_perf)

   0  1         2     3
0  1  1  0.874956  1900
1  2  0  0.860896  1900
2  4  2  0.643909  1900
3  3  3  0.473910  1900
Mean performance: 0.7134179210399378


In [None]:
alphas = [0.1, 0.25, 0.5, 0.75, 1, 1.25 ]
etas = [0.1, 0.25, 0.5, 0.75, 1, 1.25 ]
model_perf_df_results = {}
mean_perf_results = {}
all_models = {}

best_perf = 0

for alpha in tqdm(alphas):
    for eta in etas:
        lda_model = models.LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=5, distributed=False, alpha=alpha, eta=eta)
        results_df = get_topics_probabilities(lda_model)
        model_perf_df, mean_perf = get_best_model_and_topics(results_df)
        model_perf_df_results[str([alpha, eta])] = model_perf_df
        mean_perf_results[str([alpha, eta])] = mean_perf
        all_models[str([alpha, eta])] = lda_model
        
        if mean_perf > best_perf: 
            best_perf = mean_perf
            #lda_model.save('best_model.pickle')
            print('->',alpha, eta, best_perf)
        else:
            print(alpha, eta, mean_perf)
        
        
        pickle.dump( model_perf_df_results, open('model_perf_df_results_'+DATASET+'_'+VERSION+'.pickle', 'wb'))
        pickle.dump( mean_perf_results, open('mean_perf_results_'+DATASET+'_'+VERSION+'.pickle', 'wb'))
        pickle.dump( all_models, open('all_models_'+DATASET+'_'+VERSION+'.pickle', 'wb'))


  0%|          | 0/6 [00:00<?, ?it/s]

-> 0.1 0.1 0.6156956371946853
-> 0.1 0.25 0.6709602612284461
-> 0.1 0.5 0.8551500256441416
-> 0.1 0.75 0.8797275744452677
0.1 1 0.7996934359227011


 17%|█▋        | 1/6 [46:01<3:50:09, 2761.91s/it]

0.1 1.25 0.7614413543488324
0.25 0.1 0.8540918321709707
-> 0.25 0.25 0.888168441946272
