In [65]:
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest, f_classif, chi2

from sklearn.model_selection import train_test_split

from sklearn.utils import check_random_state
from sklearn.decomposition._online_lda import _dirichlet_expectation_2d

import utils as u

from lda_directed import DirectedLDA

from sklearn.preprocessing import MinMaxScaler

import scipy.sparse

from sklearn.svm import LinearSVC

In [2]:
df = pd.read_pickle('../pandas/lemma_delivered_merged_df.pkl')

In [3]:
df['is_nn'] = pd.read_pickle('../pandas/is_nn_full.pkl')

In [4]:
labeled_corpus = df[(df['agg_label'] != -1) & (df['is_nn'] == False)]
unlabeled_corpus = df[(df['agg_label'] == -1) & (df['is_nn'] == False)]

target = 'agg_label'
text = 'lemma_delivered'
fasttext = 'ft'
numeric = ['raw_len', 'raw_word_count']

In [5]:
train_X, vali_X, train_y, vali_y = train_test_split(
    labeled_corpus,
    labeled_corpus[target],
    test_size=0.4,
    random_state=1,
    stratify=labeled_corpus[target])

test_X, validation_X, test_y, validation_y = train_test_split(
    vali_X,
    vali_y,
    test_size=0.5,
    random_state=1,
    stratify=vali_y)

In [30]:
def direct_score_vocab(train_X, train_y, n_top_words=5400):

    vec = CountVectorizer(**{'lowercase': False, 'max_df': 0.25})
    vec_train_X = vec.fit_transform(train_X)

    score, _ = chi2(vec_train_X, train_y)

    vocab = vec.get_feature_names()

    result = [vocab[i] for i in score.argsort()[:-n_top_words - 1:-1]]
    
    return result

In [79]:
direct_vocab = direct_score_vocab(train_X[text], train_y, n_top_words=35000)

In [33]:
len(direct_vocab)

10000

In [84]:
class_vocabs = u.get_vocabs(train_X[text], train_y, CountVectorizer(**{'lowercase': False, 'max_df': 0.25}))
merged_vocabs = u.merge_vocabs_by_score(class_vocabs, train_X[text], train_y, 
                                      CountVectorizer, {'lowercase': False, 'max_df': 0.25}, 
                                      chi2, 700)

In [85]:
len(merged_vocabs)

17415

In [34]:
vec = TfidfVectorizer(**{'lowercase': False, 'max_df': 0.25, 'vocabulary': direct_vocab})

vec_train_X = vec.fit_transform(train_X[text])
vec_validation_X = vec.transform(validation_X[text])

clf = LinearSVC(**{'C': 0.5, 'loss': 'squared_hinge', 'tol': 0.01})
clf.fit(vec_train_X, train_y)

clf_preds = clf.predict(vec_validation_X)
np.mean(clf_preds == validation_y)

0.6590207914151576

In [35]:
def get_topic_preds_probs(sample, train_X, train_y):

    vec = CountVectorizer(**{'lowercase': False, 'vocabulary': direct_vocab})

    vec_train_X = vec.fit_transform(train_X)
    vec_sample_X = vec.transform(sample)
    
    priors = u.get_priors(vec_train_X, train_y, chi2)
    
    num_classes = len(set(train_y))
    
    lda = DirectedLDA(n_components=num_classes, max_iter=2,
        learning_method='online',
        learning_offset=50.,
        random_state=0,
        n_jobs=-1,
        ptws=priors)
    
    lda.fit(vec_train_X)
    
    topic_train_probs = lda.transform(vec_train_X)
    topic_probs = lda.transform(vec_sample_X)
    topic_preds = [priors[x][0] for x in np.argmax(topic_probs, axis=1)]
    
    return topic_train_probs, topic_probs, topic_preds

In [36]:
topic_train_probs, topic_probs, topic_preds = get_topic_preds_probs(validation_X[text], train_X[text], train_y)

np.mean(topic_preds == validation_y)

0.02749832327297116

In [37]:
topic_probs

array([[nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [40]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [86]:
vec = CountVectorizer(**{'lowercase': False, 'vocabulary': merged_vocabs})
#vec = CountVectorizer(**{'lowercase': False, 'max_df': 0.25})

vec_train_X = vec.fit_transform(train_X[text])
vec_sample_X = vec.transform(validation_X[text])

priors = u.get_priors(vec_train_X, train_y, chi2)

lda = DirectedLDA(n_components=len(set(train_y)), max_iter=2,
        learning_method='online',
        learning_offset=50.,
        random_state=0,
        n_jobs=-1,
        ptws=priors)
    
lda.fit(vec_train_X)

DirectedLDA(batch_size=128, doc_topic_prior=None, evaluate_every=-1,
            learning_decay=0.7, learning_method='online', learning_offset=50.0,
            max_doc_update_iter=100, max_iter=2, mean_change_tol=0.001,
            n_components=27, n_jobs=-1, perp_tol=0.1,
            ptws=[(1,
                   array([0.14162873, 0.19828022, 0.62316641, ..., 0.05665149, 1.71405569,
       0.08497724])),
                  (2,
                   array([0.09245343, 0.1294348 , 0.4...
       1.40894569e+02, 4.25850340e-01, 4.25850340e-02])),
                  (91,
                   array([0.41476998, 0.43445976, 1.82498789, ..., 0.16590799, 2.48861985,
       0.24886199])),
                  (92,
                   array([2.30132373e-01, 3.22185322e-01, 2.12761189e+02, ...,
       9.20529492e-02, 1.38079424e+00, 1.38079424e-01])),
                  (93,
                   array([0.45785132, 0.64099185, 2.01454581, ..., 0.18314053, 2.74710792,
       0.27471079]))],
            random_s

In [87]:
tf_feature_names = vec.get_feature_names()
print_top_words(lda, tf_feature_names, 25)

Topic #0: prosent krone øke enn milliard lav høy 000 penge skatt tall økonomi bank vekst inntekt million økonomisk betale fjor all rente stat neste redusere falle
Topic #1: kvinne mann samfunn mot muslim debatt all homofil lov muslimsk jøde demokrati rettighet forbud islam kultur ytringsfrihet politisk religiøs holdning frihet vold menneske menneskerettighet abort
Topic #2: sykehus ved lege pasient behandling sykdom enn kvinne medisin syk undersøkelse psykisk kropp medisinsk helse forsker alvorlig all risiko hos vise øke dø Ullevål forskning
Topic #3: mat spise hund dyr produkt fisk all enn bonde kilo kjøtt fiske katt laks frukt Mattilsynet sunn inneholde butikk forbruker Tine gård melk vann matvare
Topic #4: ansatt jobb bedrift jobbe offentlig LO lønn arbeid pensjon ordning leder privat stilling arbeidsliv NHO pensjonist Nav streik enn sektor arbeidsplass ledelse direktør virksomhet medarbeider
Topic #5: skole elev ved universitet fag videregående utdanning professor lære klasse all f

In [88]:
topic_train_probs = lda.transform(vec_train_X)
topic_probs = lda.transform(vec_sample_X)
topic_preds = [priors[x][0] for x in np.argmax(topic_probs, axis=1)]

In [89]:
np.mean(topic_preds == validation_y)

0.3761234071093226

In [74]:
vec = TfidfVectorizer(**{'lowercase': False, 'max_df': 0.25})

vec_train_X = vec.fit_transform(train_X[text])
vec_validation_X = vec.transform(validation_X[text])

scaler = MinMaxScaler()

scaled_topic_train_probs = scaler.fit_transform(topic_train_probs)
scaled_topic_probs = scaler.fit_transform(topic_probs)

In [75]:
merged_train_X = scipy.sparse.hstack([vec_train_X, scaled_topic_train_probs])
merged_validation_X = scipy.sparse.hstack([vec_validation_X, scaled_topic_probs])

In [77]:
clf = LinearSVC(**{'C': 0.5, 'loss': 'squared_hinge', 'tol': 0.01})
clf.fit(vec_train_X, train_y)

clf_preds = clf.predict(vec_validation_X)
np.mean(clf_preds == validation_y)

0.67981220657277