In [1]:
# import services.theme_extractor_api.main
from services.theme_extractor.cluster_job import ClusterJob
from services.theme_extractor.wv_model_job import WVModelJob





In [None]:
# Bootstrap the objects the later cells rely on (`model`, `al`, `cj`).
# NOTE(review): this cell carries no execution count in the saved notebook even
# though the cells below use its variables; run it first after a kernel restart.
wvm = WVModelJob()

# The article load's id is passed to both the model lookup and the cluster job.
al = wvm.get_latest_article_load()

model = wvm.get_model_from_disk(al.id)

cj = ClusterJob(model, al.id)

In [2]:
# Articles kept by the cluster job's filter; passed to Clusterer and indexed by cluster label below.
articles = cj.filter_articles()

In [3]:
from services.theme_extractor.clusterer import Clusterer

# Build a clusterer over the filtered articles, keyed by the same article-load id.
# NOTE(review): min_cluster_size / cluster_selection_epsilon look like HDBSCAN-style
# settings; confirm their exact meaning against Clusterer.
c = Clusterer(model, articles, al.id, from_scratch=True, min_cluster_size=3, cluster_selection_epsilon=0.1)


In [4]:
# `t` is used below as a per-article label array (masked with `t == 3` and `t == 2`);
# `m` is not referenced again in the visible cells.
m, t = c.create_themes_and_mapping()

In [5]:
from collections import Counter
from scipy import spatial
from gensim.models import Doc2Vec
import numpy as np
from typing import List
import os

from services.libs.data_model.processed_article import ProcessedArticle

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

from umap import UMAP
from hdbscan import HDBSCAN

from  services.theme_extractor.logger import logger



from services.libs.data_model.theme import Theme
from services.libs.data_model.article import Article
        
def __get_class_words_from_doc_selection(docs_in_class: List[ProcessedArticle], vecs, model: Doc2Vec, top_n: int = 1000):
    """Rank the words of a document cluster by similarity to the cluster centroid.

    The ``top_n`` most frequent words across ``docs_in_class`` are shortlisted,
    words missing from the model's word vocabulary are dropped, and the rest are
    scored by cosine similarity to the mean of ``vecs``.

    Args:
        docs_in_class: Documents of one cluster; each exposes an iterable ``words``.
        vecs: Vectors of those documents; averaged along axis 0 to get the centroid.
        model: Model exposing ``wv.vocab`` and word-vector lookup through ``wv[word]``.
        top_n: Cap applied to both the frequency shortlist and the returned list.
            Defaults to 1000, the value that used to be hard-coded.

    Returns:
        Words ordered from most to least similar to the centroid. Empty when no
        shortlisted word is in the vocabulary.
    """
    word_counts = Counter(word for doc in docs_in_class for word in doc.words)

    shortlist = [
        word
        for word, _count in word_counts.most_common(top_n)
        if word in model.wv.vocab
    ]
    if not shortlist:
        # Nothing to score; also avoids averaging a possibly empty ``vecs``.
        return []

    # The centroid does not depend on the word, so compute it once instead of
    # once per shortlisted word.
    centroid = np.average(vecs, axis=0)

    # Read word vectors through ``wv`` explicitly so the lookup can never fall
    # through to a document tag of the same name.
    similarity_by_word = {
        word: 1 - spatial.distance.cosine(model.wv[word], centroid)
        for word in shortlist
    }

    return [word for word, _similarity in Counter(similarity_by_word).most_common(top_n)]

In [6]:
import numpy as np
from collections import Counter


''
## votes
''
def av_run(votes: np.ndarray):
    """Run an alternative-vote (instant-runoff) count over ranked ballots.

    Each round, every ballot counts for its best-ranked candidate that is still
    in the race. If one candidate holds more than half of the ballots the count
    stops; otherwise the candidate at the bottom of that round's tally is
    eliminated and the ballots are counted again.

    Args:
        votes: 2D array, one row per ballot and one column per candidate. A cell
            holds that voter's rank for the candidate; lower means preferred.
            The array is not modified.

    Returns:
        The deciding round's tally as a list of ``(candidate_index, vote_count)``
        pairs, highest count first, so the winner is ``result[0][0]``.

    Raises:
        ValueError: If ``votes`` is not 2D or has no ballots or no candidates.
    """
    ballots = np.asarray(votes)
    if ballots.ndim != 2 or ballots.shape[0] == 0 or ballots.shape[1] == 0:
        raise ValueError('votes must be a non-empty 2D array of shape (n_ballots, n_candidates)')

    n_ballots, n_candidates = ballots.shape

    # Track eliminations in a mask instead of overwriting columns with a sentinel
    # rank: the caller's array stays intact (the cell can be re-run) and the
    # result no longer depends on ballot values being below the sentinel.
    still_running = np.ones(n_candidates, dtype=bool)

    # Each round without a winner removes one candidate, so n_candidates rounds
    # are always enough.
    for n_round in range(1, n_candidates + 1):
        contenders = np.flatnonzero(still_running)

        # argmin returns the first minimum, i.e. the lowest-index contender on ties.
        first_choices = contenders[np.argmin(ballots[:, contenders], axis=1)]
        vote_counts = Counter(first_choices.tolist()).most_common()

        print(vote_counts)
        if vote_counts[0][1] > n_ballots / 2:
            print('Winner in round {}. The winner is {}'.format(n_round, vote_counts[0][0]))
            return vote_counts

        loser = vote_counts[-1][0]
        still_running[loser] = False
        print('No winner in round {}. Loser was {}.'.format(n_round, loser))

    # Defensive: with at least one ballot the last remaining candidate always
    # holds a majority, so the loop above returns before getting here.
    raise RuntimeError('No winner found after {} rounds'.format(n_candidates))

    

In [160]:

from math import log

def get_doc_word_votes(doc_ids: List[str], words: list, model: Doc2Vec, coeff=1):
    """Turn document/word similarities into ranked ballots, one ballot per document.

    For every document, each candidate word is scored as
    ``abs(cosine(doc_vector, word_vector)) * coeff * count`` and then ranked:
    rank 0 is the best score, and words with equal scores share the same rank.

    Args:
        doc_ids: Document tags looked up in ``model.docvecs``.
        words: ``(word, count)`` pairs; ``word`` is looked up with
            ``model.wv.get_vector`` and ``count`` weights its similarity.
        model: Model exposing ``docvecs``, ``wv.get_vector`` and
            ``wv.cosine_similarities``.
        coeff: Multiplier applied to every count. Note that ``0`` zeroes every
            score, which makes all words tie at rank 0.

    Returns:
        Array of shape ``(len(doc_ids), len(words))`` holding each word's rank
        per document (the ballot layout ``av_run`` consumes).
    """
    words = list(words)
    if not words:
        # One empty ballot per document, as before.
        return np.array([[] for _ in doc_ids])

    # Word vectors and weights are the same for every document: fetch them once.
    word_vecs = np.array([model.wv.get_vector(word) for word, _cnt in words])
    weights = coeff * np.array([cnt for _word, cnt in words], dtype=float)
    n_words = len(words)

    votes = []
    for doc_id in doc_ids:
        doc_vec = model.docvecs[doc_id]

        # One batched similarity call per document instead of one per word.
        scores = np.abs(model.wv.cosine_similarities(doc_vec, word_vecs)) * weights

        # Rank = number of strictly better scores, so ties share a rank.
        # Sort + binary search replaces the quadratic list.index lookup.
        ascending = np.sort(scores)
        ranks = n_words - np.searchsorted(ascending, scores, side='right')
        votes.append(ranks.tolist())

    return np.array(votes)



In [152]:
# Pick the articles whose label in `t` is 3 and collect their document vectors.
doc_arr = np.array(articles)
# Trim to len(t) so the boolean mask below has the same length as the array it indexes.
doc_arr_trimmed = doc_arr[:len(t)]
# assumes `t` is a numpy array so that `t == 3` yields a boolean mask (TODO confirm)
docs_in_class = doc_arr_trimmed[t == 3]
vecs = list([model.docvecs[doc.id] for doc in docs_in_class])

In [153]:
def get_words(model, doc_ids):
    """Collect the candidate words closest to any of the given documents.

    For each document tag, the 100 words nearest to its document vector are
    requested from the model (search restricted to the first 10000 vocabulary
    entries). The results are merged without duplicates.

    Args:
        model: Model exposing ``docvecs`` and ``wv.similar_by_vector``.
        doc_ids: Document tags to look up in ``model.docvecs``.

    Returns:
        List of distinct words in the order they were first encountered.
    """
    first_seen = {}
    for doc_id in doc_ids:
        neighbours = model.wv.similar_by_vector(
            model.docvecs[doc_id], topn=100, restrict_vocab=10000
        )
        for hit in neighbours:
            # Dict keys keep insertion order, which gives ordered de-duplication.
            first_seen.setdefault(hit[0], None)
    return list(first_seen)

In [157]:
# Build one ranked ballot per cluster document over the candidate words.
doc_ids = [doc.id for doc in docs_in_class]
print(len(docs_in_class))
words = get_words(model, doc_ids)
# Pair each candidate word with its vocabulary count from the model.
words_with_counts = [(word, model.wv.vocab[word].count) for word in words]
# NOTE(review): the last argument is `coeff`. get_doc_word_votes multiplies every
# similarity by coeff * count, so 0 makes every score 0 and all words tie at
# rank 0 on every ballot. Confirm this is intended.
votes = get_doc_word_votes(doc_ids,words_with_counts, model, 0)

9


In [166]:
# Inspect the token list of the first document in the selected cluster.
docs_in_class[0].words

['family',
 'struggle',
 'pay',
 'funeral',
 'face',
 'rise',
 'price',
 'country',
 'help',
 'council',
 'net',
 '£',
 'surplus',
 'cremation',
 'cemetery_burial',
 'observer',
 'learn',
 'surplus',
 'expect',
 'rise',
 'significantly',
 'year',
 'result',
 'death_toll',
 'figure',
 'more',
 'authority',
 'england',
 'scotland_wales',
 'obtain_freedom',
 'information_request',
 'see',
 'observer',
 'show',
 'average',
 'rise',
 'fee',
 'year',
 '£',
 'rate_inflation',
 'big',
 'increase',
 'impose',
 'trafford',
 'council',
 'hike',
 '£',
 '£',
 '£',
 'birmingham_city',
 'council',
 'uk',
 'big',
 'local_authority',
 'make',
 'large',
 'surplus',
 'cremation_burial',
 'total_£',
 'm',
 'charge',
 '£',
 'cremation',
 'extra',
 '£',
 'funeral',
 'overrun',
 'time',
 'worthe',
 'most_expensive',
 'council',
 'provide',
 'datum',
 'cremation',
 'charge',
 '£',
 'increase',
 '£',
 'low',
 'charge',
 '£',
 'south_west',
 'middlesex',
 'council',
 'say',
 'surplus',
 'help',
 'recoup_cost',


In [158]:
# av_run returns the deciding round's tally, a list of (candidate index, vote count)
# pairs, so `winner` holds the whole tally rather than a single index.
winner = av_run(votes)

[(0, 9)]
Winner in round 1. The winner is 0


In [142]:
# Each entry of `winner` is a (word index, vote count) pair from the final tally,
# so this prints the word for every candidate in that tally, highest count first.
for ballot in winner:
    print(words[ballot[0]]) 
    

average_price


In [143]:
# Same selection as the label-3 cell, this time for label 2.
# NOTE(review): this rebinds `docs_in_class` and `words`, which other cells also
# read, so results depend on the order the cells are executed in.
doc_arr = np.array(articles)
doc_arr_trimmed = doc_arr[:len(t)]
docs_in_class = doc_arr_trimmed[t == 2]

words = []
for doc in docs_in_class:
    # np.unique keeps one copy of each word per document, so the Counter below
    # counts in how many documents a word occurs.
    words += list(np.unique(doc.words))
top_words = Counter(words).most_common()


In [144]:
# Display the document ids currently bound to `doc_ids`.
doc_ids

['society/2020/jul/25/grieving-families-pushed-into-debt-as-costs-soar-for-burials-and-cremations',
 'business/2020/may/11/mourners-choosing-simplified-funerals-during-covid-19-crisis',
 'business/2020/apr/24/funeral-homes-push-for-state-help-as-lockdown-leads-to-no-frills-services',
 'business/2020/mar/11/dignity-delays-low-cost-funeral-plan-until-after-competition-report',
 'society/2020/jan/26/church-of-england-could-seek-end-paupers-funerals',
 'society/2020/jan/06/cost-of-dying-at-record-high-as-price-of-uk-funeral-exceeds-4400',
 'australia-news/2019/aug/01/funeral-homes-investigation-reveals-high-prices-and-unexplained-charges',
 'business/2019/may/13/funeral-provider-dignity-warns-fall-in-number-of-deaths-will-hit-profits',
 'business/2019/mar/28/competition-watchdog-to-investigate-funeral-sector-as-prices-escalate-cma']

In [170]:
# Spot check: similarity the model assigns to a phrase token and a related word.
model.wv.similarity('magnus_carlsen', 'chess')

0.6347216

In [2]:
from services.theme_extractor.article_preprocess_job import ArticlePreprocessJob

apj = ArticlePreprocessJob()

# Work on at most the first 10000 articles returned for the latest load.
raw_articles = apj.get_articles_for_latest_load()[:10000]

# NOTE(review): a later cell reads apj.preprocessor.preprocessed_docs, presumably
# filled in by this call; confirm against ArticlePreprocessJob.
processed_articles = apj.preprocess_raw_articles(raw_articles)

In [23]:
from gensim.models.phrases import Phrases
from typing import List
import numpy as np

# Notebook-level global; comp_phrasers() reads it directly instead of taking it as an argument.
tokenized_texts = apj.preprocessor.preprocessed_docs


def get_ngrams(tokenized_texts: List[List[str]], phrases: Phrases):
    """Return the distinct phrase tokens a phraser produces over a corpus.

    The phraser is applied twice to every token list, and every resulting token
    that contains an underscore is kept.

    Args:
        tokenized_texts: Corpus as a list of token lists.
        phrases: Object whose ``phrases[tokens]`` returns the re-tokenised text.

    Returns:
        Sorted numpy array of the unique tokens containing ``'_'``.
    """
    # A flat comprehension replaces ``sum(list_of_lists, [])``, which copies the
    # accumulated list on every addition and is quadratic in corpus size.
    ngrams = [
        token
        for tokens in tokenized_texts
        for token in phrases[phrases[tokens]]
        if '_' in token
    ]
    return np.unique(ngrams)

def comp_phrasers():
    """Compare the phrase tokens found by two differently configured phrasers.

    Both phrasers are trained on the notebook-level ``tokenized_texts``: one with
    the library defaults, one with NPMI scoring (threshold 0.5, min_count 50).

    Returns:
        Tuple ``(default_ngrams, npmi_ngrams, only_default, only_npmi)`` where the
        last two are the set differences in each direction.
    """
    default_phraser = Phrases(tokenized_texts)
    npmi_phraser = Phrases(tokenized_texts, scoring='npmi', threshold=0.5, min_count=50)

    default_ngrams, npmi_ngrams = [
        get_ngrams(tokenized_texts, phraser)
        for phraser in (default_phraser, npmi_phraser)
    ]

    # get_ngrams already returns unique values, hence assume_unique=True.
    only_default = np.setdiff1d(default_ngrams, npmi_ngrams, assume_unique=True)
    only_npmi = np.setdiff1d(npmi_ngrams, default_ngrams, assume_unique=True)

    return default_ngrams, npmi_ngrams, only_default, only_npmi

# tokens1 / tokens2: phrase tokens from the default and the NPMI phraser;
# unique_to_1 / unique_to_2: tokens found by only one of the two.
tokens1, tokens2, unique_to_1, unique_to_2 = comp_phrasers()

In [24]:
# Number of phrase tokens found only by the default phraser, followed by every
# phrase token the NPMI phraser produced.
print(len(unique_to_1))
for token in tokens2:
    print(token)

2030
anti_-
as_well
chief_executive
climate_change
comic_strip
donald_trump
executive_order
human_right
last_week
last_year
majority_country
muslim_majority
new_york
non_-
prime_minister
supreme_court
tell_guardian
theresa_may
travel_ban
united_states
white_house
£_m


In [40]:
import en_core_web_sm
# NER and parser are disabled at load time; the lemma cell below only reads
# `lemma_` and `pos_` from each token.
nlp = en_core_web_sm.load(disable=['ner', 'parser'])
from bs4 import BeautifulSoup

def extract_text_from_html(res: str) -> str:
    """Return the lower-cased text of an HTML string, ignoring ``<figure>`` blocks.

    Args:
        res: HTML markup, parsed with the ``lxml`` parser.

    Returns:
        The document text in lower case, with every ``<figure>`` element removed
        before the text is extracted.
    """
    document = BeautifulSoup(res, features="lxml")

    # Remove figures from the tree so their contents do not end up in the text.
    figures = document.find_all('figure')
    for figure in figures:
        figure.decompose()

    return document.get_text().lower()

# Plain lower-cased text of the first raw article's HTML body.
text = extract_text_from_html(raw_articles[0].body)

In [42]:
# Keep lemmas of tokens whose part-of-speech tag is in the list below and whose
# lemma is longer than one character.
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']
vals = [token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags and len(token.lemma_) > 1]

['scotland',
 'yard',
 'investigate',
 'claim',
 'worker',
 'outsource',
 'firm',
 'caput',
 'pay',
 'convict',
 'deliberately',
 'fit',
 'electronic',
 'ankle',
 'tag',
 'loosely',
 'allow',
 'slip',
 'device',
 'when',
 'want',
 'go',
 'staff',
 'company',
 'run',
 'government',
 'electronic',
 'monitoring',
 'service',
 'allegedly',
 'pay',
 'time',
 'help',
 'at',
 'least',
 'offender',
 'beat',
 'court',
 'impose',
 'curfew',
 'accord',
 'report',
 'sun',
 'metropolitan',
 'police',
 'say',
 'investigation',
 'centre',
 'london',
 'borough',
 'newham',
 'say',
 'people',
 'include',
 'current',
 'former',
 'ems',
 'worker',
 'arrest',
 'connection',
 'offence',
 'involve',
 'monitoring',
 'offender',
 'accord',
 'sun',
 'scheme',
 'reveal',
 'offender',
 'arrest',
 'suspicion',
 'attempt',
 'murder',
 'suppose',
 'home',
 'curfew',
 'electronic',
 'tag',
 'use',
 'monitor',
 'condition',
 'court',
 'prison',
 'order',
 'usually',
 'securely',
 'attach',
 'ankle',
 'defender',
 'ca