In [13]:
%load_ext Cython

In [1]:
# import services.theme_extractor_api.main
from services.theme_extractor.cluster_job import ClusterJob
from services.theme_extractor.wv_model_job import WVModelJob





In [2]:
wvm = WVModelJob()

al = wvm.get_latest_article_load()

model = wvm.get_model_from_disk(al.id)

cj = ClusterJob(model, al.id)

In [3]:
articles = cj.filter_articles()

In [4]:
from services.theme_extractor.clusterer import Clusterer


c = Clusterer(model, articles, al.id, from_scratch=True, min_cluster_size=3, cluster_selection_epsilon=0.1)


In [5]:
mapping = c.create_mapping()

In [6]:
import numpy as np
import en_core_web_sm

articles_in_class = np.array(articles)[:len(mapping)][mapping == 1]


titles = [art.title_words for art in articles_in_class]

vecs = list([model.docvecs[doc.id] for doc in articles_in_class])

In [7]:
import pandas as pd
def test_word_extraction_method(docs, method):

    keywords = method(docs);
    return keywords



In [14]:
%%cython

from typing import List
def generate_cngrams(words_list: List[str], n: int):
    ngrams_list = []
 
    for num in range(0, len(words_list) - (n - 1)):
        ngram = (words_list[num:num + n])
        ngrams_list.append(ngram)
 
    return ngrams_list

In [15]:
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity

import math
import itertools
import statistics as s

from collections import Counter


def extract_ngram_keywords(articles_in_class):
    n_grams = []

    for i, art in enumerate(articles_in_class):
        words =  art.title_words if len(articles_in_class) > 5 else art.words;
        # n_grams += [[word] for word in words]
        n_grams += generate_cngrams(words, 2)
        n_grams += generate_cngrams(words, 3)
        n_grams += generate_cngrams(words, 4)


    print('n_grams obtained - there are {}'.format(len(n_grams)))

    n_grams.sort()
    n_grams = list(n_grams for n_grams,_ in itertools.groupby(n_grams))

    print('dupes removed - there are now {}'.format(len(n_grams)))

    scores = {}
    docvecs = np.array(vecs)

    p_vecs = [model.infer_vector(n_gram, steps=10) for n_gram in n_grams]
    p_vecs_arr = np.array(p_vecs).reshape(len(p_vecs), 400)
    sims = np.mean(cosine_similarity(p_vecs_arr, docvecs), axis=1)

    scores = dict(enumerate(sims))
    
    def convert_ngram_to_string(ngram: List[str]):
        return " ".join(ngram).replace("_", " ")

    return [convert_ngram_to_string(n_grams[p[0]]) for p in Counter(scores).most_common(10)]
    


In [10]:
a = np.array([
    [1 ,2 ,3],
    [3, 2, 1]
])

b = np.array([
    [1 ,2 ,3],
    [3, 2, 1],
    [2, 4, 6]
])

sim = cosine_similarity(a,b)

np.mean(sim, axis = 1)

array([0.9047619 , 0.80952381])

In [11]:
def get_most_common_words(articles_in_class):

    words = sum([art.words for art in articles_in_class], [])

    return [w[0] for w in Counter(words).most_common(10)]

def get_most_common_title_words(articles_in_class):

    words = sum([art.title_words for art in articles_in_class], [])

    return [w[0] for w in Counter(words).most_common(10)]

In [12]:
from gensim.summarization import keywords

def textrank_vanilla_blob(articles_in_class):
    words = []
    for article in articles_in_class:
        words += article.words
        
        
    
    return keywords(" ".join(words), split=True)[:10]

def textrank_vanilla(articles_in_class):
    kwords = []
    for article in articles_in_class:
        kwords += keywords(" ".join(article.words), split=True)
    
    return [w[0] for w in Counter(kwords).most_common(10)]

In [16]:
from IPython.display import display, HTML



def get_performance_for_keyword_extraction(name, keyword_extraction_method):

    descs = [
        'Boxing News',
        'Home Office English Tests Scandal',
        'Littering in the UK',
        'US Water Shutoffs',
        'South Korea Election 2020'
    ]

    data = {}

    for i, theme_id in enumerate([1, 10, 20, 64, 95]):
        articles_in_class = np.array(articles)[:len(mapping)][mapping == theme_id]
        desc = descs[i]
        data[desc] = keyword_extraction_method(articles_in_class)

    
    df = pd.DataFrame(data)

    display(HTML(df.to_html()))


methods = [
    ('ngram method', extract_ngram_keywords),
    ('common words', get_most_common_words),
    ('common words (from title)', get_most_common_title_words),
    ('textrank + vote', textrank_vanilla),
    ('textrank over all articles', textrank_vanilla_blob)
]

for name, meth in methods:
    get_performance_for_keyword_extraction(name, meth);
        
        # print([art.title for art in articles_in_class][:10])

n_grams obtained - there are 807
dupes removed - there are now 757
n_grams obtained - there are 138
dupes removed - there are now 132
n_grams obtained - there are 3591
dupes removed - there are now 3484
n_grams obtained - there are 114
dupes removed - there are now 108
n_grams obtained - there are 2682
dupes removed - there are now 2394


Unnamed: 0,Boxing News,Home Office English Tests Scandal,Littering in the UK,US Water Shutoffs,South Korea Election 2020
0,Andy Ruiz Jr rematch,english language test,employ company,moratorium end,testing monitoring
1,WBC heavyweight title,student accuse cheat,defra spokeswoman say,water bill rise,monitoring social distancing strict quarantine
2,boxer Maxim Dadashev,MPs hold inquiry,represent more half council england wales,clean water shutoff,check temperature
3,Joshua Ruiz Jr rematch,english test cheat,fly tipping penalty increase,layoff trigger,medical worker
4,boxer Maxim,act immigration,resource waste,million Americans,majority national assembly
5,fight Nurmagomedov Diaz Mayweather,hold inquiry,harm pet wildlife,million US,worry so be sure
6,Logan Paul boxing,face legal action,waste clearance,water shutoff,fall mortality rate
7,fight Kubrat Pulev,english test,local government association say council,trigger pandemic,accord korea
8,rematch confirm,should replace,tough guideline issue,bill rise,monitoring social distancing strict
9,fight Nurmagomedov Diaz,english test cheat claim,effective punitive deterrent,shutoff layoff trigger,win seat assembly


Unnamed: 0,Boxing News,Home Office English Tests Scandal,Littering in the UK,US Water Shutoffs,South Korea Election 2020
0,fight,home_office,council,water,south_korea
1,say,student,fine,bill,party
2,joshua,test,issue,shutoff,say
3,will,say,litter,city,election
4,go,cheat,say,say,moon
5,fury,uk,more,detroit,outbreak
6,when,english,also,household,polling_station
7,would,people,fly_tipping,resident,hold
8,fighter,study,fly,people,vote
9,ruiz,government,increase,running_water,voter


Unnamed: 0,Boxing News,Home Office English Tests Scandal,Littering in the UK,US Water Shutoffs,South Korea Election 2020
0,Anthony_Joshua,test,council,water,South_Korea
1,Ruiz,Home_Office,litter,shutoff,rule
2,Andy,English,unpunishe,million,party
3,Jr,scandal,many,US,win_election
4,fight,cheat,England_Wales,Detroit,landslide
5,Tyson,student,tough_penalty,pandemic,coronavirus_outbreak
6,Fury,MPs,need,suspend,south_korean
7,rematch,UK,curb,face,voter
8,Eddie,face,surge,lose,expect
9,Hearn,rush,fly,water_supply,return


Unnamed: 0,Boxing News,Home Office English Tests Scandal,Littering in the UK,US Water Shutoffs,South Korea Election 2020
0,fight,student,government,people,moon
1,boxing,testing,average,pandemic,coronavirus
2,joshua,people,councillor,month,glove
3,time,visa,use,running_water,vote
4,want,uk_government,council,family,party
5,ruiz,year,year,department,south_korea
6,round,fraud,sentence,water,elect
7,box,accusation,need,more_people,hold
8,look,questionable,fine week litterer,utility,country
9,know,cheating,council issue,work,wear_mask


Unnamed: 0,Boxing News,Home Office English Tests Scandal,Littering in the UK,US Water Shutoffs,South Korea Election 2020
0,fight_back,home_office,council issue,water,voting
1,fighting,student,littering,city,south_korea rule party
2,fights,cheat,fine week litterer,people,voter
3,confirm fight world,government,last_year,many_people,vote about_people
4,ruiz_jr,uk_government,relate litter,least_people,elect
5,box right_now,organise cheating,fly_tipping,pandemic,coronavirus
6,time,last_year,year enforcement,detroiter result,polling_station
7,time_when,almost_year,increase,resident,national_assembly election
8,times,people,increase_amount,pay,country
9,timing,hundred_people,more_people,afford,other_country


In [135]:


a =  np.ones(20)
len(cartesian_product(a, a, a).tolist())

8000

In [31]:
from collections import Counter
from scipy import spatial
from gensim.models import Doc2Vec
import numpy as np
from typing import List
import os

from services.libs.data_model.processed_article import ProcessedArticle

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram

from umap import UMAP
from hdbscan import HDBSCAN

from  services.theme_extractor.logger import logger



from services.libs.data_model.theme import Theme
from services.libs.data_model.article import Article
        
def __get_class_words_from_doc_selection(docs_in_class: List[ProcessedArticle], vecs, model: Doc2Vec):
        doc_dict = {}

        for doc in docs_in_class:
            for word in doc.words:
                if word in doc_dict:
                    doc_dict[word] += 1
                else:
                    doc_dict[word] = 1

        d = Counter(doc_dict)

        top_words = d.most_common(1000)

        word_2_vec_ranking = {}

        for word in top_words:
            
            if(word[0] not in model.wv.vocab):
                continue;
            
            word_vec = model[word[0]]
            av_vec = np.average(vecs, axis=0)


            similarity = 1 - spatial.distance.cosine(word_vec, av_vec)
            word_2_vec_ranking[word[0]] = similarity

        rank_counter = Counter(word_2_vec_ranking)

        return [w[0] for w in rank_counter.most_common(200)]

In [6]:
doc_arr = np.array(articles)
doc_arr_trimmed = doc_arr[:len(t)]
docs_in_class = doc_arr_trimmed[t == 0]
vecs = list([model.docvecs[doc.id] for doc in docs_in_class])

NameError: name 'np' is not defined

In [86]:
title_words = sum([[w.lemma_ for w in nlp(doc.title) if w.pos_ in ['NOUN', 'VERB',  'PROPN']] for doc in docs_in_class], [])
top_word = Counter(title_words).most_common(1)[0][0]
top_word

'chess'

In [63]:
weighted = np.array([model.wv.get_vector(word) for word in theme_words])
counts = np.reshape(np.array([model.wv.vocab[word].count  for word in theme_words]), (200, 1))
print(np.array([(word, model.wv.vocab[word].count) for word in theme_words]))

[['levon_aronian' '86']
 ['nf3_nc6' '57']
 ['e4_e5' '61']
 ['wesley_so' '70']
 ['anish_giri' '55']
 ['sergey_karjakin' '105']
 ['teimour' '14']
 ['radjabov' '22']
 ['luke_mcshane' '58']
 ['vishy_anand' '94']
 ['wijk' '73']
 ['bc4' '43']
 ['aronian' '63']
 ['dubov' '41']
 ['mamedyarov' '67']
 ['bc5' '39']
 ['lagrave' '83']
 ['relate_chess' '133']
 ['ding_liren' '66']
 ['lagno' '10']
 ['bb5' '54']
 ['karpov' '48']
 ['d3' '59']
 ['fabiano_caruana' '193']
 ['grenke' '26']
 ['caruana' '321']
 ['shamkir' '33']
 ['nakamura' '99']
 ['firouzja' '89']
 ['garry_kasparov' '113']
 ['maxime_vachi' '69']
 ['nf6' '145']
 ['nf3' '155']
 ['e6' '122']
 ['nc3' '165']
 ['magnus_carlsen' '355']
 ['artemiev' '16']
 ['d4' '218']
 ['hikaru_nakamura' '91']
 ['bd4' '19']
 ['vlad_kramnik' '61']
 ['bb4' '47']
 ['gawain_jones' '83']
 ['c6' '76']
 ['d6' '106']
 ['d5' '192']
 ['shak' '45']
 ['e5' '141']
 ['exd5' '64']
 ['vachier' '14']
 ['nd5' '59']
 ['f4' '98']
 ['bobby_fischer' '79']
 ['e4' '224']
 ['f5' '80']
 ['d

In [53]:
weighted_counts = weighted * counts
print(weighted_counts.shape)

(200, 400)


In [78]:
[(w.lemma_, w.pos_) for w in nlp(" ".join(theme_words))]

[('levon_aronian', 'PROPN'),
 ('nf3_nc6', 'PROPN'),
 ('e4_e5', 'PROPN'),
 ('wesley_so', 'ADV'),
 ('anish_giri', 'ADV'),
 ('sergey_karjakin', 'PROPN'),
 ('teimour', 'NOUN'),
 ('radjabov', 'PROPN'),
 ('luke_mcshane', 'PROPN'),
 ('vishy_anand', 'PROPN'),
 ('wijk', 'PROPN'),
 ('bc4', 'PROPN'),
 ('aronian', 'ADJ'),
 ('dubov', 'NOUN'),
 ('mamedyarov', 'PROPN'),
 ('bc5', 'PROPN'),
 ('lagrave', 'NOUN'),
 ('relate_chess', 'PROPN'),
 ('ding_liren', 'PROPN'),
 ('lagno', 'NOUN'),
 ('bb5', 'PROPN'),
 ('karpov', 'PROPN'),
 ('d3', 'PROPN'),
 ('fabiano_caruana', 'PROPN'),
 ('grenke', 'PROPN'),
 ('caruana', 'PROPN'),
 ('shamkir', 'PROPN'),
 ('nakamura', 'PROPN'),
 ('firouzja', 'PROPN'),
 ('garry_kasparov', 'PROPN'),
 ('maxime_vachi', 'PROPN'),
 ('nf6', 'PROPN'),
 ('nf3', 'PROPN'),
 ('e6', 'PROPN'),
 ('nc3', 'PROPN'),
 ('magnus_carlsen', 'PROPN'),
 ('artemiev', 'PROPN'),
 ('d4', 'NOUN'),
 ('hikaru_nakamura', 'ADP'),
 ('bd4', 'PROPN'),
 ('vlad_kramnik', 'X'),
 ('bb4', 'X'),
 ('gawain_jone', 'NOUN'),
 ('c

In [6]:
import numpy as np
from collections import Counter


''
## votes
''
def av_run(votes: np.array):
    
    # Expect votes to be a 2D numpy array where each row is a ballot and each column represents a candidate. 
    # The value of the cell represents the preference order of that voter - lower = preference!

    n_ballots, n_candidates = votes.shape

    running=True

    n_round = 1

    losers = set()

    while(running):


        winners = []

        for ballot in votes:
            winners.append(np.where(ballot == np.amin(ballot))[0][0])

        vote_counts = Counter(winners).most_common()


        print(vote_counts)
        if vote_counts[0][1] > n_ballots / 2:
            print('Winner in round {}. The winner is {}'.format(n_round, vote_counts[0][0]))
            running = False
            return vote_counts
        else:
            
            loser = next(x[0] for x in reversed(vote_counts) if x[0] not in losers)
            votes[:, loser] = n_candidates + 1
            print('No winner in round {}. Loser was {}.'.format(n_round, loser))


        if n_round > n_candidates:
            print('Err! cancelling')        

    

In [160]:

from math import log

def get_doc_word_votes(doc_ids: List[str], words: list, model: Doc2Vec, coeff=1):
    
    votes = []
    

    for doc_id in doc_ids:
        vec = model.docvecs[doc_id];

        sims: list = []

        for other_word, cnt in words:
            other_vec =  model.wv.get_vector(other_word);
            sim = abs(model.wv.cosine_similarities(vec, [other_vec])) * (coeff * cnt)

            sims.append(sim) 
            

        
        sims_ordered = sorted(sims, reverse=True)
        sim_indexes = list([sims_ordered.index(sim) for sim in sims])
        votes.append(sim_indexes)
        

    return np.array(votes)



In [152]:
doc_arr = np.array(articles)
doc_arr_trimmed = doc_arr[:len(t)]
docs_in_class = doc_arr_trimmed[t == 3]
vecs = list([model.docvecs[doc.id] for doc in docs_in_class])

In [153]:
def get_words(model, doc_ids):
    words_union = []
    for doc_id in doc_ids:
        docvec = model.docvecs[doc_id]
        words_union += [wc[0] for wc in model.wv.similar_by_vector(docvec, topn=100, restrict_vocab=10000)]
    return [wc for wc in Counter(words_union)]

In [157]:
doc_ids = [doc.id for doc in docs_in_class]
print(len(docs_in_class))
words = get_words(model, doc_ids)
words_with_counts = [(word, model.wv.vocab[word].count) for word in words]
votes = get_doc_word_votes(doc_ids,words_with_counts, model, 0)

9


In [166]:
docs_in_class[0].words

['family',
 'struggle',
 'pay',
 'funeral',
 'face',
 'rise',
 'price',
 'country',
 'help',
 'council',
 'net',
 '£',
 'surplus',
 'cremation',
 'cemetery_burial',
 'observer',
 'learn',
 'surplus',
 'expect',
 'rise',
 'significantly',
 'year',
 'result',
 'death_toll',
 'figure',
 'more',
 'authority',
 'england',
 'scotland_wales',
 'obtain_freedom',
 'information_request',
 'see',
 'observer',
 'show',
 'average',
 'rise',
 'fee',
 'year',
 '£',
 'rate_inflation',
 'big',
 'increase',
 'impose',
 'trafford',
 'council',
 'hike',
 '£',
 '£',
 '£',
 'birmingham_city',
 'council',
 'uk',
 'big',
 'local_authority',
 'make',
 'large',
 'surplus',
 'cremation_burial',
 'total_£',
 'm',
 'charge',
 '£',
 'cremation',
 'extra',
 '£',
 'funeral',
 'overrun',
 'time',
 'worthe',
 'most_expensive',
 'council',
 'provide',
 'datum',
 'cremation',
 'charge',
 '£',
 'increase',
 '£',
 'low',
 'charge',
 '£',
 'south_west',
 'middlesex',
 'council',
 'say',
 'surplus',
 'help',
 'recoup_cost',


In [158]:
winner = av_run(votes)

[(0, 9)]
Winner in round 1. The winner is 0


In [142]:
for ballot in winner:
    print(words[ballot[0]]) 
    

average_price


In [143]:
doc_arr = np.array(articles)
doc_arr_trimmed = doc_arr[:len(t)]
docs_in_class = doc_arr_trimmed[t == 2]

words = []
for doc in docs_in_class:
    words += list(np.unique(doc.words))
top_words = Counter(words).most_common()


In [144]:
doc_ids

['society/2020/jul/25/grieving-families-pushed-into-debt-as-costs-soar-for-burials-and-cremations',
 'business/2020/may/11/mourners-choosing-simplified-funerals-during-covid-19-crisis',
 'business/2020/apr/24/funeral-homes-push-for-state-help-as-lockdown-leads-to-no-frills-services',
 'business/2020/mar/11/dignity-delays-low-cost-funeral-plan-until-after-competition-report',
 'society/2020/jan/26/church-of-england-could-seek-end-paupers-funerals',
 'society/2020/jan/06/cost-of-dying-at-record-high-as-price-of-uk-funeral-exceeds-4400',
 'australia-news/2019/aug/01/funeral-homes-investigation-reveals-high-prices-and-unexplained-charges',
 'business/2019/may/13/funeral-provider-dignity-warns-fall-in-number-of-deaths-will-hit-profits',
 'business/2019/mar/28/competition-watchdog-to-investigate-funeral-sector-as-prices-escalate-cma']

In [170]:
model.wv.similarity('magnus_carlsen', 'chess')

0.6347216

In [2]:
from services.theme_extractor.article_preprocess_job import ArticlePreprocessJob

apj = ArticlePreprocessJob()

raw_articles = apj.get_articles_for_latest_load()[:10000]

processed_articles = apj.preprocess_raw_articles(raw_articles)

In [23]:
from gensim.models.phrases import Phrases
from typing import List
import numpy as np

tokenized_texts = apj.preprocessor.preprocessed_docs


def get_ngrams(tokenized_texts: List[List[str]], phrases: Phrases):
    return np.unique([token for token in sum([phrases[phrases[tokens]] for tokens in tokenized_texts ], []) if '_' in token])

def comp_phrasers():
    
    phrases1 = Phrases(tokenized_texts)

    phrases2 = Phrases(tokenized_texts, scoring='npmi', threshold=0.5, min_count=50)
        
    tokens1 = get_ngrams(tokenized_texts, phrases1)
    tokens2 = get_ngrams(tokenized_texts, phrases2)

    
    in1butnot2 = np.setdiff1d(tokens1, tokens2, assume_unique=True)
    in2butnot1 = np.setdiff1d(tokens2, tokens1, assume_unique=True)

    return tokens1, tokens2, in1butnot2, in2butnot1

tokens1, tokens2, unique_to_1, unique_to_2 = comp_phrasers()

In [24]:
print(len(unique_to_1))
for token in tokens2:
    print(token)

2030
anti_-
as_well
chief_executive
climate_change
comic_strip
donald_trump
executive_order
human_right
last_week
last_year
majority_country
muslim_majority
new_york
non_-
prime_minister
supreme_court
tell_guardian
theresa_may
travel_ban
united_states
white_house
£_m


In [40]:
import en_core_web_sm
nlp = en_core_web_sm.load(disable=['ner', 'parser'])
from bs4 import BeautifulSoup

def extract_text_from_html(res: str) -> str:
    soup = BeautifulSoup(res, features="lxml")
    
    for f in soup.find_all('figure'):
        f.decompose()
    
    text = soup.get_text().lower();
    
    return text

text = extract_text_from_html(raw_articles[0].body)

In [42]:
allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']
vals = [token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags and len(token.lemma_) > 1]

['scotland',
 'yard',
 'investigate',
 'claim',
 'worker',
 'outsource',
 'firm',
 'caput',
 'pay',
 'convict',
 'deliberately',
 'fit',
 'electronic',
 'ankle',
 'tag',
 'loosely',
 'allow',
 'slip',
 'device',
 'when',
 'want',
 'go',
 'staff',
 'company',
 'run',
 'government',
 'electronic',
 'monitoring',
 'service',
 'allegedly',
 'pay',
 'time',
 'help',
 'at',
 'least',
 'offender',
 'beat',
 'court',
 'impose',
 'curfew',
 'accord',
 'report',
 'sun',
 'metropolitan',
 'police',
 'say',
 'investigation',
 'centre',
 'london',
 'borough',
 'newham',
 'say',
 'people',
 'include',
 'current',
 'former',
 'ems',
 'worker',
 'arrest',
 'connection',
 'offence',
 'involve',
 'monitoring',
 'offender',
 'accord',
 'sun',
 'scheme',
 'reveal',
 'offender',
 'arrest',
 'suspicion',
 'attempt',
 'murder',
 'suppose',
 'home',
 'curfew',
 'electronic',
 'tag',
 'use',
 'monitor',
 'condition',
 'court',
 'prison',
 'order',
 'usually',
 'securely',
 'attach',
 'ankle',
 'defender',
 'ca

In [7]:
from gensim.models.phrases import Phrases

p = Phrases([['hello', 'world']])

In [9]:
p[['hello', 'world', 'caramel']]

['hello', 'world', 'caramel']