In [1]:
import path
with path.Path('..'):
    from services.theme_extractor.preprocessing import ArticlePreprocessJob, ArticlePreprocessor 

    from services.theme_extractor.wv_model import  WVModelBuilder

    from services.theme_extractor.clustering import Clusterer

    from services.theme_extractor.keyword_extraction import KeywordExtractor

    from services.libs.data_model import ProcessedArticle, Theme

In [2]:

# Job object; the cells below use it to look up an article load and to fetch that load's articles.
apj = ArticlePreprocessJob()

# Id of the load returned by get_latest_article_load(); printed so the
# notebook output records which load the analysis was run against.
load_id = apj.get_latest_article_load().id
print(load_id)

dfd2c33b-bdb8-456d-bdff-b1e1ab014309


In [35]:
# Fetch the articles belonging to `load_id`, passing max_articles=10000
# (the recorded output shows that 10000 articles were returned).
# `articles` is read as a notebook-level global by later cells.
articles = apj.get_articles_for_load(load_id, max_articles=10000)
len(articles)

10000

In [21]:
from typing import List
from gensim.models import Doc2Vec
import numpy as np

class APRun:
    """Configuration and (later) results of one preprocessing/clustering run.

    The constructor only stores the configuration (``name``, ``steps``,
    ``postags``). The result attributes declared below are not set here;
    ``calculate_runs`` assigns them after running the pipeline.

    Args:
        name: Identifier of the run; ``run_ap_options`` also uses it as the
            load id passed to the pipeline components.
        steps: Preprocessing steps handed to ``ArticlePreprocessor(steps=...)``.
            ``None`` selects ``DEFAULT_STEPS``; an explicit empty list is
            respected and means "no steps".
        postags: Values handed to ``ArticlePreprocessor(allowed_postags=...)``.
            ``None`` selects ``DEFAULT_POSTAGS``.
    """

    # Annotations are written as strings so that they are not evaluated when
    # the class body executes (the class can be defined independently of the
    # project / gensim imports).
    processed_articles: "List[ProcessedArticle]"

    model: "Doc2Vec"

    labels: "np.ndarray"

    clusters: "List[Theme]"

    name: str

    # Immutable defaults: every instance receives its own list copy, so
    # mutating one run's configuration cannot leak into other runs.
    DEFAULT_STEPS = ('lemmatize', 'postag', 'phrasing')
    DEFAULT_POSTAGS = ('NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN')

    def __init__(self, name, steps=None, postags=None):
        self.name = name
        # `is None` (not truthiness) so that steps=[] stays an empty list.
        self.steps = list(self.DEFAULT_STEPS) if steps is None else list(steps)
        self.postags = list(self.DEFAULT_POSTAGS) if postags is None else list(postags)

    

In [78]:
def run_ap_options(run_options: APRun, source_articles=None,
                   min_cluster_size=3, cluster_selection_epsilon=0.1):
    """Run the preprocessing -> model -> clustering -> keyword pipeline for one run.

    Args:
        run_options: Run configuration; its ``steps`` and ``postags`` configure
            the ``ArticlePreprocessor`` and its ``name`` is passed wherever the
            pipeline components take a load id.
        source_articles: Articles to process. When ``None`` (the default) the
            notebook-level global ``articles`` is used, as before.
        min_cluster_size: Forwarded to ``Clusterer.create_mapping``.
        cluster_selection_epsilon: Forwarded to ``Clusterer.create_mapping``.

    Returns:
        Tuple ``(processed_articles, model, labels, clusters)``.
    """
    if source_articles is None:
        # Backward-compatible fallback to the global defined in an earlier cell.
        source_articles = articles

    # The run name doubles as the load id so each configuration is processed
    # under its own identifier (this local is unrelated to the global `load_id`).
    load_id = run_options.name

    preprocessor = ArticlePreprocessor(steps=run_options.steps,
                                       allowed_postags=run_options.postags)
    processed_articles = preprocessor.preprocess_articles(source_articles, load_id)

    model = WVModelBuilder().build_wv_model(processed_articles)

    labels = Clusterer(model, processed_articles, load_id).create_mapping(
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
    )

    clusters = KeywordExtractor(model).create_themes(load_id, processed_articles, labels)

    return processed_articles, model, labels, clusters

def calculate_runs():
    """Build the five run configurations, execute each one and return them.

    'run_with_all' uses the ``APRun`` defaults; every other configuration
    overrides ``steps``. The four values returned by ``run_ap_options`` are
    stored on the corresponding ``APRun`` instance.
    """
    experiments = [
        APRun('run_with_all'),
        APRun('run_with_none', steps=[]),
        APRun('run_with_no_phrasing', steps=['postag', 'lemmatize']),
        APRun('run_with_no_lemmatize', steps=['postag', 'phrasing']),
        APRun('run_with_no_postag', steps=['lemmatize', 'phrasing']),
    ]

    for experiment in experiments:
        # Unpack the pipeline results straight onto the run object.
        (experiment.processed_articles,
         experiment.model,
         experiment.labels,
         experiment.clusters) = run_ap_options(experiment)

    return experiments
    
# Execute every configuration; `ap_runs` is read as a global by the analysis cells below.
# The recorded output of this cell is log text printed during the call.
ap_runs = calculate_runs()

UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
     learning_rate=1.0, local_connectivity=1.0, metric='cosine',
     metric_kwds=None, min_dist=0.1, n_components=100, n_epochs=None,
     n_neighbors=15, negative_sample_rate=5, random_state=666,
     repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
     target_metric='categorical', target_metric_kwds=None,
     target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
     transform_seed=42, verbose=True)
Construct fuzzy simplicial set
Sat Sep 19 19:11:32 2020 Finding Nearest Neighbors
Sat Sep 19 19:11:32 2020 Building RP forest with 10 trees
Sat Sep 19 19:11:33 2020 NN descent for 13 iterations
	 0  /  13
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
Sat Sep 19 19:11:39 2020 Finished Nearest Neighbor Search
Sat Sep 19 19:11:40 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  500 epochs
	c

In [74]:
from collections import Counter

# Number of entries per label in the first run ('run_with_all').
counts = Counter(ap_runs[0].labels)

# Spot-check a few clusters of the first run.
# NOTE(review): `i` is a position in the `clusters` list, not a cluster id; in
# the recorded output position i holds the cluster with id i - 1.
for i in [1, 14, 21, 500, 600]:
    cluster = ap_runs[0].clusters[i]
    print(cluster.id)
    print(cluster.name)
    print(counts[cluster.id])
    print(cluster.theme_words)

0
off pension
21
['fall flat pension row', 'showdown Macron pension', 'Macron pension', 'France brace gilet', 'magic end standoff pension', 'Paris pension row deepen', 'standoff pension', 'cut power pension', 'presidential pension']
13
Vaporfly shoe
5
['revolutionise marathon', 'Nike Vaporfly shoe', 'Nike shoe', 'Nike Vaporflys', 'Nike shoe revolutionise', 'Nike Vaporfly', 'shoe regulation', 'Nike Vaporflys escape ban', 'Vaporflys escape ban running']
20
Cain emotionally
5
['callous insensitive pressure female', 'insensitive pressure female', 'stage protest', 'sorry callous insensitive pressure', 'physical abuse Oregon', 'callous insensitive pressure', 'pressure female', 'insensitive pressure', 'Salazar say Mo']
499
define extinction rebellion week long
3
['xr autumn', 'xr autumn uprise', 'section order xr', 'say xr', 'met embroil grow controversy', 'say xr autumn', 'metropolitan police sadiq extremely concerned', 'xr must now cease protest', 'bring behalf xr']
599
brand Mulberry
9
['D

In [50]:
def jaccard(set1: np.ndarray, set2: np.ndarray) -> float:
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Duplicates are ignored because ``np.intersect1d`` and ``np.union1d`` both
    operate on the unique values of their inputs.

    Args:
        set1: Array-like of comparable values.
        set2: Array-like of comparable values.

    Returns:
        A float in [0, 1]. When both inputs are empty the union is empty and
        0.0 is returned instead of raising ``ZeroDivisionError``.
    """
    union_size = len(np.union1d(set1, set2))
    if union_size == 0:
        return 0.0
    return len(np.intersect1d(set1, set2)) / union_size

In [61]:
class ThemeTarget:
    """A cluster label paired with a human-readable name for it."""

    def __init__(self, label: int, name: str):
        # Store both constructor arguments unchanged.
        self.label, self.name = label, name

In [87]:
import pandas as pd
from IPython.display import display, HTML

# Clusters of interest as (label, display name). The comparison cell further
# down selects articles by matching each label against ap_runs[0].labels.
# NOTE(review): the display names do not match the keyword tables recorded in
# this notebook's output (e.g. label 0 shows 'Sacoolas immunity' keywords,
# label 13 shows 'Bury FC' keywords) - the labels look stale from an earlier
# run; re-verify them after re-running the pipeline.
targets: List[ThemeTarget] = [
    ThemeTarget(0, 'Grenfell'),
    ThemeTarget(13, 'Coronavirus students'),
    ThemeTarget(20, 'Shoes'),
    ThemeTarget(399, 'Suleimani'),
    ThemeTarget(599, 'Labour Manifesto'),
]



In [130]:
def _pick_matching_label(label_counts):
    """Return the first label in ``label_counts`` that is not -1, or None.

    ``label_counts`` is a list of ``(label, count)`` pairs as produced by
    ``Counter.most_common``; an empty list yields None.
    """
    for label, _count in label_counts:
        if label != -1:
            return label
    return None


def get_jaccards(arts, runs=None):
    """Score every cluster of every run against its best match in each other run.

    For a cluster of ``run`` the labels that ``other_run`` assigned to the same
    entries are counted; the most frequent label other than -1 (presumably the
    clusterer's "unassigned" label - confirm) is taken as the match, and the
    Jaccard similarity of the two member sets is the score. A cluster compared
    with its own run scores 1.0; a cluster with no usable match scores 0.0.

    Args:
        arts: numpy array of article identifiers, aligned index-by-index with
            each run's ``labels`` array (it is indexed with boolean masks
            built from ``labels``).
        runs: Runs to compare. Defaults to the notebook-level ``ap_runs``.

    Returns:
        Nested dict ``{run.name: {other_run.name: {cluster.id: score}}}`` with
        float scores.
    """
    if runs is None:
        # Backward-compatible fallback to the global built by calculate_runs().
        runs = ap_runs

    jaccard_data = {}
    for run in runs:
        # Membership mask per cluster, computed once and reused for every
        # comparison run instead of being rebuilt inside the inner loops.
        own_masks = {cluster.id: run.labels == cluster.id for cluster in run.clusters}

        jaccard_data[run.name] = {}
        for other_run in runs:
            scores = {}
            jaccard_data[run.name][other_run.name] = scores
            for cluster in run.clusters:
                if run != other_run:
                    own_mask = own_masks[cluster.id]
                    # Top two labels suffice: if the most common one is -1,
                    # the runner-up (when present) is necessarily not -1.
                    candidates = Counter(other_run.labels[own_mask]).most_common(2)
                    match = _pick_matching_label(candidates)
                    if match is None:
                        # Empty selection, or every selected entry is labelled -1.
                        score = 0.0
                    else:
                        score = float(jaccard(arts[own_mask],
                                              arts[other_run.labels == match]))
                else:
                    score = 1.0
                scores[cluster.id] = score
    return jaccard_data

                    

In [131]:
# Compute the cross-run cluster scores, identifying each article by its `id`.
jacc_data = get_jaccards(np.array([art.id for art in articles]))


['us-news/2019/oct/29/family-of-harry-dunn-announce-plan-to-sue-trump-administration'
 'uk-news/2019/oct/27/harry-dunns-twin-brother-appeals-to-anne-sacoolas-to-return-to-uk'
 'politics/2019/oct/17/no-10-denies-boris-johnson-asked-donald-trump-set-up-anne-sacoolas-meeting'
 'politics/2019/oct/16/harry-dunn-parents-vow-to-continue-fight-for-justice-after-trump-meeting-sacoolas'
 'us-news/2019/oct/15/harry-dunn-parents-trump-white-house-anne-sacoolas'
 'uk-news/2019/oct/14/harry-dunns-parents-say-they-will-only-meet-anne-sacoolas-if-she-returns-to-uk'
 'uk-news/2019/oct/13/harry-dunn-parents-to-meet-anne-sacoolas-as-immunity-row-continues'
 'uk-news/2019/oct/11/harry-dunn-family-flying-us-seek-justice-son-diplomats-wife'
 'uk-news/2019/oct/09/harry-dunn-parents-civil-action-anne-sacoolas'
 'politics/2019/oct/08/dominic-raab-urges-us-to-reconsider-anne-sacoolas-immunity'
 'uk-news/2019/oct/07/harry-dunn-crash-pm-to-raise-anne-sacoolas-case-with-white-house'
 'uk-news/2019/oct/06/all-hope-

In [124]:
# Scores of each 'run_with_all' cluster against 'run_with_none', keyed by cluster id.
jacc_data['run_with_all']['run_with_none']

{-1: 0.0,
 0: 0.0,
 1: 0.0,
 2: 0.0,
 3: 0.0,
 4: 0.0,
 5: 0.0,
 6: 0.0,
 7: 0.0,
 8: 0.0,
 9: 0.0,
 10: 0.0,
 11: 0.0,
 12: 0.0,
 13: 0.0,
 14: 0.0,
 15: 0.0,
 16: 0.0,
 17: 0.0,
 18: 0.0,
 19: 0.0,
 20: 0.0,
 21: 0.0,
 22: 0.0,
 23: 0.0,
 24: 0.0,
 25: 0.0,
 26: 0.0,
 27: 0.0,
 28: 0.0,
 29: 0.0,
 30: 0.0,
 31: 0.0,
 32: 0.0,
 33: 0.0,
 34: 0.0,
 35: 0.0,
 36: 0.0,
 37: 0.0,
 38: 0.0,
 39: 0.0,
 40: 0.0,
 41: 0.0,
 42: 0.0,
 43: 0.0,
 44: 0.0,
 45: 0.0,
 46: 0.0,
 47: 0.0,
 48: 0.0,
 49: 0.0,
 50: 0.0,
 51: 0.0,
 52: 0.0,
 53: 0.0,
 54: 0.0,
 55: 0.0,
 56: 0.0,
 57: 0.0,
 58: 0.0,
 59: 0.0,
 60: 0.0,
 61: 0.0,
 62: 0.0,
 63: 0.0,
 64: 0.0,
 65: 0.0,
 66: 0.0,
 67: 0.0,
 68: 1.0,
 69: 0.0,
 70: 0.0,
 71: 0.0,
 72: 0.0,
 73: 0.0,
 74: 0,
 75: 0.0,
 76: 0.0,
 77: 0.0,
 78: 0.0,
 79: 0.0,
 80: 0.0,
 81: 0.0,
 82: 0.0,
 83: 0.0,
 84: 0.0,
 85: 0.0,
 86: 0.0,
 87: 0.0,
 88: 0.0,
 89: 0.0,
 90: 0.0,
 91: 0.0,
 92: 0.0,
 93: 0.0,
 94: 0.0,
 95: 0.0,
 96: 0.0,
 97: 0.0,
 98: 0.0,
 99: 0.0,
 1

In [101]:
# Loop-invariant values, built once instead of on every iteration.
articles_arr = np.array(articles)
baseline_labels = ap_runs[0].labels  # labels of the first run ('run_with_all')

for target in targets:

    label_id = target.label
    # Positions of the articles that the baseline run assigned to this label.
    mapping_articles_idx = np.where(baseline_labels == label_id)
    base_articles = [a.id for a in articles_arr[mapping_articles_idx]]

    if not base_articles:
        # Without this guard `cluster_match[0]` below fails with an opaque
        # IndexError when a (possibly stale) target label no longer exists.
        print(f"No articles with label {label_id} ({target.name}) in the baseline run; skipping")
        continue

    data = {}

    for run in ap_runs:
        # How this run labelled the baseline cluster's articles, most frequent first.
        cluster_match = Counter(run.labels[mapping_articles_idx]).most_common()
        main_cluster = cluster_match[0][0]
        # NOTE(review): `main_cluster` may be -1, which get_jaccards skips
        # when picking a match; it is not excluded here.
        matched_cluster = [c for c in run.clusters if c.id == main_cluster][0]
        print(cluster_match)

        # Overlap between this run's best-matching cluster and the baseline cluster.
        theme_articles = [a.id for a in articles_arr[np.where(run.labels == main_cluster)]]
        print(jaccard(theme_articles, base_articles))

        # One table column per run: cluster name followed by its theme words.
        data[run.name] = [matched_cluster.name] + matched_cluster.theme_words

    display(HTML(pd.DataFrame(data).to_html()))

[(0, 13)]
1.0
[(2, 13)]
1.0
[(0, 13)]
1.0
[(0, 13)]
1.0
[(72, 13)]
1.0


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,Sacoolas immunity,as immunity,Sacoolas immunity row,Dunn twin,’s family to
1,Harry Dunn twin,immunity for envoy,immunity row,Sacoolas immunity,Dunn ’s family
2,family sue US government,immunity for,lose immunity,immunity envoy wife,immunity for envoy 's
3,immunity row,immunity for envoy 's,wife will,tearful account find,for -PRON- son
4,immunity envoy,family to sue,immunity envoy wife,Harry Dunn twin,’s family to travel to US
5,wife leave,ask us to waive,wife will return,son US TV,Dunn ’s family to
6,meet Anne Sacoolas immunity,as immunity row,US seek justice,Harry Dunn,Dunn 's twin
7,Dunn twin,wife in,immunity row continue,Harry Dunn family,immunity for
8,family sue US,family to travel to,family travel,meet Anne Sacoolas immunity,immunity for envoy
9,wife will return,’s family,tearful account find,next room,as immunity


[(13, 5)]
1.0
[(57, 4), (-1, 1)]
0.21052631578947367
[(45, 5)]
0.22727272727272727
[(29, 4), (30, 1)]
0.18181818181818182
[(22, 5)]
0.20833333333333334


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,Bury FC despair club,and football regulation,Bury CVA,Bury CVA,takeover go through without full
1,FC despair club expel,CVA and football regulation,club expel Football,FC despair club expel,club should bail
2,Distressed Bury,as club,Bury say,Bury season opener MK,Bury on the brink of
3,chain drainpipe save club,club be,Bury Tuesday,despair club expel Football,Bury takeover go through without
4,club should bail,takeover to go through,football regulation,Distressed Bury,takeover go through without
5,despair club,and football,CVA football regulation,Bury Tuesday,spotlight over Pastore
6,drainpipe save club,’s expulsion,Bury FC,Bury FC despair club,fear over Bury
7,FC despair club,EFL over club ’s,chain drainpipe save club,Bury FC,spotlight over
8,despair club expel Football,Campbell takeover to,MK don,Bury fan,call for inquiry into CVA and
9,save club,Bury CVA prompt insolvency,despair club expel Football,drainpipe save club,block from


[(20, 9)]
1.0
[(41, 9)]
0.21951219512195122
[(21, 9)]
0.6
[(16, 9)]
0.6428571428571429
[(35, 9)]
0.8181818181818182


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,dope say,athlete to,order athlete,Salazar dope,’s athlete
1,Salazar dope,on dope,athlete sever link Alberto,dope say,athlete to
2,Salazar athlete,order athlete to sever,order athlete sever,Salazar athlete,athlete to sever
3,athlete sever link Alberto,’s athlete,athlete sever link,Salazar insist never mislead,athlete to sever all
4,order athlete,"on dope ,",Sebastian Coe order athlete,Mo Farah turn medium,on dope
5,Coe order athlete,eager to keep streak,athlete sever,US sprinter,Salazar ’s athlete
6,order athlete sever link,olympic success after,Coe order athlete,sprinter Christian,Farah turn on
7,order athlete sever,reveal extent of,sever link Alberto Salazar,Sebastian Coe,which lead to Salazar
8,athlete sever,record - holder,Salazar athlete,lead Salazar,turn on
9,Coe order athlete sever,relay silver in,dope say,Alberto Salazar athlete,Alberto Salazar ’s athlete


[(399, 9)]
1.0
[(197, 6), (609, 2), (-1, 1)]
0.6
[(66, 6), (215, 3)]
0.42857142857142855
[(96, 8), (-1, 1)]
0.6153846153846154
[(227, 5), (213, 1), (521, 1), (-1, 1), (534, 1)]
0.35714285714285715


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,stimulus RBA,of stimulus,underspend NDIS,stimulus RBA,of stimulus
1,hint further cut interest,to justify effectiveness of,inequality craft good story,underspend NDIS,to stimulate
2,justify effectiveness interest rate,growth in,labor turn heat,economy heap,interest rate to
3,Deloitte say,to stimulate,budget effectively balance underspend,Labor Jim Chalmers say,economy be
4,further cut interest,economy be,stimulus RBA,heap stimulus,of 0.75
5,effectiveness interest rate,justify effectiveness,coalition income,further cut interest,"economic shock , report find"
6,shock report find,wage cap to stimulate,remove buffer economic shock,Chalmers say,wage cap to stimulate
7,warn Deloitte,growth in more than,tax cut remove buffer,hint further cut interest,"levy ,"
8,economic shock,to justify effectiveness,low boost weak economy,Reserve Bank,to justify effectiveness of
9,historic low boost,hint at,reference worsen wealth,justify effectiveness interest rate,growth in more than a decade


[(599, 3)]
1.0
[(526, 3)]
1.0
[(379, 3)]
0.6
[(465, 3)]
1.0
[(677, 3)]
1.0


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,morrison government power forward advertising,department reject the audit,advertising last financial year,advertising due complete month,spruike -PRON-
1,advertising framework,say the campaign,government advertising,morrison government power forward advertising,spend $ 18.5
2,anao calculate ad,on the campaign,advertising last financial,advertising year average anao,-PRON- spend $
3,advertising real measure,of the campaign .,blitz warning,advertising due complete,the may election
4,blitz christmas audit find,conroy and the,taxpayer fund government advertising,government advertising due complete,blitz spruike
5,forward campaign spruike,"power forward "" campaign",coalition spend,blitz spruike,conroy and the crossbencher
6,power forward advertising,campaign .,coalition spend pre election,ministership conroy,spend $ 14.1
7,blitz spruike,"campaign "" .",fund government advertising,blitz Christmas,annual report spend $
8,just federal election,campaign that the,staffer hour,campaign prove ineffective audit,-PRON- spend $ 18.5
9,effectively administer advertising,campaign in,taxpayer pay staffer hour,blitz warning,spend $ 100.1
