In [1]:
import path
with path.Path('..'):
    from services.theme_extractor.preprocessing import ArticlePreprocessJob, ArticlePreprocessor 

    from services.theme_extractor.wv_model import  WVModelBuilder

    from services.theme_extractor.clustering import Clusterer

    from services.theme_extractor.keyword_extraction import KeywordExtractor

    from services.libs.data_model import ProcessedArticle, Theme

In [2]:

# Job object used below to look up the latest article load and to fetch
# that load's articles.
apj = ArticlePreprocessJob()

# Id of the most recent article load; printed so the notebook records which
# load the analysis was run against.
load_id = apj.get_latest_article_load().id
print(load_id)

467130b3-fe75-4f1f-b2fa-4757e15f11fd


In [3]:
# Fetch the articles of the selected load. max_articles=10000 presumably caps
# how many are returned (the recorded count is exactly 10000) — TODO confirm.
articles = apj.get_articles_for_load(load_id, max_articles=10000)
# Bare last expression so the notebook displays the number of articles.
len(articles)

10000

In [4]:
from typing import List
from gensim.models import Doc2Vec
import numpy as np

class APRun:
    """Configuration and results of one preprocessing experiment.

    An instance starts out holding only its configuration (``name``,
    ``steps``, ``postags``). The result attributes declared below are
    annotations only; they do not exist on an instance until the code that
    executes the run assigns them.

    Args:
        name: Identifier of the run.
        steps: Iterable of preprocessing step names to enable.
        postags: Iterable of part-of-speech tags to allow.
    """

    # Result slots, filled in after the run has been executed. The
    # annotations are quoted so defining the class does not require the
    # project types to be resolvable at class-creation time.
    processed_articles: "List[ProcessedArticle]"

    model: "Doc2Vec"

    # np.ndarray is the array type; np.array is only the factory function.
    labels: "np.ndarray"

    clusters: "List[Theme]"

    name: str

    def __init__(self, name, steps=['lemmatize', 'postag', 'phrasing'], postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']):
        self.name = name
        # Copy the incoming sequences: the defaults are single list objects
        # shared by every call, so storing them directly would let a mutation
        # on one run leak into every other run (and into future defaults).
        self.steps = list(steps)
        self.postags = list(postags)

    

In [5]:
def run_ap_options(run_options: APRun):
    """Run the whole theme pipeline for one preprocessing configuration.

    The notebook-level ``articles`` list is the input. The run's name is
    passed to the pipeline stages in the position where they take a load id.

    Args:
        run_options: Run whose ``name``, ``steps`` and ``postags`` configure
            the pipeline.

    Returns:
        A 4-tuple ``(processed_articles, model, labels, clusters)`` holding
        the output of each pipeline stage in order.
    """
    run_id = run_options.name

    preprocessor = ArticlePreprocessor(
        steps=run_options.steps,
        allowed_postags=run_options.postags,
    )
    processed = preprocessor.preprocess_articles(articles, run_id)

    wv_model = WVModelBuilder().build_wv_model(processed)

    clusterer = Clusterer(wv_model, processed, run_id)
    cluster_labels = clusterer.create_mapping(
        min_cluster_size=3,
        cluster_selection_epsilon=0.1,
    )

    themes = KeywordExtractor(wv_model).create_themes(run_id, processed, cluster_labels)

    return processed, wv_model, cluster_labels, themes

def calculate_runs():
    """Build the five experiment configurations, execute each, and return them.

    The first run keeps the default steps; each of the others overrides
    ``steps`` with a reduced list. After execution every run carries its
    ``processed_articles``, ``model``, ``labels`` and ``clusters``.
    """
    # Step lists for the reduced variants, in execution order.
    reduced_variants = {
        'run_with_none': [],
        'run_with_no_phrasing': ['postag', 'lemmatize'],
        'run_with_no_lemmatize': ['postag', 'phrasing'],
        'run_with_no_postag': ['lemmatize', 'phrasing'],
    }

    runs: List[APRun] = [APRun('run_with_all')]
    runs.extend(
        APRun(variant_name, steps=variant_steps)
        for variant_name, variant_steps in reduced_variants.items()
    )

    for current in runs:
        # Unpack the pipeline outputs straight onto the run object.
        (
            current.processed_articles,
            current.model,
            current.labels,
            current.clusters,
        ) = run_ap_options(current)

    return runs
    
# Execute every configuration once and keep the populated runs; the analysis
# cells that follow all read from this list (index 0 is 'run_with_all').
ap_runs = calculate_runs()

Extracting keywords for 807 themes
Extracting keywords for 797 themes
Extracting keywords for 810 themes
Extracting keywords for 815 themes
Extracting keywords for 833 themes


In [6]:
from collections import Counter

# Number of articles assigned to each cluster label in the first run.
counts = Counter(ap_runs[0].labels)

# Spot-check a handful of clusters by their position in the first run's
# cluster list: id, name, article count and theme words, one per line.
for position in (1, 14, 21, 500, 600):
    cluster = ap_runs[0].clusters[position]
    print(
        cluster.id,
        cluster.name,
        counts[cluster.id],
        cluster.theme_words,
        sep="\n",
    )

0
new coronavirus case fall low level
13
[&#39;volatile former ambassador warn&#39;, &#39;british musical&#39;, &#39;former ambassador&#39;, &#39;ambassador warn&#39;, &#39;volatile former ambassador&#39;, &#39;tory rebellion&#39;, &#39;death toll exceed&#39;, &#39;long await oversight board&#39;, &#39;face big&#39;]
13
file bankruptcy
5
[&#39;England Wales MPs urge&#39;, &#39;test positive coronaviru&#39;, &#39;dfid merger&#39;, &#39;polo shirt&#39;, &#39;visit ban&#39;, &#39;jogger polo shirt&#39;, &#39;couple test positive coronaviru&#39;, &#39;Berlin couple test positive coronaviru&#39;, &#39;reputation dfid&#39;]
20
report record week
9
[&#39;will consider strike action&#39;, &#39;consider strike action&#39;, &#39;postal chief oust&#39;, &#39;chief oust brother&#39;, &#39;boost eat help&#39;, &#39;postal chief oust brother&#39;, &#39;publisher hit back&#39;, &#39;back office plan&#39;, &#39;court file&#39;]
499
botched ecce homo
3
[&#39;institute say restoration&#39;, &#39;humanoi

In [7]:
def jaccard(set1: np.ndarray, set2: np.ndarray):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Duplicates are ignored because both the intersection and the union are
    computed over unique values. Any array-like accepted by
    ``np.intersect1d`` / ``np.union1d`` (including plain lists) works.

    Args:
        set1: First collection of comparable values.
        set2: Second collection of comparable values.

    Returns:
        A float in ``[0, 1]``. Two empty collections are treated as identical
        and yield ``1.0`` instead of raising ``ZeroDivisionError``.
    """
    union_size = len(np.union1d(set1, set2))
    if union_size == 0:
        # Both inputs are empty: the ratio is 0/0, defined here as 1.0.
        return 1.0
    return len(np.intersect1d(set1, set2)) / union_size

In [8]:
class ThemeTarget:
    """A cluster label paired with a human-readable name for it."""

    def __init__(self, label: int, name: str):
        # Plain value holder: both arguments are stored unchanged.
        self.label, self.name = label, name

In [9]:
import pandas as pd
from IPython.display import display, HTML

# Hand-picked clusters to inspect, written as (label, name) pairs. The labels
# are looked up in the first run's labelling (ap_runs[0].labels) by the cell
# that iterates over `targets`.
targets: List[ThemeTarget] = [
    ThemeTarget(target_label, target_name)
    for target_label, target_name in (
        (0, 'Grenfell'),
        (13, 'Coronavirus students'),
        (20, 'Shoes'),
        (399, 'Suleimani'),
        (599, 'Labour Manifesto'),
    )
]



In [11]:
## find targets which match

def jacc_match(base_articles, mapping_articles_idx):
    """Check whether every run reproduces one baseline cluster exactly.

    For each run in ``ap_runs``, the labels of the articles selected by
    ``mapping_articles_idx`` are counted and the most frequent one is taken
    as that run's counterpart cluster. The ids of all articles carrying that
    label are then compared with ``base_articles``.

    Args:
        base_articles: Ids of the articles in the baseline cluster.
        mapping_articles_idx: Index (as returned by ``np.where``) selecting
            those same articles in ``articles`` and in each run's ``labels``.

    Returns:
        A list with the counterpart cluster label of each run, in ``ap_runs``
        order, when every run's cluster has a Jaccard similarity of 1 with
        ``base_articles``; ``False`` as soon as one run differs.
    """
    # Loop-invariant: build the article array once instead of once per run.
    article_array = np.array(articles)

    clusters = []
    for run in ap_runs:
        # most_common() is sorted by frequency, so entry 0 is the dominant label.
        cluster_match = Counter(run.labels[mapping_articles_idx]).most_common()
        main_cluster = cluster_match[0][0]

        theme_articles = [a.id for a in article_array[np.where(run.labels == main_cluster)]]
        clusters.append(main_cluster)
        if jaccard(theme_articles, base_articles) < 1:
            return False
    return clusters

# Baseline label ids (from ap_runs[0]) whose cluster is reproduced exactly by
# every run. The cell that prints article titles compares each entry against
# ap_runs[0].labels, so entries must be baseline label ids rather than the
# per-run cluster lists returned by jacc_match.
matching_labels = []

# Loop-invariant inputs, computed once.
mapping = ap_runs[0].labels
articles_arr = np.array(articles)

# range() excludes its stop value, so +1 is needed to include the highest
# label. Starting at 0 skips negative labels (-1 appears in the recorded
# outputs; presumably the clusterer's noise label — TODO confirm).
for label_id in range(0, max(mapping) + 1):
    mapping_articles_idx = np.where(mapping == label_id)
    base_articles = [a.id for a in articles_arr[mapping_articles_idx]]

    match = jacc_match(base_articles, mapping_articles_idx)

    if match:
        # `match` holds integer labels, so ", ".join(match) would raise
        # TypeError; report the baseline label instead.
        print("{} is a match!".format(label_id))
        matching_labels.append(label_id)
            

0 is a match!
171 is a match!
314 is a match!
320 is a match!
582 is a match!


In [20]:
# For every entry of matching_labels, list up to ten titles of the articles
# that carry that label in the first run, followed by a blank separator.
for label_id in matching_labels:
    mapping_articles_idx = np.where(ap_runs[0].labels == label_id)
    base_articles = [article.title for article in np.array(articles)[mapping_articles_idx]]
    first_titles = base_articles[:10]
    print("\n".join(first_titles))
    print("\n")

British musicals &#39;at risk without subsidies like other theatre&#39;
Facebook&#39;s long-awaited oversight board to launch before US election
UK repatriates child orphaned in Syria after Isis collapse
Alex Salmond known for &#39;bullying&#39; behaviour, says Scotland&#39;s ex-chief civil servant
‘The US feels very volatile’: former ambassador warns of election violence
Tory rebellion widens over Boris Johnson&#39;s bill to override Brexit deal
Victoria&#39;s new coronavirus cases fall to lowest level since June as Australian death toll exceeds 800
Arsenal place trust in Mikel Arteta with promotion to first-team manager
Victoria&#39;s roadmap out of Covid lockdown is &#39;a sledgehammer approach&#39;, expert says
All of UK facing &#39;big burden&#39; to prevent second Covid wave, warns minister


&#39;Hail, gallant woman&#39;: Amy Dorris praised for coming forward with Trump assault allegation
Which countries can UK holidaymakers visit without restrictions?
Heckling of London mayor S

In [None]:
import pandas as pd
from IPython.display import display, HTML

# NOTE(review): this cell re-declares the same five targets as an earlier
# cell of this notebook; one of the two declarations is redundant.
# Each entry is a (label, name) pair; the labels are looked up in the first
# run's labelling (ap_runs[0].labels).
targets: List[ThemeTarget] = [
    ThemeTarget(target_label, target_name)
    for target_label, target_name in (
        (0, 'Grenfell'),
        (13, 'Coronavirus students'),
        (20, 'Shoes'),
        (399, 'Suleimani'),
        (599, 'Labour Manifesto'),
    )
]


In [10]:
# For each hand-picked target cluster of the first run, find the cluster that
# holds most of the same articles in every run, report how well the two
# overlap, and show the name and theme words of those clusters side by side.

# Loop-invariant inputs, computed once.
mapping = ap_runs[0].labels
articles_arr = np.array(articles)

for target in targets:

    label_id = target.label
    mapping_articles_idx = np.where(mapping == label_id)
    base_articles = [a.id for a in articles_arr[mapping_articles_idx]]

    # Column per run: [cluster name, theme word 1, theme word 2, ...].
    data = {}

    for run in ap_runs:
        # (label, count) pairs for the target's articles, most frequent first.
        cluster_match = Counter(run.labels[mapping_articles_idx]).most_common()
        main_cluster = cluster_match[0][0]
        print(cluster_match)

        theme_articles = [a.id for a in articles_arr[np.where(run.labels == main_cluster)]]
        # Jaccard overlap between this run's cluster and the baseline cluster.
        # Previously theme_articles was computed but never used and a bare
        # print() stood here, although the recorded output shows this score.
        print(jaccard(theme_articles, base_articles))
        data[run.name] = [[c.name] + c.theme_words for c in run.clusters if c.id == main_cluster][0]

    display(HTML(pd.DataFrame(data).to_html()))

[(0, 13)]
1.0
[(2, 13)]
1.0
[(0, 13)]
1.0
[(0, 13)]
1.0
[(72, 13)]
1.0


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,new coronavirus case fall low level,to launch before,former ambassador warn,Covid wave warns,Victoria 's
1,volatile former ambassador warn,Boris Johnson 's,feel very,Tory rebellion,over Boris Johnson 's
2,british musical,of Covid lockdown is,second Covid wave warn,says Scotland ex,roadmap out of Covid lockdown
3,former ambassador,Victoria 's,british musical,Covid wave,ex - chief
4,ambassador warn,Facebook 's,Hong Kong security,ex chief civil servant,override Brexit deal
5,volatile former ambassador,Johnson 's,Kong security law,subsidies other,Facebook 's
6,tory rebellion,launch before US election,break international,former ambassador,", say Scotland"
7,death toll exceed,says Scotland 's ex,ex chief,chief civil servant,Scotland 's ex -
8,long await oversight board,Kong security law,break international law,child orphaned,", say Scotland 's ex -"
9,face big,British musicals ',very volatile,US election,Scotland 's


[(13, 5)]
1.0
[(57, 4), (-1, 1)]
0.21052631578947367
[(45, 5)]
0.22727272727272727
[(29, 4), (30, 1)]
0.18181818181818182
[(22, 5)]
0.20833333333333334


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,file bankruptcy,like our future,criminal investigation favor,officer fired photos showing,dealing with Epstein
1,England Wales MPs urge,take over,face existential threat,Covid rules relax,Pope turn tide to
2,test positive coronaviru,court unseals documents related,weekly unemployment claim,facing existential threat,or lose
3,dfid merger,Covid rules relax,criminal investigation favor dictator,officer fired,to dealing with
4,polo shirt,files for bankruptcy,officer fire photo show,unemployment claims rise,third test
5,visit ban,facing existential,MPs urge,public sector workers,dozen more
6,jogger polo shirt,Rishi Sunak warns,McClain officer fire,Home Office,key point from
7,couple test positive coronaviru,in England and Wales,test positive,criminal investigations,document relate to dealing
8,Berlin couple test positive coronaviru,turns tide to,positive coronaviru,weekly unemployment claims rise,Ollie Pope turn tide
9,reputation dfid,Key points,unemployment claim,Raab says Trevelyan,£ 10.9


[(20, 9)]
1.0
[(41, 9)]
0.21951219512195122
[(21, 9)]
0.6
[(16, 9)]
0.6428571428571429
[(35, 9)]
0.8181818181818182


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,report record week,amid calls,Covid lockdown hit,boost eat help scheme begins,exclusive : Trump 's
1,will consider strike action,strike action,childcare shortage,eat help scheme begins,Potter ' : UK bookshop
2,consider strike action,seen anything like this,boost eat help scheme,eat help scheme,win control
3,postal chief oust,stage one,hit back,postal chief,exclusive :
4,chief oust brother,broader response,Covid lockdown,boost eat help scheme,Covid-19 vaccine deal
5,boost eat help,of broader response,teenager shoot,Malawi court,to win control of
6,postal chief oust brother,faces long road,strike action back,UK high streets,chief oust
7,publisher hit back,strike action over,civil servant,Covid-19 vaccine,' : Andy Murray
8,back office plan,court files,back office plan,ban news sharing,Boris Johnson 's
9,court file,app stores puts pressure,president criticism,union will consider strike,Harry Potter ' :


[(399, 9)]
1.0
[(197, 6), (609, 2), (-1, 1)]
0.6
[(66, 6), (215, 3)]
0.42857142857142855
[(96, 8), (-1, 1)]
0.6153846153846154
[(227, 5), (213, 1), (521, 1), (-1, 1), (534, 1)]
0.35714285714285715


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,Depp try stop,Biden tells,law order,tourist quarantines,report say
1,rain batter,time this year,lack access,tried stop,finger cross
2,free tv licence,", court told",brasserie cafe,concussion scare,during raid
3,free tv,this year,mother sue moj,cafes tourist quarantines,sue moj
4,australian journalist secrecy,lack of access,mother sue,tried stop Amber,West Ham offer
5,try stop Amber,this year as,France brasserie cafe tourist,Fingers crossed France,", parliamentary report"
6,sue moj,just disappear,Johnny Depp try stop,Depp tried stop Amber,home flood
7,try stop Amber Heard,lack of access to,face mask,cafes tourist,"should be allow ,"
8,australian journalist secrecy offence,your job,just disappear,law order president lawless,take hundred hostage
9,there yet Alison,First Thing,home flood,Trump law order,' try to stop Amber


[(599, 3)]
1.0
[(526, 3)]
1.0
[(379, 3)]
0.6
[(465, 3)]
1.0
[(677, 3)]
1.0


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,pollutant diesel,at trapping,case drop,berejiklian urged symptoms tested,landfill account for about
1,sydney suburb moorebank bankstown,’s atmosphere absorbed,level record,rob jackson professor stanford,be urge anyone who catch
2,norton liverpool lidcombe,century when concentrations were,stock exchange,also saw little change,farm and milperra
3,lithium ion,"landfills , manure and",Daniel Andrews,oceania largely due,"and milperra ,"
4,sydney suburb moorebank,the gas is released,high level,postponed coronavirus pandemic road freight,seafood in
5,rooftop bar grill,in its contribution,Covid-19 case drop,2030s potential investment,", of the université de"
6,driver also,", ranching , agriculture",increase military pressure,real problem,attribute to fracke and other form of
7,bring total cost low energy,john lewis .,Victoria Daniel,related melbourne,driver – who be
8,area casey,link to lorries,second day,moorebank bankstown chipping norton,"sheep , where"
9,such tesla,to very big,North Korea,also dropped recently,", compare with"
