In [1]:
import path
with path.Path('..'):
    from services.theme_extractor.preprocessing import ArticlePreprocessJob, ArticlePreprocessor 

    from services.theme_extractor.wv_model import  WVModelBuilder

    from services.theme_extractor.clustering import Clusterer

    from services.theme_extractor.keyword_extraction import KeywordExtractor

    from services.libs.data_model import ProcessedArticle, Theme

In [2]:

# Job object used by the following cells to look up article loads and fetch their articles.
apj = ArticlePreprocessJob()

# Id of the latest article load; it is passed to get_articles_for_load in the next cell.
load_id = apj.get_latest_article_load().id
print(load_id)

467130b3-fe75-4f1f-b2fa-4757e15f11fd


In [3]:
# Articles of the selected load; `articles` is read as a notebook-level global by the
# pipeline and matching cells further down.
# NOTE(review): max_articles=10000 presumably caps the number of returned articles - confirm.
articles = apj.get_articles_for_load(load_id, max_articles=10000)
len(articles)

10000

In [4]:
from typing import List
from gensim.models import Doc2Vec
import numpy as np

class APRun:
    """Options and (later) results of one article-preprocessing experiment.

    The constructor only records the run name and the preprocessing options.
    The result attributes (``processed_articles``, ``model``, ``labels`` and
    ``clusters``) are merely declared here; they are not set by ``__init__``
    and have to be assigned after the pipeline has been run for this
    configuration.
    """

    # Annotations are written as strings so that creating the class does not
    # evaluate the project / gensim type names.
    processed_articles: "List[ProcessedArticle]"

    model: "Doc2Vec"

    # np.ndarray is the array type; np.array is only the factory function.
    labels: "np.ndarray"

    clusters: "List[Theme]"

    name: str

    steps: "List[str]"

    postags: "List[str]"

    def __init__(self, name, steps=['lemmatize', 'postag', 'phrasing'], postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']):
        """Record the run name and preprocessing options.

        Args:
            name: Identifier of the run.
            steps: Names of the preprocessing steps to apply.
            postags: Part-of-speech tags to keep.
        """
        self.name = name
        # The default lists are created once, when the function is defined.
        # Copy them (and caller-supplied lists) so that mutating one run's
        # options can never leak into another run or into the defaults.
        self.steps = list(steps)
        self.postags = list(postags)

    

In [40]:
def run_ap_options(run_options: APRun):
    """Run preprocessing, model building, clustering and theme extraction for one configuration.

    Reads the notebook-level ``articles`` and uses ``run_options.name`` wherever
    the pipeline components take a load id.

    Returns:
        A tuple ``(processed_articles, model, labels, clusters)``.
    """
    # The run name stands in for the load id in every pipeline call below.
    run_id = run_options.name

    preprocessor = ArticlePreprocessor(
        steps=run_options.steps,
        allowed_postags=run_options.postags,
    )
    processed = preprocessor.preprocess_articles(articles, run_id)

    wv_model = WVModelBuilder().build_wv_model(processed)

    clusterer = Clusterer(wv_model, processed, run_id)
    cluster_labels = clusterer.create_mapping(
        min_cluster_size=3,
        cluster_selection_epsilon=0.1,
    )

    themes = KeywordExtractor(wv_model).create_themes(run_id, processed, cluster_labels)

    return processed, wv_model, cluster_labels, themes

def calculate_runs():
    """Create one APRun per preprocessing variant and attach its pipeline results.

    The first run uses the default steps; each of the others passes an explicit
    ``steps`` list. ``run_ap_options`` is called once per run and its four
    results are stored on the run object.

    Returns:
        The list of populated runs, in the order they were declared.
    """
    runs: List[APRun] = [APRun('run_with_all')]
    runs += [
        APRun(run_name, steps=run_steps)
        for run_name, run_steps in (
            ('run_with_none', []),
            ('run_with_no_phrasing', ['postag', 'lemmatize']),
            ('run_with_no_lemmatize', ['postag', 'phrasing']),
            ('run_with_no_postag', ['lemmatize', 'phrasing']),
        )
    ]

    for current in runs:
        (
            current.processed_articles,
            current.model,
            current.labels,
            current.clusters,
        ) = run_ap_options(current)

    return runs
    
# Executes the pipeline for every configuration declared in calculate_runs();
# the cells below read the results from this notebook-level list.
ap_runs = calculate_runs()

UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
     learning_rate=1.0, local_connectivity=1.0, metric='cosine',
     metric_kwds=None, min_dist=0.1, n_components=100, n_epochs=None,
     n_neighbors=15, negative_sample_rate=5, random_state=666,
     repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
     target_metric='categorical', target_metric_kwds=None,
     target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
     transform_seed=42, verbose=True)
Construct fuzzy simplicial set
Sun Oct  4 12:45:01 2020 Finding Nearest Neighbors
Sun Oct  4 12:45:01 2020 Building RP forest with 10 trees
Sun Oct  4 12:45:03 2020 NN descent for 13 iterations
	 0  /  13
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
Sun Oct  4 12:45:10 2020 Finished Nearest Neighbor Search
Sun Oct  4 12:45:15 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	complete

In [6]:
from collections import Counter

# How often each label occurs in the first run's label array.
counts = Counter(ap_runs[0].labels)

# Spot-check a few clusters, selected by their position in the first run's cluster list.
for i in (1, 14, 21, 500, 600):
    cluster = ap_runs[0].clusters[i]
    print(cluster.id, cluster.name, counts[cluster.id], cluster.theme_words, sep="\n")

0
new coronavirus case fall low level
13
['volatile former ambassador warn', 'british musical', 'former ambassador', 'ambassador warn', 'volatile former ambassador', 'tory rebellion', 'death toll exceed', 'long await oversight board', 'face big']
13
file bankruptcy
5
['England Wales MPs urge', 'test positive coronaviru', 'dfid merger', 'polo shirt', 'visit ban', 'jogger polo shirt', 'couple test positive coronaviru', 'Berlin couple test positive coronaviru', 'reputation dfid']
20
report record week
9
['will consider strike action', 'consider strike action', 'postal chief oust', 'chief oust brother', 'boost eat help', 'postal chief oust brother', 'publisher hit back', 'back office plan', 'court file']
499
botched ecce homo
3
['institute say restoration', 'humanoi

In [7]:
def jaccard(set1: np.ndarray, set2: np.ndarray) -> float:
    """Return the Jaccard similarity ``|A & B| / |A | B|`` of two collections.

    Both arguments are handed to ``np.intersect1d`` / ``np.union1d``, so plain
    lists work as well as arrays and duplicate elements are counted once.

    Args:
        set1: First collection of elements.
        set2: Second collection of elements.

    Returns:
        A float between 0.0 and 1.0. Two empty inputs are treated as identical
        and yield 1.0 instead of raising ``ZeroDivisionError``.
    """
    union_size = len(np.union1d(set1, set2))
    if union_size == 0:
        # The ratio is 0/0 when both inputs are empty; by convention two empty
        # sets are considered equal.
        return 1.0
    return len(np.intersect1d(set1, set2)) / union_size

In [8]:
class ThemeTarget:
    """Pairs an integer label with a human-readable theme name."""

    def __init__(self, label: int, name: str):
        """Store ``label`` and ``name`` unchanged on the instance."""
        self.label, self.name = label, name

In [9]:
import pandas as pd
from IPython.display import display, HTML

# Hand-picked (label, theme name) pairs wrapped in ThemeTarget objects.
targets: List[ThemeTarget] = [
    ThemeTarget(label, name)
    for label, name in (
        (0, 'Grenfell'),
        (13, 'Coronavirus students'),
        (20, 'Shoes'),
        (399, 'Suleimani'),
        (599, 'Labour Manifesto'),
    )
]



In [43]:
## find targets which match

def jacc_match(base_articles, mapping_articles_idx, threshold=.9):
    """Check whether a group of articles forms (almost) the same cluster in every run.

    For each run in the notebook-level ``ap_runs`` the label that occurs most
    often among the selected articles is taken as that run's counterpart
    cluster. The ids of all articles carrying that label are then compared
    with ``base_articles`` using ``jaccard``.

    Assumes every ``run.labels`` is a NumPy array aligned with the
    notebook-level ``articles`` list - TODO confirm.

    Args:
        base_articles: Ids of the articles in the reference cluster.
        mapping_articles_idx: Index (as produced by ``np.where``) selecting
            those articles in a run's label array.
        threshold: Minimum Jaccard similarity a run's counterpart cluster must
            reach. Defaults to the previously hard-coded 0.9.

    Returns:
        A list with one counterpart label per run (as ``str``), or ``False``
        as soon as one run falls below ``threshold`` or the selection is empty.
    """
    # Converting the article list does not depend on the run, so do it once.
    article_array = np.array(articles)

    clusters = []
    for run in ap_runs:
        cluster_match = Counter(run.labels[mapping_articles_idx]).most_common()
        if not cluster_match:
            # Nothing selected, hence no counterpart cluster to compare with.
            return False
        main_cluster = cluster_match[0][0]

        theme_articles = [a.id for a in article_array[np.where(run.labels == main_cluster)]]
        clusters.append(str(main_cluster))
        if jaccard(theme_articles, base_articles) < threshold:
            return False
    return clusters

matching_labels = []

# Labels of the first run serve as the reference every other run is compared with.
mapping = ap_runs[0].labels

# range() excludes its stop value, so add 1; otherwise the highest label would
# never be examined. Starting at 0 means negative labels are skipped.
for label_id in range(0, max(mapping) + 1):
    mapping_articles_idx = np.where(mapping == label_id)
    base_articles = [a.id for a in np.array(articles)[mapping_articles_idx]]

    match = jacc_match(base_articles, mapping_articles_idx)

    # jacc_match returns False when any run disagrees, otherwise the per-run labels.
    if match:
        print("{} is a match!".format(", ".join(match)))
        matching_labels.append(match)
            

1, 0, 1, 0, 0 is a match!
41, 18, 0, 1, 26 is a match!
99, 192, 125, 272, 178 is a match!
203, 369, 337, 367, 280 is a match!
356, 11, 57, 87, 233 is a match!


In [46]:
# For every matched cluster, list up to ten titles of the articles that carry
# the first run's label (label_ids[0]).
for label_ids in matching_labels:
    base_articles = [
        a.title
        for a in np.array(articles)[np.where(ap_runs[0].labels == int(label_ids[0]))]
    ]
    print("\n".join(base_articles[:10]))
    print("\n")

Chess: David Howell and Michael Adams battle for England No 1 spot in Torquay
Chess: Magnus Carlsen at peak but faces Saturday test against Wesley So
Chess: Michael Adams wins seven as England's top players battle lockdown
Chess: Kasparov and Carlsen undone by internet glitches following 55-move draw
Chess: Garry Kasparov and Magnus Carlsen draw in historic encounter
Chess: Russia and India's shared Olympiad gold sparks wave of criticism
Chess: Six-way tie at Hastings as tournament battles richer rivals
India awarded chess gold with Russia after server outage leads to reprieve
Chess: Garry Kasparov and Magnus Carlsen to meet for first time in 16 years
Chess: Carlsen fights back from brink to overcome Nakamura in 38-game epic


Revealed: ex-MPs use parliament access passes over 2,500 times in a year
Peers call on Jenrick  to explain opposition to smoke-free zones
Tory peer accused of breaching ministerial code with Uganda deals
Robert Jenrick says he regrets dining with donor be

In [33]:
# Show one raw title next to the title words the first run stored for the same position.
print(articles[100].title, ap_runs[0].processed_articles[100].title_words, sep="\n")

Inquest into suicide of gambling addict will explore if UK state failed him
['inquest', 'suicide', 'gambling', 'addict', 'will', 'explore', 'UK', 'state', 'fail']


In [28]:
import pandas as pd
from IPython.display import display, HTML

# NOTE(review): this rebinds `targets` to the same five entries an earlier cell
# already assigned; one of the two cells is redundant.
targets: List[ThemeTarget] = [
    ThemeTarget(*pair)
    for pair in [
        (0, 'Grenfell'),
        (13, 'Coronavirus students'),
        (20, 'Shoes'),
        (399, 'Suleimani'),
        (599, 'Labour Manifesto'),
    ]
]


In [45]:
# For every matched cluster, show one table whose columns are the runs and
# whose rows are the cluster name followed by its theme words.
for label_ids in matching_labels:

    # Build the table from scratch for each match. Previously `data` was never
    # defined in this cell and only existed if another cell had been run first,
    # which breaks on a fresh kernel.
    data = {}

    for i, run in enumerate(ap_runs):

        # label_ids holds one label per run, in ap_runs order, stored as strings.
        main_cluster = int(label_ids[i])

        data[run.name] = [[c.name] + c.theme_words for c in run.clusters if c.id == main_cluster][0]

    display(HTML(pd.DataFrame(data).to_html()))

Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,chess Garry,chess gold with,chess Garry Kasparov,Chess Adams stirs,15-match winning
1,chess Hou,awarded chess,chess Hou Yifan,India awarded chess,tops fantasy
2,chess Garry Kasparov,awarded chess gold with,chess England,awarded chess,as tournament
3,chess Michael Adams,15-match winning,chess Hou,Magnus Carlsen,-PRON- 15-match winning
4,chess Hou Yifan,mate in,chess Garry,Carlsen misses mate over-50s,mate in
5,chess Kasparov,India awarded chess,chess Kasparov,Kasparov Magnus Carlsen,rout Caruana and tops
6,Magnus Carlsen,and tops fantasy,chess David Howell Michael,Adams stirs,", no 1"
7,Garry Kasparov Magnus Carlsen,prize despite bizarre four,chess Adams stir,alias Biel,"Hou Yifan , no 1"
8,Kasparov Magnus Carlsen,chess gold,Garry Kasparov Magnus Carlsen,Chess Carlsen routs,and tops fantasy football league
9,chess Carlsen rout,prize despite bizarre,award chess,Carlsen routs,prize despite bizarre


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,smoke free,"passes over 2,500 times",grow pressure,Marxists doe,from developer
1,give almost tory,", Tory donations",Desmond former porn baron,Jenrick media mogul,after fresh Desmond revelation
2,Marxists doe,for favours,want give Marxists doe,want give Marxists doe texts,fund from developer
3,Jenrick face question,"2,500 times",Desmond Jenrick,Jenrick says regrets,donor before planning decision
4,peer accuse breach,suggests voters could raise,swift return,pressure resign donor,with Uganda deal
5,peer accuse breach ministerial code,Robert Jenrick faces questions,smoke free,doe texts Desmond,Minister suggest voter could
6,1bn housing,swift return,israeli billionaire meet,Robert Jenrick says regrets,long list of
7,Jenrick cash favour scandal,planning row,swift return lockdown,planning row,under grow pressure
8,could raise,do n't want to,Jenrick grow pressure,Marxists doe texts Desmond,-PRON- do not want to give
9,give Marxists doe,faces questions over,meet israeli mining heir,Jenrick long list questions,£ 1bn land deal


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,australian university walk back,ballooning enrolments,ballooning enrolment,ballooning enrolments,criticism of high education
1,australian university cahoot,and ballooning enrolments,unit monitor enrolment,unit monitor enrolments,university and
2,australian university plead,of university,monitor enrolment,walk back criticism higher education,back -PRON- criticism of high education
3,ballooning enrolment,’s uni,cut ballooning enrolment,university statistics,effect of university
4,australian university walk back criticism,to monitor enrolments,university urge coalition,universities cahoots,-PRON- criticism of high education
5,australian university cahoot government,cuts and ballooning enrolments,australian university plead,decline job cuts ballooning enrolments,modelling on effect of university
6,unit monitor enrolment,"courses , but fall",university plead fee rise,Universities blindsided,Sydney university
7,more university job,unit to monitor enrolments,almost half australian phd,mischievous error university,on effect of university fee
8,accuse australian university cahoot government,universities of,australian phd,merge faculties,university warn
9,struggle university,"hikes , official reveals",Sydney university ask staff,criticism higher education,australian university of


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,project australian university,universities ' fears over,blindside government,universities fears,"foreign government ,"
1,ten thousand research project australian university,universities ' blindsided ',allay university fear foreign,universities smacks McCarthyist,australian university ' blindside
2,groundless vilification,universities ' blindsided,university smack McCarthyist campaign,briefing allay universities fears,australian university ' blindside '
3,foreign government Albanese say,briefing to allay universities,foreign government Albanese,unwind Darwin port sale,", australian university"
4,research project australian university,to allay universities,vilification work,universities blindsided,"foreign government , Albanese say"
5,smack McCarthyist campaign say,by universities,global deal university smack,universities smacks McCarthyist campaign,"project , australian university"
6,australian university blindside government seek,to unwind Darwin port,university blindside government,foreign interference Australian academia,"foreign government , Albanese"
7,researcher condemn groundless,universities smacks of ‘,blindside government seek,allay universities fears,"research project , australian university"
8,deal university smack,deals by universities,override state pact foreign,unwind Darwin port,foreign interference in
9,foreign government Albanese,unwind Darwin port sale,foreign government Albanese say,universities blindsided government seeking,australian university '


Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,put gaelic,gaelic committee,gaelic job,life gaelic,", a gaelic"
1,gaelic give precedence parity,the gaelic,say gaelic,gaelic said,of gaelic be
2,trend put gaelic,gaelic .,charge promote gaelic,said gaelic,. gaelic
3,gaelic say,as a gaelic,last week gaelic,gaelic inextricably linked,", "" -PRON- say . gaelic"
4,say gaelic give precedence,“ the gaelic,put gaelic,habitual gaelic,the gaelic
5,siar increase gaelic,", a gaelic",job future gaelic,bòrd gàidhlig official gaelic,may assume that the gaelic
6,gaelic figure put,gaelic in,gaelic fall,however gaelic,of gaelic
7,may assume gaelic,decline of gaelic .,gaelic job future gaelic,manx gaelic isle man,by gaelic
8,say gaelic,. gaelic,gaelic job future,gaelic isle man,". "" the gaelic"
9,invest gaelic,gaelic in the,protect gaelic,investing gaelic,"gaelic ,"


In [48]:
## get correlations
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score
import pandas as pd 

# Adjusted mutual information between the label arrays of every pair of runs:
# one column per run name, row i holds the score against the i-th run in ap_runs.
data = {
    run.name: [adjusted_mutual_info_score(run.labels, run2.labels) for run2 in ap_runs]
    for run in ap_runs
}

pd.DataFrame(data)

Unnamed: 0,run_with_all,run_with_none,run_with_no_phrasing,run_with_no_lemmatize,run_with_no_postag
0,1.0,0.535477,0.619525,0.613201,0.550287
1,0.535477,1.0,0.540951,0.536786,0.545302
2,0.619525,0.540951,1.0,0.581408,0.534625
3,0.613201,0.536786,0.581408,1.0,0.540062
4,0.550287,0.545302,0.534625,0.540062,1.0
