In [20]:
import path
with path.Path('..'):
    from services.theme_extractor.preprocessing import ArticlePreprocessJob, ArticlePreprocessor 

    from services.theme_extractor.wv_model import  WVModelBuilder

    from services.theme_extractor.clustering import Clusterer

    from services.theme_extractor.keyword_extraction import KeywordExtractor

    from services.libs.data_model import ProcessedArticle, Theme

In [2]:

# Job object used below to look up article loads and to fetch their articles.
apj = ArticlePreprocessJob()

# Id of the load returned by get_latest_article_load(); printed so the cell
# output records which load this notebook run was based on.
load_id = apj.get_latest_article_load().id
print(load_id)

dfd2c33b-bdb8-456d-bdff-b1e1ab014309


In [3]:
# Fetch the articles for the selected load. max_articles=10000 is presumably an
# upper bound on how many are returned (the recorded output is exactly 10000).
# NOTE: `articles` is read as a notebook-level global by run_ap_options().
articles = apj.get_articles_for_load(load_id, max_articles=10000)
# Bare last expression so the cell displays the number of fetched articles.
len(articles)

10000

In [22]:
from typing import List
from gensim.models import Doc2Vec

class APRun:
    """Configuration and, once executed, results of one preprocessing experiment.

    The constructor only stores the configuration (``name``, ``steps``,
    ``postags``). The result attributes declared below are NOT set by
    ``__init__``; ``calculate_runs()`` assigns them after the pipeline has run,
    so reading them on a fresh instance raises ``AttributeError``.

    Args:
        name: Label for the run. ``run_ap_options()`` also passes it on as the
            ``load_id`` argument of the pipeline components.
        steps: Preprocessing step names forwarded to ``ArticlePreprocessor``.
        postags: POS tags forwarded to ``ArticlePreprocessor`` as
            ``allowed_postags``.
    """

    # Result slots, filled in by calculate_runs(). The annotations are written
    # as strings so they are not evaluated when the class is created.
    processed_articles: "List[ProcessedArticle]"
    model: "Doc2Vec"
    labels: "List[int]"
    clusters: "List[Theme]"

    name: str

    def __init__(self, name, steps=['lemmatize', 'postag', 'phrasing'], postags=['NOUN', 'ADJ', 'VERB', 'ADV', 'PROPN']):
        self.name = name
        # Copy the incoming lists: the default arguments are single shared list
        # objects, so storing them by reference would let a mutation on one run
        # (e.g. run.steps.append(...)) leak into every later APRun().
        self.steps = list(steps)
        self.postags = list(postags)

    

In [23]:
def run_ap_options(run_options: APRun, source_articles=None, min_cluster_size=3, cluster_selection_epsilon=0.1):
    """Run preprocessing, embedding, clustering and theme extraction for one run.

    Args:
        run_options: The run configuration; its ``steps`` and ``postags`` are
            passed to ``ArticlePreprocessor`` and its ``name`` is used as the
            ``load_id`` argument of every pipeline component.
        source_articles: Articles to process. When omitted, the notebook-level
            ``articles`` variable is used, which is the original behaviour.
        min_cluster_size: Forwarded to ``Clusterer.create_mapping``.
        cluster_selection_epsilon: Forwarded to ``Clusterer.create_mapping``.

    Returns:
        A 4-tuple ``(processed_articles, model, labels, clusters)``.
    """
    if source_articles is None:
        # Backward-compatible fallback to the global populated by an earlier
        # cell; pass source_articles explicitly to avoid the hidden dependency.
        source_articles = articles

    # The run's name doubles as the identifier handed to the pipeline classes.
    load_id = run_options.name

    ap = ArticlePreprocessor(steps=run_options.steps, allowed_postags=run_options.postags)
    processed_articles = ap.preprocess_articles(source_articles, load_id)

    model = WVModelBuilder().build_wv_model(processed_articles)

    labels = Clusterer(model, processed_articles, load_id).create_mapping(
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
    )

    clusters = KeywordExtractor(model).create_themes(load_id, processed_articles, labels)

    return processed_articles, model, labels, clusters

def calculate_runs():
    """Build the experiment configurations, execute each one and return them.

    Two runs are created: 'run_with_all' (default APRun settings) and
    'run_with_no_phrasing' (steps limited to 'postag' and 'lemmatize'). After
    each run_ap_options() call the four results are stored on the run object.

    Returns:
        The list of APRun objects with processed_articles, model, labels and
        clusters populated.
    """
    experiments: List[APRun] = [
        APRun('run_with_all'),
        APRun('run_with_no_phrasing', steps=['postag', 'lemmatize']),
    ]

    for experiment in experiments:
        outcome = run_ap_options(experiment)
        # Unpack the 4-tuple straight onto the run, in the order it is returned.
        (
            experiment.processed_articles,
            experiment.model,
            experiment.labels,
            experiment.clusters,
        ) = outcome

    return experiments
    
# Execute every configured run once and keep the populated APRun objects in
# `ap_runs` so later cells can inspect them without recomputing.
ap_runs = calculate_runs()

UMAP(a=None, angular_rp_forest=False, b=None, init='spectral',
     learning_rate=1.0, local_connectivity=1.0, metric='cosine',
     metric_kwds=None, min_dist=0.1, n_components=100, n_epochs=None,
     n_neighbors=15, negative_sample_rate=5, random_state=666,
     repulsion_strength=1.0, set_op_mix_ratio=1.0, spread=1.0,
     target_metric='categorical', target_metric_kwds=None,
     target_n_neighbors=-1, target_weight=0.5, transform_queue_size=4.0,
     transform_seed=42, verbose=True)
Construct fuzzy simplicial set
Tue Sep 15 18:44:46 2020 Finding Nearest Neighbors
Tue Sep 15 18:44:46 2020 Building RP forest with 10 trees
Tue Sep 15 18:44:47 2020 NN descent for 13 iterations
	 0  /  13
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
	 5  /  13
Tue Sep 15 18:44:53 2020 Finished Nearest Neighbor Search
Tue Sep 15 18:44:53 2020 Construct embedding
	completed  0  /  500 epochs
	completed  50  /  500 epochs
	completed  100  /  500 epochs
	completed  150  /  500 epochs
	completed  200  /  50

In [25]:
# Spot-check themes 1, 11, 21, 31 and 41 of the second run
# ('run_with_no_phrasing'): print each theme's name and its theme words.
themes = ap_runs[1].clusters
for i in range(1, 42, 10):
    cluster = themes[i]
    print(cluster.name)
    print(cluster.theme_words)

Blue Origin Musk spacex
['Nasa astronaut', 'Musk spacex', 'Origin Musk spacex', 'astronaut US', 'Musk spacex build new', 'capsule dock', 'spaceflight suddenly resign day', 'crew capsule dock International', 'spaceflight suddenly']
thrash Cologne Frauen Bundesliga
['Cologne Frauen Bundesliga', 'Coventry beat', 'Frauen Bundesliga', 'thrash Cologne', 'Wolfsburg thrash', 'Wolfsburg thrash Cologne Frauen', 'Wolfsburg thrash Cologne', 'thrash Cologne Frauen', 'beat Ipswich']
fury Wilder II
['fury Wilder', 'Anthony Joshua would underdog', 'Joshua would underdog', 'would underdog', 'Joshua would underdog Tyson', 'underdog Tyson Fury', 'would underdog Tyson', 'underdog Tyson Fury say', 'Wilder II fighter']
Rory McIlroy produce shot
['Thomas lead clutch', 'hold Rory McIlroy', 'Hatton hold Rory McIlroy', 'Rory McIlroy', 'Thomas lead clutch top', 'lead clutch', 'Hatton hold Rory', 'produce shot', 'hold nerve secure victory']
teammate give wither
['Jordan teammate', 'bree sorry LeBron', 'LeBron Jam