In [1]:
import path
with path.Path('..'):
    from services.theme_extractor.preprocessing import ArticlePreprocessJob, ArticlePreprocessor 

    from services.theme_extractor.wv_model import  WVModelBuilder, WVModelJob

    from services.theme_extractor.clustering import Clusterer, ClusterJob

    from services.theme_extractor.keyword_extraction import KeywordExtractor

    from services.libs.data_model import ProcessedArticle, Theme

In [2]:
wvm = WVModelJob()

al = wvm.get_latest_article_load()

model = wvm.get_model_from_disk(al.id)

cj = ClusterJob(model, al.id)

In [3]:
articles = cj.filter_articles()

In [4]:

c = Clusterer(model, articles, al.id, from_scratch=True, min_cluster_size=3, cluster_selection_epsilon=0.1)


In [5]:
mapping = c.create_mapping()

In [6]:
import numpy as np
import en_core_web_sm

articles_in_class = np.array(articles)[:len(mapping)][mapping == 1]


titles = [art.title_words for art in articles_in_class]

vecs = list([model.docvecs[doc.id] for doc in articles_in_class])

In [7]:
import pandas as pd
def test_word_extraction_method(docs, method):

    keywords = method(docs);
    return keywords



In [14]:
%%cython

from typing import List
def generate_cngrams(words_list: List[str], n: int):
    ngrams_list = []
 
    for num in range(0, len(words_list) - (n - 1)):
        ngram = (words_list[num:num + n])
        ngrams_list.append(ngram)
 
    return ngrams_list

In [15]:
from scipy import spatial
from sklearn.metrics.pairwise import cosine_similarity

import math
import itertools
import statistics as s

from collections import Counter


def extract_ngram_keywords(articles_in_class):
    n_grams = []

    for i, art in enumerate(articles_in_class):
        words =  art.title_words if len(articles_in_class) > 5 else art.words;
        # n_grams += [[word] for word in words]
        n_grams += generate_cngrams(words, 2)
        n_grams += generate_cngrams(words, 3)
        n_grams += generate_cngrams(words, 4)


    print('n_grams obtained - there are {}'.format(len(n_grams)))

    n_grams.sort()
    n_grams = list(n_grams for n_grams,_ in itertools.groupby(n_grams))

    print('dupes removed - there are now {}'.format(len(n_grams)))

    scores = {}
    docvecs = np.array(vecs)

    p_vecs = [model.infer_vector(n_gram, steps=10) for n_gram in n_grams]
    p_vecs_arr = np.array(p_vecs).reshape(len(p_vecs), 400)
    sims = np.mean(cosine_similarity(p_vecs_arr, docvecs), axis=1)

    scores = dict(enumerate(sims))
    
    def convert_ngram_to_string(ngram: List[str]):
        return " ".join(ngram).replace("_", " ")

    return [convert_ngram_to_string(n_grams[p[0]]) for p in Counter(scores).most_common(10)]
    


In [10]:
a = np.array([
    [1 ,2 ,3],
    [3, 2, 1]
])

b = np.array([
    [1 ,2 ,3],
    [3, 2, 1],
    [2, 4, 6]
])

sim = cosine_similarity(a,b)

np.mean(sim, axis = 1)

array([0.9047619 , 0.80952381])

In [11]:
def get_most_common_words(articles_in_class):

    words = sum([art.words for art in articles_in_class], [])

    return [w[0] for w in Counter(words).most_common(10)]

def get_most_common_title_words(articles_in_class):

    words = sum([art.title_words for art in articles_in_class], [])

    return [w[0] for w in Counter(words).most_common(10)]

In [12]:
from gensim.summarization import keywords

def textrank_vanilla_blob(articles_in_class):
    words = []
    for article in articles_in_class:
        words += article.words
        
        
    
    return keywords(" ".join(words), split=True)[:10]

def textrank_vanilla(articles_in_class):
    kwords = []
    for article in articles_in_class:
        kwords += keywords(" ".join(article.words), split=True)
    
    return [w[0] for w in Counter(kwords).most_common(10)]

In [16]:
from IPython.display import display, HTML



def get_performance_for_keyword_extraction(name, keyword_extraction_method):

    descs = [
        'Boxing News',
        'Home Office English Tests Scandal',
        'Littering in the UK',
        'US Water Shutoffs',
        'South Korea Election 2020'
    ]

    data = {}

    for i, theme_id in enumerate([1, 10, 20, 64, 95]):
        articles_in_class = np.array(articles)[:len(mapping)][mapping == theme_id]
        desc = descs[i]
        data[desc] = keyword_extraction_method(articles_in_class)

    
    df = pd.DataFrame(data)

    display(HTML(df.to_html()))


methods = [
    ('ngram method', extract_ngram_keywords),
    ('common words', get_most_common_words),
    ('common words (from title)', get_most_common_title_words),
    ('textrank + vote', textrank_vanilla),
    ('textrank over all articles', textrank_vanilla_blob)
]

for name, meth in methods:
    get_performance_for_keyword_extraction(name, meth);
        
        # print([art.title for art in articles_in_class][:10])

n_grams obtained - there are 807
dupes removed - there are now 757
n_grams obtained - there are 138
dupes removed - there are now 132
n_grams obtained - there are 3591
dupes removed - there are now 3484
n_grams obtained - there are 114
dupes removed - there are now 108
n_grams obtained - there are 2682
dupes removed - there are now 2394


Unnamed: 0,Boxing News,Home Office English Tests Scandal,Littering in the UK,US Water Shutoffs,South Korea Election 2020
0,Andy Ruiz Jr rematch,english language test,employ company,moratorium end,testing monitoring
1,WBC heavyweight title,student accuse cheat,defra spokeswoman say,water bill rise,monitoring social distancing strict quarantine
2,boxer Maxim Dadashev,MPs hold inquiry,represent more half council england wales,clean water shutoff,check temperature
3,Joshua Ruiz Jr rematch,english test cheat,fly tipping penalty increase,layoff trigger,medical worker
4,boxer Maxim,act immigration,resource waste,million Americans,majority national assembly
5,fight Nurmagomedov Diaz Mayweather,hold inquiry,harm pet wildlife,million US,worry so be sure
6,Logan Paul boxing,face legal action,waste clearance,water shutoff,fall mortality rate
7,fight Kubrat Pulev,english test,local government association say council,trigger pandemic,accord korea
8,rematch confirm,should replace,tough guideline issue,bill rise,monitoring social distancing strict
9,fight Nurmagomedov Diaz,english test cheat claim,effective punitive deterrent,shutoff layoff trigger,win seat assembly


Unnamed: 0,Boxing News,Home Office English Tests Scandal,Littering in the UK,US Water Shutoffs,South Korea Election 2020
0,fight,home_office,council,water,south_korea
1,say,student,fine,bill,party
2,joshua,test,issue,shutoff,say
3,will,say,litter,city,election
4,go,cheat,say,say,moon
5,fury,uk,more,detroit,outbreak
6,when,english,also,household,polling_station
7,would,people,fly_tipping,resident,hold
8,fighter,study,fly,people,vote
9,ruiz,government,increase,running_water,voter


Unnamed: 0,Boxing News,Home Office English Tests Scandal,Littering in the UK,US Water Shutoffs,South Korea Election 2020
0,Anthony_Joshua,test,council,water,South_Korea
1,Ruiz,Home_Office,litter,shutoff,rule
2,Andy,English,unpunishe,million,party
3,Jr,scandal,many,US,win_election
4,fight,cheat,England_Wales,Detroit,landslide
5,Tyson,student,tough_penalty,pandemic,coronavirus_outbreak
6,Fury,MPs,need,suspend,south_korean
7,rematch,UK,curb,face,voter
8,Eddie,face,surge,lose,expect
9,Hearn,rush,fly,water_supply,return


Unnamed: 0,Boxing News,Home Office English Tests Scandal,Littering in the UK,US Water Shutoffs,South Korea Election 2020
0,fight,student,government,people,moon
1,boxing,testing,average,pandemic,coronavirus
2,joshua,people,councillor,month,glove
3,time,visa,use,running_water,vote
4,want,uk_government,council,family,party
5,ruiz,year,year,department,south_korea
6,round,fraud,sentence,water,elect
7,box,accusation,need,more_people,hold
8,look,questionable,fine week litterer,utility,country
9,know,cheating,council issue,work,wear_mask


Unnamed: 0,Boxing News,Home Office English Tests Scandal,Littering in the UK,US Water Shutoffs,South Korea Election 2020
0,fight_back,home_office,council issue,water,voting
1,fighting,student,littering,city,south_korea rule party
2,fights,cheat,fine week litterer,people,voter
3,confirm fight world,government,last_year,many_people,vote about_people
4,ruiz_jr,uk_government,relate litter,least_people,elect
5,box right_now,organise cheating,fly_tipping,pandemic,coronavirus
6,time,last_year,year enforcement,detroiter result,polling_station
7,time_when,almost_year,increase,resident,national_assembly election
8,times,people,increase_amount,pay,country
9,timing,hundred_people,more_people,afford,other_country
