# TOPIC RANK

In [19]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report,f1_score
# this example uses TopicRank
from pke.unsupervised import TopicRank
from collections import Counter

# Purpose of this cell: rank keyphrases of one document with TopicRank, then
# measure how well "text contains one of the top-k keyphrases" works as a
# binary classifier on the test TSV, for k = 1 .. 199 (macro-averaged F1).

# create a TopicRank extractor
extractor = TopicRank()

# load the content of the document (a .txt path is passed here)
# the input language is set to English (used for the stoplist)
# normalization is set to stemming
extractor.load_document(input='../../RQ1.1/train_data/document/test/test.txt',
                        language="en",
                        normalization='stemming')

# select the keyphrase candidates, for TopicRank the longest sequences of
# nouns and adjectives
extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})

# weight the candidates using a random walk. The threshold parameter sets the
# minimum similarity for clustering, and the method parameter defines the
# linkage method
extractor.candidate_weighting(threshold=0.1,
                              method='average')

# The evaluation data does not depend on k, so it is read and cleaned once
# instead of on every iteration of the loop below.
df = pd.read_csv("../../RQ1.1/train_data/tsv/test1.tsv", delimiter="\t")
# commas are stripped from the texts before substring matching
texts = [i.replace(",", "") for i in df["text"]]
# binarise the labels: 0 stays 0, every other value becomes 1
labels = [0 if i == 0 else 1 for i in df["label"]]

# kcounter maps k -> macro-F1 obtained with the k best keyphrases
kcounter = {}
for kk in range(1, 200):
    allkeyphrases = [keyphrase for keyphrase, _ in
                     extractor.get_n_best(n=kk, stemming=False)]

    # A text is predicted 1 when it contains at least one keyphrase as a
    # substring, otherwise 0.
    # NOTE(review): once a text has matched, an identical text appearing
    # later in the TSV is predicted 0 even if it contains a keyphrase. This
    # reproduces the original behaviour exactly -- confirm it is intended.
    matched_texts = set()
    ypred = []
    for txt in texts:
        hit = (txt not in matched_texts
               and any(phr in txt for phr in allkeyphrases))
        if hit:
            matched_texts.add(txt)
        ypred.append(int(hit))

    kcounter[kk] = f1_score(labels, ypred, average='macro')

# print one macro-F1 value per line, in increasing order of k
for f1 in kcounter.values():
    print(f1)


0.44105691056910573
0.469083888712578
0.5588493010339575
0.6067337729954877
0.5620004633383529
0.5723805997778602
0.5788288288288288
0.5738636363636364
0.5796649547566642
0.5771793318734566
0.5887445887445887
0.5886213386092747
0.5915138207181607
0.5915138207181607
0.5855010453639324
0.5855010453639324
0.5855010453639324
0.5883769360926722
0.59705964669663
0.6058333333333333
0.6058333333333333
0.6024942853301063
0.5897192458971925
0.578397212543554
0.5771548753115003
0.5799587951122478
0.5799587951122478
0.5827755466309683
0.5757390929965557
0.5757390929965557
0.5713716108452951
0.5713716108452951
0.5713716108452951
0.5713716108452951
0.5713716108452951
0.5713716108452951
0.5713716108452951
0.5713716108452951
0.5713716108452951
0.5741405381770482
0.5741405381770482
0.5741405381770482
0.559465281997459
0.559465281997459
0.559465281997459
0.559465281997459
0.559465281997459
0.5621667654028436
0.5621667654028436
0.5621667654028436
0.5621667654028436
0.5648815174698908
0.5676100628930818
0

# TEXT RANK

In [20]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
from pke.unsupervised import TopicRank
from collections import Counter
import pke
# f1_score is used below; imported here so the cell also runs on its own
# in a fresh kernel
from sklearn.metrics import f1_score

# Purpose of this cell: rank keyphrases of one document with TextRank, then
# measure how well "text contains one of the top-k keyphrases" works as a
# binary classifier on the test TSV, for k = 1 .. 199 (macro-averaged F1).

# define the set of valid Part-of-Speeches
pos = {'NOUN', 'PROPN', 'ADJ'}

# 1. create a TextRank extractor.
extractor = pke.unsupervised.TextRank()

# 2. load the content of the document.
extractor.load_document(input='../../RQ1.1/train_data/document/test/test.txt',
                        language='en',
                        normalization=None)

# 3. build the graph representation of the document and rank the words.
#    Keyphrase candidates are composed from the 33-percent
#    highest-ranked words.
extractor.candidate_weighting(window=2,
                              pos=pos,
                              top_percent=0.33)

# 4. load the evaluation data. It does not depend on k, so it is read and
#    cleaned once instead of on every iteration of the loop below.
df = pd.read_csv("../../RQ1.1/train_data/tsv/test1.tsv", delimiter="\t")
# commas are stripped from the texts before substring matching
texts = [i.replace(",", "") for i in df["text"]]
# binarise the labels: 0 stays 0, every other value becomes 1
labels = [0 if i == 0 else 1 for i in df["label"]]

# 5. score the k best keyphrases for every k; kcounter maps k -> macro-F1
kcounter = {}
for kk in range(1, 200):
    allkeyphrases = [keyphrase for keyphrase, _ in
                     extractor.get_n_best(n=kk, stemming=False)]

    # A text is predicted 1 when it contains at least one keyphrase as a
    # substring, otherwise 0.
    # NOTE(review): once a text has matched, an identical text appearing
    # later in the TSV is predicted 0 even if it contains a keyphrase. This
    # reproduces the original behaviour exactly -- confirm it is intended.
    matched_texts = set()
    ypred = []
    for txt in texts:
        hit = (txt not in matched_texts
               and any(phr in txt for phr in allkeyphrases))
        if hit:
            matched_texts.add(txt)
        ypred.append(int(hit))

    kcounter[kk] = f1_score(labels, ypred, average='macro')

# print one macro-F1 value per line, in increasing order of k
for f1 in kcounter.values():
    print(f1)




0.25472047389855607
0.260752688172043
0.260752688172043
0.26673148598282287
0.26673148598282287
0.27853258878189624
0.2763157894736843
0.28210999139552856
0.28210999139552856
0.28785498137802606
0.29355161753820697
0.29920074134136454
0.29920074134136454
0.29920074134136454
0.30480317804261464
0.30480317804261464
0.30480317804261464
0.30480317804261464
0.30480317804261464
0.30480317804261464
0.31035973711518505
0.31035973711518505
0.31587121266590934
0.32133838383838387
0.32133838383838387
0.3242857142857143
0.32180073842820284
0.32713098896096504
0.32713098896096504
0.32713098896096504
0.33766859344894024
0.3428773418168965
0.3531781242517711
0.3531781242517711
0.35827147998200626
0.3633276740237691
0.3633276740237691
0.3683473389355742
0.3656419074780859
0.3706039968739533
0.3706039968739533
0.37553112940750333
0.38042389853137515
0.37764665046858725
0.3824850299401198
0.3824850299401198
0.4015151515151515
0.4015151515151515
0.4015151515151515
0.4015151515151515
0.4336547510263089
0.

# SINGLE RANK

In [21]:
import pke
# pandas and f1_score are used below; imported here so the cell also runs on
# its own in a fresh kernel
import pandas as pd
from sklearn.metrics import f1_score

# Purpose of this cell: rank keyphrases of one document with SingleRank, then
# measure how well "text contains one of the top-k keyphrases" works as a
# binary classifier on the test TSV, for k = 1 .. 199 (macro-averaged F1).

# define the set of valid Part-of-Speeches
pos = {'NOUN', 'PROPN', 'ADJ'}

# 1. create a SingleRank extractor.
extractor = pke.unsupervised.SingleRank()

# 2. load the content of the document.
extractor.load_document(input='../../RQ1.1/train_data/document/test/test.txt',
                        language='en',
                        normalization=None)

# 3. select the longest sequences of nouns and adjectives as candidates.
extractor.candidate_selection(pos=pos)

# 4. weight the candidates using the sum of their word's scores that are
#    computed using random walk. In the graph, nodes are words of
#    certain part-of-speech (nouns and adjectives) that are connected if
#    they occur in a window of 10 words.
extractor.candidate_weighting(window=10,
                              pos=pos)

# 5. load the evaluation data. It does not depend on k, so it is read and
#    cleaned once instead of on every iteration of the loop below.
df = pd.read_csv("../../RQ1.1/train_data/tsv/test1.tsv", delimiter="\t")
# commas are stripped from the texts before substring matching
texts = [i.replace(",", "") for i in df["text"]]
# binarise the labels: 0 stays 0, every other value becomes 1
labels = [0 if i == 0 else 1 for i in df["label"]]

# 6. score the k best keyphrases for every k; kcounter maps k -> macro-F1
kcounter = {}
for kk in range(1, 200):
    allkeyphrases = [keyphrase for keyphrase, _ in
                     extractor.get_n_best(n=kk, stemming=False)]

    # A text is predicted 1 when it contains at least one keyphrase as a
    # substring, otherwise 0.
    # NOTE(review): once a text has matched, an identical text appearing
    # later in the TSV is predicted 0 even if it contains a keyphrase. This
    # reproduces the original behaviour exactly -- confirm it is intended.
    matched_texts = set()
    ypred = []
    for txt in texts:
        hit = (txt not in matched_texts
               and any(phr in txt for phr in allkeyphrases))
        if hit:
            matched_texts.add(txt)
        ypred.append(int(hit))

    kcounter[kk] = f1_score(labels, ypred, average='macro')

# print one macro-F1 value per line, in increasing order of k
for f1 in kcounter.values():
    print(f1)


0.25472047389855607
0.25262909133876876
0.25862907146329606
0.2645762313388974
0.2645762313388974
0.2704715003883979
0.2704715003883979
0.2704715003883979
0.2763157894736843
0.2763157894736843
0.28210999139552856
0.44923627430036317
0.45733333333333337
0.45733333333333337
0.4613519167477248
0.4613519167477248
0.4699235626453972
0.4699235626453972
0.4699235626453972
0.4738520408163266
0.48165551661888073
0.4855311865621144
0.4855311865621144
0.4855311865621144
0.4820954907161804
0.48594042079306377
0.48594042079306377
0.48976887192536045
0.48976887192536045
0.48976887192536045
0.49785646236900605
0.5015809178584185
0.5015809178584185
0.5015809178584185
0.5015809178584185
0.5052910052910053
0.5089870253541244
0.5126692763436309
0.5199936528086322
0.5236363636363637
0.5236363636363637
0.5308842781766969
0.5308842781766969
0.5344900550148117
0.5308098242272745
0.5308098242272745
0.5271164021164021
0.5271164021164021
0.5306856817790948
0.5306856817790948
0.5306856817790948
0.530685681779094

# POSITION RANK

In [22]:
import pke
# pandas and f1_score are used below; imported here so the cell also runs on
# its own in a fresh kernel
import pandas as pd
from sklearn.metrics import f1_score

# Purpose of this cell: rank keyphrases of one document with PositionRank,
# then measure how well "text contains one of the top-k keyphrases" works as
# a binary classifier on the test TSV, for k = 1 .. 199 (macro-averaged F1).

# define the valid Part-of-Speeches to occur in the graph
pos = {'NOUN', 'PROPN', 'ADJ'}

# define the grammar for selecting the keyphrase candidates
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

# 1. create a PositionRank extractor.
extractor = pke.unsupervised.PositionRank()

# 2. load the content of the document.
extractor.load_document(input='../../RQ1.1/train_data/document/test/test.txt',
                        language='en',
                        normalization=None)

# 3. select the noun phrases up to 5 words as keyphrase candidates.
extractor.candidate_selection(grammar=grammar,
                              maximum_word_number=5)

# 4. weight the candidates using the sum of their word's scores that are
#    computed using random walk biased with the position of the words
#    in the document. In the graph, nodes are words (nouns and
#    adjectives only) that are connected if they occur in a window of
#    10 words.
extractor.candidate_weighting(window=10,
                              pos=pos)

# 5. load the evaluation data. It does not depend on k, so it is read and
#    cleaned once instead of on every iteration of the loop below.
df = pd.read_csv("../../RQ1.1/train_data/tsv/test1.tsv", delimiter="\t")
# commas are stripped from the texts before substring matching
texts = [i.replace(",", "") for i in df["text"]]
# binarise the labels: 0 stays 0, every other value becomes 1
labels = [0 if i == 0 else 1 for i in df["label"]]

# 6. score the k best keyphrases for every k; kcounter maps k -> macro-F1
kcounter = {}
for kk in range(1, 200):
    allkeyphrases = [keyphrase for keyphrase, _ in
                     extractor.get_n_best(n=kk, stemming=False)]

    # A text is predicted 1 when it contains at least one keyphrase as a
    # substring, otherwise 0.
    # NOTE(review): once a text has matched, an identical text appearing
    # later in the TSV is predicted 0 even if it contains a keyphrase. This
    # reproduces the original behaviour exactly -- confirm it is intended.
    matched_texts = set()
    ypred = []
    for txt in texts:
        hit = (txt not in matched_texts
               and any(phr in txt for phr in allkeyphrases))
        if hit:
            matched_texts.add(txt)
        ypred.append(int(hit))

    kcounter[kk] = f1_score(labels, ypred, average='macro')

# print one macro-F1 value per line, in increasing order of k
for f1 in kcounter.values():
    print(f1)


0.25472047389855607
0.260752688172043
0.26673148598282287
0.27265781121429405
0.27265781121429405
0.27265781121429405
0.4451570663139188
0.4451570663139188
0.44923627430036317
0.4459565838103244
0.4459565838103244
0.4540233713973537
0.4540233713973537
0.4540233713973537
0.46201141853315764
0.46597680357522875
0.46597680357522875
0.46597680357522875
0.4748010610079576
0.4786487955560858
0.4786487955560858
0.4786487955560858
0.48247985581000846
0.49009324009324007
0.49009324009324007
0.49009324009324007
0.49009324009324007
0.49387619993379683
0.5013962229192309
0.5013962229192309
0.5013962229192309
0.5013962229192309
0.5051339049433683
0.5051339049433683
0.5126692763436309
0.5163380542441914
0.5199936528086322
0.5199936528086322
0.5199936528086322
0.5199936528086322
0.5236363636363637
0.5236363636363637
0.5199936528086322
0.5163380542441914
0.5163380542441914
0.5163380542441914
0.5163380542441914
0.5318152798467759
0.5352071439027961
0.5385906040268457
0.5419659156018068
0.54533333333333

# MULTIPARTITE RANKING

In [23]:
import pke
import pandas as pd
import string
from nltk.corpus import stopwords
# f1_score is used below; imported here so the cell also runs on its own
# in a fresh kernel
from sklearn.metrics import f1_score

# Purpose of this cell: rank keyphrases of one document with MultipartiteRank,
# then measure how well "text contains one of the top-k keyphrases" works as
# a binary classifier on the test TSV, for k = 1 .. 199 (macro-averaged F1).

# 1. create a MultipartiteRank extractor.
extractor = pke.unsupervised.MultipartiteRank()

# 2. load the content of the document.
extractor.load_document(input='../../RQ1.1/train_data/document/test/test.txt')

# 3. select the longest sequences of nouns and adjectives, that do
#    not contain punctuation marks or stopwords as candidates.
pos = {'NOUN', 'PROPN', 'ADJ'}
stoplist = list(string.punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
stoplist += stopwords.words('english')
extractor.candidate_selection(pos=pos, stoplist=stoplist)

# 4. build the Multipartite graph and rank candidates using random walk,
#    alpha controls the weight adjustment mechanism, see TopicRank for
#    threshold/method parameters.
extractor.candidate_weighting(alpha=1.1,
                              threshold=0.74,
                              method='average')

# 5. load the evaluation data. It does not depend on k, so it is read and
#    cleaned once instead of on every iteration of the loop below.
df = pd.read_csv("../../RQ1.1/train_data/tsv/test1.tsv", delimiter="\t")
# commas are stripped from the texts before substring matching
texts = [i.replace(",", "") for i in df["text"]]
# binarise the labels: 0 stays 0, every other value becomes 1
labels = [0 if i == 0 else 1 for i in df["label"]]

# 6. score the k best keyphrases for every k; kcounter maps k -> macro-F1
kcounter = {}
for kk in range(1, 200):
    allkeyphrases = [keyphrase for keyphrase, _ in
                     extractor.get_n_best(n=kk, stemming=False)]

    # A text is predicted 1 when it contains at least one keyphrase as a
    # substring, otherwise 0.
    # NOTE(review): once a text has matched, an identical text appearing
    # later in the TSV is predicted 0 even if it contains a keyphrase. This
    # reproduces the original behaviour exactly -- confirm it is intended.
    matched_texts = set()
    ypred = []
    for txt in texts:
        hit = (txt not in matched_texts
               and any(phr in txt for phr in allkeyphrases))
        if hit:
            matched_texts.add(txt)
        ypred.append(int(hit))

    kcounter[kk] = f1_score(labels, ypred, average='macro')

# print one macro-F1 value per line, in increasing order of k
for f1 in kcounter.values():
    print(f1)


0.33401143078562434
0.469083888712578
0.5346373702514358
0.6067337729954877
0.6259775586535193
0.6259775586535193
0.6322313455371651
0.6319250465549349
0.6350391576368329
0.6350391576368329
0.6263145456693844
0.6283783783783783
0.6345294188715719
0.5799923634975181
0.5828609749477078
0.5915138207181607
0.6002449979583504
0.6002449979583504
0.6031746031746033
0.6031746031746033
0.6061144860967057
0.5995748882826492
0.5995748882826492
0.5995748882826492
0.5995748882826492
0.5995748882826492
0.5961411860029371
0.5961411860029371
0.5961411860029371
0.6048850574712644
0.6048850574712644
0.6048850574712644
0.6048850574712644
0.5917909417288042
0.5917909417288042
0.5917909417288042
0.5917909417288042
0.5851323934869688
0.5851323934869688
0.5851323934869688
0.5908382782824836
0.5908382782824836
0.5785326625917311
0.5785326625917311
0.5785326625917311
0.5785326625917311
0.5813397129186604
0.5841607608081765
0.586996336996337
0.586996336996337
0.586996336996337
0.586996336996337
0.58699633699633