# TOPIC RANK

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Keyphrase-matching baseline using pke's TopicRank extractor: extract
# keyphrases from one document, then predict 1 for every evaluation text
# that contains at least one of them (0 otherwise) and print a report.
import pandas as pd
import numpy as np
from collections import Counter
from pke.unsupervised import TopicRank
from sklearn.metrics import classification_report

# Build the extractor and feed it the plain-text document. The language is
# English and normalization is set to stemming.
extractor = TopicRank()
extractor.load_document(
    input='../../RQ1.1/train_data/document/test/test.txt',
    language="en",
    normalization='stemming',
)

# Candidates are restricted to nouns, proper nouns and adjectives.
extractor.candidate_selection(pos={'NOUN', 'PROPN', 'ADJ'})

# Weight the candidates; `threshold` and `method` are the clustering
# similarity threshold and linkage method passed to TopicRank.
extractor.candidate_weighting(threshold=0.1, method='average')

# Keep only the surface form of the 10 best-scored candidates.
allkeyphrases = [
    phrase for phrase, _score in extractor.get_n_best(n=10, stemming=False)
]

# Evaluation data: tab-separated file with "text" and "label" columns.
df = pd.read_csv("../../RQ1.1/train_data/tsv/test2.tsv", delimiter="\t")
# Commas are stripped from the texts before substring matching.
texts = [raw.replace(",", "") for raw in df["text"]]
# Binarize the labels: 0 stays 0, everything else becomes 1.
labels = [0 if value == 0 else 1 for value in df["label"]]

# For each text record [1] if any keyphrase occurs in it, else [0].
# NOTE(review): a text equal to one that already matched earlier is scored
# [0] because of the `removed_text` check - confirm this is intended.
overall_evidence = []
removed_text = []
for sentence in texts:
    is_new = sentence not in removed_text
    matched = is_new and any(kp in sentence for kp in allkeyphrases)
    if matched:
        removed_text.append(sentence)
    overall_evidence.append([1] if matched else [0])

# Majority vote over each evidence list (each list holds a single value).
ypred = [Counter(votes).most_common(1)[0][0] for votes in overall_evidence]
print(ypred)

print(classification_report(labels, ypred, digits=5))

[1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0]
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.75000   0.85714       184

    accuracy                        0.75000       184
   macro avg    0.50000   0.37500   0.42857       184
weighted avg    1.00000   0.75000   0.85714       184



  _warn_prf(average, modifier, msg_start, len(result))


# TEXT RANK

In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Keyphrase-matching baseline using pke's TextRank extractor: extract
# keyphrases from one document, then predict 1 for every evaluation text
# that contains at least one of them (0 otherwise) and print a report.
import pandas as pd
import numpy as np
from collections import Counter
import pke
from pke.unsupervised import TopicRank
from sklearn.metrics import classification_report

# Part-of-speech tags allowed as graph nodes.
pos = {'NOUN', 'PROPN', 'ADJ'}

# 1. create a TextRank extractor.
extractor = pke.unsupervised.TextRank()

# 2. load the content of the document (no normalization).
extractor.load_document(
    input='../../RQ1.1/train_data/document/test/test.txt',
    language='en',
    normalization=None,
)

# 3. build the graph representation of the document and rank the words.
#    Keyphrase candidates are composed from the 33-percent
#    highest-ranked words.
extractor.candidate_weighting(window=2, pos=pos, top_percent=0.33)

# 4. keep only the surface form of the 200 best-scored candidates.
allkeyphrases = [
    phrase for phrase, _score in extractor.get_n_best(n=200, stemming=False)
]

# Evaluation data: tab-separated file with "text" and "label" columns.
df = pd.read_csv("../../RQ1.1/train_data/tsv/test2.tsv", delimiter="\t")
# Commas are stripped from the texts before substring matching.
texts = [raw.replace(",", "") for raw in df["text"]]
# Binarize the labels: 0 stays 0, everything else becomes 1.
labels = [0 if value == 0 else 1 for value in df["label"]]

# For each text record [1] if any keyphrase occurs in it, else [0].
# NOTE(review): a text equal to one that already matched earlier is scored
# [0] because of the `removed_text` check - confirm this is intended.
overall_evidence = []
removed_text = []
for sentence in texts:
    is_new = sentence not in removed_text
    matched = is_new and any(kp in sentence for kp in allkeyphrases)
    if matched:
        removed_text.append(sentence)
    overall_evidence.append([1] if matched else [0])

# Majority vote over each evidence list (each list holds a single value).
ypred = [Counter(votes).most_common(1)[0][0] for votes in overall_evidence]
print(ypred)

print(classification_report(labels, ypred, digits=5))



[1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.84783   0.91765       184

    accuracy                        0.84783       184
   macro avg    0.50000   0.42391   0.45882       184
weighted avg    1.00000   0.84783   0.91765       184



  _warn_prf(average, modifier, msg_start, len(result))


# SINGLE RANK

In [3]:
# Keyphrase-matching baseline using pke's SingleRank extractor: extract
# keyphrases from one document, then predict 1 for every evaluation text
# that contains at least one of them (0 otherwise) and print a report.
#
# pandas and Counter are imported here so the cell also runs on its own
# (e.g. after a kernel restart) instead of relying on earlier cells.
from collections import Counter

import pandas as pd
import pke
from sklearn.metrics import classification_report

# define the set of valid Part-of-Speeches
pos = {'NOUN', 'PROPN', 'ADJ'}

# 1. create a SingleRank extractor.
extractor = pke.unsupervised.SingleRank()

# 2. load the content of the document (no normalization).
extractor.load_document(input='../../RQ1.1/train_data/document/test/test.txt',
                        language='en',
                        normalization=None)

# 3. select the longest sequences of nouns and adjectives as candidates.
extractor.candidate_selection(pos=pos)

# 4. weight the candidates using the sum of their word's scores that are
#    computed using random walk. In the graph, nodes are words of
#    certain part-of-speech (nouns and adjectives) that are connected if
#    they occur in a window of 10 words.
extractor.candidate_weighting(window=10,
                              pos=pos)

# 5. keep the surface form of the 100 highest-scored candidates.
allkeyphrases = [
    phrase for phrase, _score in extractor.get_n_best(n=100, stemming=False)
]

# Evaluation data: tab-separated file with "text" and "label" columns.
# NOTE(review): this path is spelled differently from the
# "../../RQ1.1/train_data/..." paths used by the other cells - presumably
# it resolves to the same file; verify.
df = pd.read_csv("../train_data/tsv/test2.tsv", delimiter="\t")
# Commas are stripped from the texts before substring matching.
texts = [raw.replace(",", "") for raw in df["text"]]
# Binarize the labels: 0 stays 0, everything else becomes 1.
labels = [0 if value == 0 else 1 for value in df["label"]]

# For each text record [1] if any keyphrase occurs in it, else [0].
# NOTE(review): a text equal to one that already matched earlier is scored
# [0] because of the `removed_text` check - confirm this is intended.
overall_evidence = []
removed_text = []
for sentence in texts:
    is_new = sentence not in removed_text
    matched = is_new and any(kp in sentence for kp in allkeyphrases)
    if matched:
        removed_text.append(sentence)
    overall_evidence.append([1] if matched else [0])

# Majority vote over each evidence list (each list holds a single value).
ypred = [Counter(votes).most_common(1)[0][0] for votes in overall_evidence]
print(ypred)

print(classification_report(labels, ypred, digits=5))

[1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0]
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.56522   0.72222       184

    accuracy                        0.56522       184
   macro avg    0.50000   0.28261   0.36111       184
weighted avg    1.00000   0.56522   0.72222       184



  _warn_prf(average, modifier, msg_start, len(result))


# POSITION RANK

In [4]:
# Keyphrase-matching baseline using pke's PositionRank extractor: extract
# keyphrases from one document, then predict 1 for every evaluation text
# that contains at least one of them (0 otherwise) and print a report.
#
# pandas and Counter are imported here so the cell also runs on its own
# (e.g. after a kernel restart) instead of relying on earlier cells.
from collections import Counter

import pandas as pd
import pke
from sklearn.metrics import classification_report

# define the valid Part-of-Speeches to occur in the graph
pos = {'NOUN', 'PROPN', 'ADJ'}

# define the grammar for selecting the keyphrase candidates
grammar = "NP: {<ADJ>*<NOUN|PROPN>+}"

# 1. create a PositionRank extractor.
extractor = pke.unsupervised.PositionRank()

# 2. load the content of the document (no normalization).
extractor.load_document(input='../../RQ1.1/train_data/document/test/test.txt',
                        language='en',
                        normalization=None)

# 3. select the noun phrases matching the grammar as keyphrase candidates,
#    with maximum_word_number set to 5.
extractor.candidate_selection(grammar=grammar,
                              maximum_word_number=5)

# 4. weight the candidates using the sum of their word's scores that are
#    computed using random walk biased with the position of the words
#    in the document. In the graph, nodes are words (nouns and
#    adjectives only) that are connected if they occur in a window of
#    10 words.
extractor.candidate_weighting(window=10,
                              pos=pos)

# 5. keep the surface form of the 200 highest-scored candidates.
allkeyphrases = [
    phrase for phrase, _score in extractor.get_n_best(n=200, stemming=False)
]

# Evaluation data: tab-separated file with "text" and "label" columns.
df = pd.read_csv("../../RQ1.1/train_data/tsv/test2.tsv", delimiter="\t")
# Commas are stripped from the texts before substring matching.
texts = [raw.replace(",", "") for raw in df["text"]]
# Binarize the labels: 0 stays 0, everything else becomes 1.
labels = [0 if value == 0 else 1 for value in df["label"]]

# For each text record [1] if any keyphrase occurs in it, else [0].
# NOTE(review): a text equal to one that already matched earlier is scored
# [0] because of the `removed_text` check - confirm this is intended.
overall_evidence = []
removed_text = []
for sentence in texts:
    is_new = sentence not in removed_text
    matched = is_new and any(kp in sentence for kp in allkeyphrases)
    if matched:
        removed_text.append(sentence)
    overall_evidence.append([1] if matched else [0])

# Majority vote over each evidence list (each list holds a single value).
ypred = [Counter(votes).most_common(1)[0][0] for votes in overall_evidence]
print(ypred)

print(classification_report(labels, ypred, digits=5))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.89674   0.94556       184

    accuracy                        0.89674       184
   macro avg    0.50000   0.44837   0.47278       184
weighted avg    1.00000   0.89674   0.94556       184



# MULTIPARTITE RANK

In [5]:
# Keyphrase-matching baseline using pke's MultipartiteRank extractor:
# extract keyphrases from one document, then predict 1 for every evaluation
# text that contains at least one of them (0 otherwise) and print a report.
#
# pandas and Counter are imported here so the cell also runs on its own
# (e.g. after a kernel restart) instead of relying on earlier cells.
import string
from collections import Counter

import pandas as pd
import pke
from nltk.corpus import stopwords
from sklearn.metrics import classification_report

# 1. create a MultipartiteRank extractor.
extractor = pke.unsupervised.MultipartiteRank()

# 2. load the content of the document (library defaults for language and
#    normalization).
extractor.load_document(input='../../RQ1.1/train_data/document/test/test.txt')

# 3. select the longest sequences of nouns and adjectives, that do
#    not contain punctuation marks or stopwords as candidates.
pos = {'NOUN', 'PROPN', 'ADJ'}
stoplist = list(string.punctuation)
stoplist += ['-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-']
stoplist += stopwords.words('english')
extractor.candidate_selection(pos=pos, stoplist=stoplist)

# 4. build the Multipartite graph and rank candidates using random walk,
#    alpha controls the weight adjustment mechanism, see TopicRank for
#    threshold/method parameters.
extractor.candidate_weighting(alpha=1.1,
                              threshold=0.74,
                              method='average')

# 5. keep the surface form of the 100 highest-scored candidates.
allkeyphrases = [
    phrase for phrase, _score in extractor.get_n_best(n=100, stemming=False)
]

# Evaluation data: tab-separated file with "text" and "label" columns.
# NOTE(review): this cell reads test1.tsv while the other cells read
# test2.tsv - confirm which file is intended.
df = pd.read_csv("../../RQ1.1/train_data/tsv/test1.tsv", delimiter="\t")
# Commas are stripped from the texts before substring matching.
texts = [raw.replace(",", "") for raw in df["text"]]
# Binarize the labels: 0 stays 0, everything else becomes 1.
labels = [0 if value == 0 else 1 for value in df["label"]]

# For each text record [1] if any keyphrase occurs in it, else [0].
# NOTE(review): a text equal to one that already matched earlier is scored
# [0] because of the `removed_text` check - confirm this is intended.
overall_evidence = []
removed_text = []
for sentence in texts:
    is_new = sentence not in removed_text
    matched = is_new and any(kp in sentence for kp in allkeyphrases)
    if matched:
        removed_text.append(sentence)
    overall_evidence.append([1] if matched else [0])

# Majority vote over each evidence list (each list holds a single value).
ypred = [Counter(votes).most_common(1)[0][0] for votes in overall_evidence]
print(ypred)

print(classification_report(labels, ypred, digits=5))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.94022   0.96919       184

    accuracy                        0.94022       184
   macro avg    0.50000   0.47011   0.48459       184
weighted avg    1.00000   0.94022   0.96919       184



  _warn_prf(average, modifier, msg_start, len(result))
