# KPMINER MODEL

In [1]:
import pke
import pandas as pd
from collections import Counter
# Step 1: instantiate the KPMiner keyphrase extractor.
extractor = pke.unsupervised.KPMiner()

# Step 2: read the test document (English, no normalization requested).
extractor.load_document(
    input='../../RQ1.1/train_data/document/test/test.txt',
    language='en',
    normalization=None,
)

# Step 3: choose the keyphrase candidates ({1-5}-grams without punctuation
#    marks or stopwords). `lasf` is the least allowable seen frequency and
#    `cutoff` is the number of words after which candidates are filtered
#    out.
lasf = 5
cutoff = 200
extractor.candidate_selection(lasf=lasf, cutoff=cutoff)

# Step 4: score the candidates with the KPMiner weighting function, using
#    document frequencies loaded from a pre-computed file.
df = pke.load_document_frequency_file(input_file='../data/df_kea_test.tsv.gz')
alpha = 2.3
sigma = 3.0
extractor.candidate_weighting(df=df, alpha=alpha, sigma=sigma)

# Step 5: keep the phrases (scores are discarded) of the 100 best-scored
#    candidates, unstemmed.
allkeyphrases = [
    phrase for phrase, _ in extractor.get_n_best(n=100, stemming=False)
]

# Evaluate the extracted keyphrases as a binary classifier on the test set:
# a text is predicted 1 when it contains at least one keyphrase, else 0.
df = pd.read_csv("../../RQ1.1/train_data/tsv/test2.tsv", delimiter="\t")

# Commas are stripped from the texts before matching.
texts = [text.replace(",", "") for text in df["text"]]

# Collapse the labels to binary: 0 stays 0, every other value becomes 1.
labels = [0 if label == 0 else 1 for label in df["label"]]

# Each prediction depends only on the row's own text. No state is shared
# between rows, so a text that appears more than once in the file gets the
# same prediction every time, and scanning stops at the first matching
# keyphrase.
# NOTE(review): this is a case-sensitive substring test; confirm that the
# casing of the extracted keyphrases matches the casing of the texts.
ypred = [int(any(phr in txt for phr in allkeyphrases)) for txt in texts]
print(ypred)

from sklearn.metrics import classification_report
print(classification_report(labels, ypred, digits=5))



[1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.82609   0.90476       184

    accuracy                        0.82609       184
   macro avg    0.50000   0.41304   0.45238       184
weighted avg    1.00000   0.82609   0.90476       184



  _warn_prf(average, modifier, msg_start, len(result))


# YAKE MODEL

In [2]:
import pke
from nltk.corpus import stopwords

# Step 1: instantiate the YAKE keyphrase extractor.
extractor = pke.unsupervised.YAKE()

# Step 2: read the test document (English, no normalization requested).
extractor.load_document(
    input='../../RQ1.1/train_data/document/test/test.txt',
    language='en',
    normalization=None,
)

# Step 3: choose the keyphrase candidates ({1-3}-grams that contain no
#    punctuation marks and neither begin nor end with a stopword).
stoplist = stopwords.words('english')
extractor.candidate_selection(n=3, stoplist=stoplist)

# Step 4: score the candidates with the YAKE weighting scheme. `window` is
#    the size (in words) of the left/right context; `use_stems` switches
#    the weighting from words to stems.
window = 2
use_stems = False
extractor.candidate_weighting(
    window=window,
    stoplist=stoplist,
    use_stems=use_stems,
)

# Step 5: keep the phrases (scores are discarded) of the 100 best-scored
#    candidates, unstemmed.
#    NOTE(review): no redundancy threshold is passed here, so any removal
#    of near-duplicate keyphrases relies on get_n_best's defaults - confirm
#    against the installed pke version.
allkeyphrases = [
    phrase for phrase, _ in extractor.get_n_best(n=100, stemming=False)
]

# Evaluate the extracted keyphrases as a binary classifier on the test set:
# a text is predicted 1 when it contains at least one keyphrase, else 0.

# Imported here so this cell does not depend on an earlier cell having
# brought pandas into scope.
import pandas as pd

df = pd.read_csv("../../RQ1.1/train_data/tsv/test2.tsv", delimiter="\t")

# Commas are stripped from the texts before matching.
texts = [text.replace(",", "") for text in df["text"]]

# Collapse the labels to binary: 0 stays 0, every other value becomes 1.
labels = [0 if label == 0 else 1 for label in df["label"]]

# Each prediction depends only on the row's own text. No state is shared
# between rows, so a text that appears more than once in the file gets the
# same prediction every time, and scanning stops at the first matching
# keyphrase.
# NOTE(review): this is a case-sensitive substring test; confirm that the
# casing of the extracted keyphrases matches the casing of the texts.
ypred = [int(any(phr in txt for phr in allkeyphrases)) for txt in texts]
print(ypred)

from sklearn.metrics import classification_report
print(classification_report(labels, ypred, digits=5))

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
              precision    recall  f1-score   support

           0    0.00000   0.00000   0.00000         0
           1    1.00000   0.94022   0.96919       184

    accuracy                        0.94022       184
   macro avg    0.50000   0.47011   0.48459       184
weighted avg    1.00000   0.94022   0.96919       184

