## DOCUMENT FREQUENCY COUNTER BASED ON N-GRAMS

In [None]:
# -*- coding: utf-8 -*-

import logging
import sys
from string import punctuation

from pke import compute_document_frequency

# Show INFO-level log records in the terminal.
logging.basicConfig(level=logging.INFO)

# Directory holding the documents whose n-grams are counted.
input_dir = './train_data/document/test/'

# Destination of the document-frequency table (gzipped, tab-separated).
# NOTE(review): the training and testing cells load "df_kea_train.tsv.gz";
# presumably this cell is re-run with the train paths to produce that file
# -- confirm.
output_file = "df_kea_test.tsv.gz"

# Tokens left out of the counts: every punctuation character plus the
# lowercase bracket placeholder tokens.
stoplist = list(punctuation) + [
    '-lrb-', '-rrb-', '-lcb-', '-rcb-', '-lsb-', '-rsb-',
]

# Count the document frequencies and write them to `output_file`.
compute_document_frequency(
    input_dir=input_dir,
    output_file=output_file,
    extension='txt',           # input file extension
    language='en',             # language of the input files
    normalization="stemming",  # stem the words before counting
    stoplist=stoplist,         # tokens to skip
    delimiter='\t',            # tab separated output
    n=20,                      # n-grams up to length 20
)

## TRAINING KEA

In [None]:
# -*- coding: utf-8 -*-
# training

import logging
import pandas as pd
import pke

# Show INFO-level log records in the terminal.
logging.basicConfig(level=logging.INFO)

# Training documents and their gold-standard keyphrase annotations.
input_dir = './train_data/document/train/'
reference_file = "./train_data/gold-annotation/train_gold.txt"

# Document-frequency counts (gzipped, tab-separated) used as a training input.
df_file = "df_kea_train.tsv.gz"
logging.info('Loading df counts from {}'.format(df_file))
df_counts = pke.load_document_frequency_file(
    input_file=df_file,
    delimiter='\t',
)

# The trained model is pickled to this path.
output_mdl = "kea-model.pickle"

# Train a Kea model on the collection; the file/language/normalization
# settings mirror those used when the df counts were computed.
pke.train_supervised_model(
    model=pke.supervised.Kea(),
    input_dir=input_dir,
    reference_file=reference_file,
    df=df_counts,
    extension='txt',
    language='en',
    normalization="stemming",
    model_file=output_mdl,
)

## TRAINING WINGUS

In [None]:
# -*- coding: utf-8 -*-
# training

import logging
import pandas as pd
import pke

# Show INFO-level log records in the terminal.
logging.basicConfig(level=logging.INFO)

# Training documents and their gold-standard keyphrase annotations.
input_dir = './train_data/document/train/'
reference_file = "./train_data/gold-annotation/train_gold.txt"

# Document-frequency counts (gzipped, tab-separated); the same table the Kea
# training cell loads.
df_file = "df_kea_train.tsv.gz"
logging.info('Loading df counts from {}'.format(df_file))
df_counts = pke.load_document_frequency_file(input_file=df_file,
                                             delimiter='\t')

# The WINGUS model gets its own pickle. Writing it to "kea-model.pickle"
# would overwrite the Kea model, which the testing cell loads from that path
# with a Kea extractor.
output_mdl = "wingus-model.pickle"

# Train a WINGUS model on the collection and pickle it to `output_mdl`.
pke.train_supervised_model(input_dir=input_dir,
                           reference_file=reference_file,
                           model_file=output_mdl,
                           extension='txt',
                           language='en',
                           normalization="stemming",
                           df=df_counts,
                           model=pke.supervised.WINGUS())

## TESTING

In [None]:
import pke
from collections import Counter
# Build the keyphrase extractor. Kea is used here; the commented-out line is
# the WINGUS alternative. Whichever class is chosen, the `model_file` given to
# candidate_weighting() below must be a pickle trained with that same class.
extractor = pke.supervised.Kea()
#extractor = pke.supervised.WINGUS()

# Read the plain-text document the keyphrases are extracted from.
extractor.load_document('./train_data/document/train/train.txt')

# Pick the keyphrase candidates (for Kea: the 1-3 grams that neither start
# nor end with a stopword).
extractor.candidate_selection()

# Document-frequency counts needed by the weighting step.
df_counts = pke.load_document_frequency_file(
    input_file="df_kea_train.tsv.gz",
    delimiter='\t',
)

# Score every candidate with the trained model.
extractor.candidate_weighting(model_file="kea-model.pickle", df=df_counts)

# Keep the 10 best-scored candidates, unstemmed; their scores are dropped.
allkeyphrases = [
    keyphrase
    for keyphrase, _score in extractor.get_n_best(n=10, stemming=False)
]

# pandas is imported here because this cell's own import lines only bring in
# pke and Counter; without it the cell raises NameError on a fresh kernel.
import pandas as pd

# Evaluation set: a tab-separated file with "text" and "label" columns.
df = pd.read_csv("./train_data/tsv/test2.tsv", delimiter="\t")

# Texts with every comma removed.
texts = [text.replace(",", "") for text in df["text"]]

# Binary gold labels: 0 stays 0, any other value becomes 1.
labels = [0 if label == 0 else 1 for label in df["label"]]

# Flag each text with [1] when it contains at least one extracted keyphrase
# (plain substring match) and with [0] otherwise. Every text is judged on its
# own content, so a text that occurs several times in the file receives the
# same flag each time; any() stops at the first matching keyphrase.
overall_evidence = [
    [1] if any(phr in txt for phr in allkeyphrases) else [0]
    for txt in texts
]

# Each evidence list holds exactly one flag, which is the predicted label.
ypred = [evidence[0] for evidence in overall_evidence]
print(ypred)

# Per-class precision / recall / F1 of the predictions against the gold
# labels, printed with five decimals.
from sklearn.metrics import classification_report
print(classification_report(labels, ypred, digits=5))

# KEA Implementation with SVM rather than Naive Bayes

Previously we used KEA, whose original implementation is based on https://arxiv.org/abs/cs/9902007. Here we instead reimplement the same approach with an SVM-based model rather than the originally used Naive Bayes.

In [None]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.dummy import DummyClassifier

# Evaluation split: raw texts and integer labels from a tab-separated file
# with "text" and "label" columns.
testdata = pd.read_csv("./train_data/tsv/test1.tsv", delimiter="\t")
X_test = testdata.text.values
Y_test = np.array(testdata.label.values).astype(np.int32)

# Training split, same layout.
traindata = pd.read_csv("./train_data/tsv/train1.tsv", delimiter="\t")
X_train = traindata.text.values
Y_train = np.array(traindata.label.values).astype(np.int32)

# Binary targets: 0 stays 0, any other label becomes 1.
Y_test = [0 if label == 0 else 1 for label in Y_test]
Y_train = [0 if label == 0 else 1 for label in Y_train]

# Text classifier: unigram + bigram counts -> tf-idf weighting -> linear model
# trained with SGD. loss='hinge' yields a linear SVM, the model this section
# describes; a logistic loss would train logistic regression instead, and its
# legacy spelling 'log' is rejected by scikit-learn >= 1.3.
clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=0.001,
                          max_iter=100, random_state=42,
                          class_weight="balanced", warm_start=True)),
])

In [None]:
# Fit the pipeline on the training split, then label the test texts.
ypred = clf.fit(X_train, Y_train).predict(X_test)

# Per-class precision / recall / F1 against the gold test labels, printed
# with five decimals.
report = classification_report(Y_test, ypred, digits=5)
print(report)

