## Test Set

In [57]:
import pandas as pd
import numpy as np
import re
import pickle
from pprint import pprint

Load Test Set:

In [2]:
# Location of the pickled test-set DataFrame, relative to the notebook's working directory.
dataset_path = "../data/test_set.pkl"
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code;
# only load pickles that come from a trusted source.
df_test = pd.read_pickle(dataset_path)
# Show the loaded frame as the cell output.
df_test

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
0,2535.1.1,Abstract,\n This paper addresses the problem of track...,N_PD
1,2535.2.1,Introduction,\n The complexity and sophistication of the ...,N_PD
2,2535.2.2,Introduction,\n 2000)). We want to monitor the state of t...,N_PD
3,2535.2.3,Introduction,"\n In this paper, we propose a different app...",N_PD
4,2535.2.4,Introduction,\n which are expressed as discrete failure m...,N_PD
...,...,...,...,...
3547,2747.6.25,Automated Reasoning About Tests,\n Theorem 3 For Golog procedure Æ in normal...,N_PD
3548,2747.7.1,Summary,\n In this paper we presented results toward...,N_PD
3549,2747.7.2,Summary,\n We proposed specifying such complex tests...,N_PD
3550,2747.7.3,Summary,\n Sensing is integral to the operation of m...,N_PD


In [3]:
# Drop the subsections that have no text. The explicit copy makes the result an
# independent frame, so the column assignments performed in the following cells
# do not trigger pandas' SettingWithCopyWarning.
df_test = df_test.dropna(subset=['text_subsection']).copy()

In [4]:
# Encode the labels with a fixed mapping instead of factorize(): factorize numbers
# the labels by order of first appearance, while the evaluation cells below
# hard-code 0 = 'N_PD' and 1 = 'PD' (target_names=["N_PD", "PD"], prediction == 1).
label_to_id = {'N_PD': 0, 'PD': 1}

# Fail loudly on anything that is not one of the two expected labels.
unknown_labels = set(df_test['label_subsection'].unique()) - set(label_to_id)
if unknown_labels:
    raise ValueError("Unexpected labels in test set: %s" % sorted(map(str, unknown_labels)))

df_test['label_id'] = df_test['label_subsection'].map(label_to_id).astype('int64')
df_test

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,label_id
0,2535.1.1,Abstract,\n This paper addresses the problem of track...,N_PD,0
1,2535.2.1,Introduction,\n The complexity and sophistication of the ...,N_PD,0
2,2535.2.2,Introduction,\n 2000)). We want to monitor the state of t...,N_PD,0
3,2535.2.3,Introduction,"\n In this paper, we propose a different app...",N_PD,0
4,2535.2.4,Introduction,\n which are expressed as discrete failure m...,N_PD,0
...,...,...,...,...,...
3547,2747.6.25,Automated Reasoning About Tests,\n Theorem 3 For Golog procedure Æ in normal...,N_PD,0
3548,2747.7.1,Summary,\n In this paper we presented results toward...,N_PD,0
3549,2747.7.2,Summary,\n We proposed specifying such complex tests...,N_PD,0
3550,2747.7.3,Summary,\n Sensing is integral to the operation of m...,N_PD,0


Info:

In [5]:
# Size of the test set: number of subsections and number of distinct papers.
# The paper id is the part of the subsection id before the first '.'
# (e.g. '2535.1.1' -> '2535'); an id without any '.' is kept whole.
print("Subsections in test set = %s" % len(df_test.id_subsection))
id_paper_set = {subsection_id.split('.', 1)[0] for subsection_id in df_test.id_subsection}
print("Valid papers in test set = %s" % len(id_paper_set))

Subsections in training set = 3552
Valid papers in training set = 55


In [6]:
# Class balance of the test set: rows labelled 'PD' versus rows labelled 'N_PD'.
num_pd_subsections = len(df_test[df_test['label_subsection'] == 'PD'])
num_npd_subsections = len(df_test[df_test['label_subsection'] == 'N_PD'])
print("Subsections of 'Problem Description/Statement' in test set = %s" % num_pd_subsections)
print("Other subsections in test set = %s" % num_npd_subsections)

Subsections of 'Problem Description/Statement' in test set = 864
Other subsections in test set = 2688


In [7]:
##check:
#if num_pd_subsections == df_test.loc[df_test['label_id']==1].shape[0] and num_npd_subsections == df_test.loc[df_test['label_id']==0].shape[0]:
#    print('ok')
#else:
#    print('fail. re-check')

Text Cleaning:

In [8]:
def initial_text_cleaning(text):
    """Normalize a raw text string to lowercase alphabetic words.

    The text is lowercased and then passed through a fixed sequence of regex
    substitutions, in this order:

    1. remove newline characters (nothing is inserted in their place, so the
       words on either side of a newline are joined);
    2. remove bracketed spans opened by ``(``, ``[`` or ``{``;
    3. remove http/https URLs;
    4. remove every character that is not ``a-z`` or whitespace
       (digits and punctuation included);
    5. remove single-character words;
    6. collapse runs of two or more whitespace characters into one space.

    Leading and trailing whitespace is stripped from the result.
    """
    substitutions = (
        (r'\n', ''),
        (r'(\(|\[|\{)[^(\)|\]|\})]*(\)|\]|\})', ''),
        (r'http(s)?:\/\/\S+', ''),
        (r'[^a-z\s]', ''),
        (r'\b\w\b', ''),
        (r'\s{2,}', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


# cleaning text of stop words
from nltk.corpus import stopwords

def remove_stopwords(text, stopwords):
    """Return `text` without the whitespace-separated tokens found in `stopwords`.

    `stopwords` is any container supporting the `in` operator. The remaining
    tokens are re-joined with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stopwords, text.split())
    return ' '.join(kept_tokens)

# cleaning text of nonsense words
from nltk.corpus import words
# Vocabulary of the nltk `words` corpus, stored as a set for fast membership tests.
words_dictionary = set(words.words())
def remove_nonsensewords(text):
    """Keep only the whitespace-separated tokens of `text` that appear in `words_dictionary`.

    The surviving tokens are re-joined with single spaces.
    """
    return ' '.join(token for token in text.split() if token in words_dictionary)


# stemming and lemmatization
from nltk.stem.porter import PorterStemmer
# Shared Porter stemmer instance used by `stemming`.
porter = PorterStemmer()
def stemming(text):
    """Apply `porter.stem` to every whitespace-separated token of `text`.

    The stemmed tokens are re-joined with single spaces.
    """
    return ' '.join(map(porter.stem, text.split()))

from nltk.stem.wordnet import WordNetLemmatizer
# Shared WordNet lemmatizer instance used by `lemmatization`.
wordnet = WordNetLemmatizer()
def lemmatization(text):
    """Apply `wordnet.lemmatize` to every whitespace-separated token of `text`.

    The lemmatized tokens are re-joined with single spaces.
    """
    return ' '.join(map(wordnet.lemmatize, text.split()))

In [9]:
# text - cleaning:
df_test['text_subsection'] = df_test['text_subsection'].apply(initial_text_cleaning)

# remove stop-words:
# The final stop-word set combines nltk's English list, the project list read from
# `stopwords_file` (one entry per line) and a few layout words.
stopwords_file = "./resources/stopwords_list.txt"
stopwords_extended_list = stopwords.words('english')
with open(stopwords_file, 'r') as file:
    stopwords_extended_list.extend([line.replace('\n', '') for line in file.readlines()])
stopwords_extended_list.extend(['table', 'tab', 'figure', 'fig'])
# Keep the set under its own name: assigning it to `stopwords` would replace the
# imported nltk corpus object, and `stopwords.words('english')` above would then
# fail as soon as this cell is run a second time.
stopwords_set = set(stopwords_extended_list)
df_test['text_subsection'] = df_test['text_subsection'].apply(lambda x: remove_stopwords(x, stopwords_set))

# stemming and lemmatization:
df_test['text_subsection'] = df_test['text_subsection'].apply(stemming)
#df_test['text_subsection'] = df_test['text_subsection'].apply(lemmatization)

# remove nonsense-words:
#df_test['text_subsection'] = df_test['text_subsection'].apply(remove_nonsensewords)

In [10]:
# Show the frame after text cleaning, stop-word removal and stemming.
df_test

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,label_id
0,2535.1.1,Abstract,paper address problem track diagnos complex sy...,N_PD,0
1,2535.2.1,Introduction,complex sophist current gener industri process...,N_PD,0
2,2535.2.2,Introduction,want monitor state system reliabl detect abnor...,N_PD,0
3,2535.2.3,Introduction,paper propos differ approach problem model com...,N_PD,0
4,2535.2.4,Introduction,express discret failur mode produc discontinuo...,N_PD,0
...,...,...,...,...,...
3547,2747.6.25,Automated Reasoning About Tests,theorem golog procedur normal form arbitrari c...,N_PD,0
3548,2747.7.1,Summary,paper present result formal theori test dynam ...,N_PD,0
3549,2747.7.2,Summary,propos specifi complex test logic program lang...,N_PD,0
3550,2747.7.3,Summary,sens integr oper autonom agent notion complex ...,N_PD,0


Load vectorizer:

In [11]:
# Location of the pickled feature extractor, relative to the notebook's working directory.
vectorizer_path = "./resources/tdidf_bigram_vectorizer.pkl"
# NOTE(review): pickle.load can execute arbitrary code; only load pickles from a trusted source.
with open(vectorizer_path, 'rb') as feature_extractor:
    vectorizer = pickle.load(feature_extractor)

In [12]:
# Turn the preprocessed text into features with the loaded vectorizer
# (only `transform` is called here; the vectorizer is not fitted in this notebook).
X_test = vectorizer.transform(df_test['text_subsection'])
# Display the shape of the resulting feature matrix.
X_test.shape

(3552, 40000)

Load classifier:

In [13]:
# Short names of the pickled classifiers to evaluate; each one is part of a file name below.
model_mode = ['lr', 'svc', 'mnb']
# Maps each short model name to the predictions of that model on X_test.
y_pred = {}

for m in model_mode:
    classifier_path = "./resources/tdidf_bigr-"+m+".pkl"
    # NOTE(review): pickle.load can execute arbitrary code; only load pickles from a trusted source.
    with open(classifier_path, 'rb') as training_model:
        model = pickle.load(training_model)
    y_pred[m] = model.predict(X_test)

Analyze predictions on subsections:

In [14]:
from sklearn.metrics import confusion_matrix, classification_report

# Subsection-level evaluation of every loaded model.
# `labels=[0, 1]` pins the class order so the matrix rows/columns and `target_names`
# always line up, even if one class is missing from both the truth and the predictions.
# `zero_division=0` reports 0 for the precision of a class that is never predicted
# (the value scikit-learn would use anyway) without emitting a warning.
for mode, predictions in y_pred.items():
    num_predicted_pd = int((predictions == 1).sum())
    print("PD estimated with '%s': %s/%s" %(mode, num_predicted_pd, predictions.shape[0]), '\n')
    print(confusion_matrix(df_test['label_id'], predictions, labels=[0, 1]), '\n')
    print(classification_report(df_test['label_id'], predictions, labels=[0, 1],
                                target_names=["N_PD", "PD"], zero_division=0), '\n')
    print('-'*80,'\n')

PD estimated with 'lr': 22/3552 

[[2683    5]
 [ 847   17]] 

              precision    recall  f1-score   support

        N_PD       0.76      1.00      0.86      2688
          PD       0.77      0.02      0.04       864

    accuracy                           0.76      3552
   macro avg       0.77      0.51      0.45      3552
weighted avg       0.76      0.76      0.66      3552
 

-------------------------------------------------------------------------------- 

PD estimated with 'svc': 122/3552 

[[2624   64]
 [ 806   58]] 

              precision    recall  f1-score   support

        N_PD       0.77      0.98      0.86      2688
          PD       0.48      0.07      0.12       864

    accuracy                           0.76      3552
   macro avg       0.62      0.52      0.49      3552
weighted avg       0.69      0.76      0.68      3552
 

-------------------------------------------------------------------------------- 

PD estimated with 'mnb': 0/3552 

[[2688    0]
 

  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
#'svc' : recall = 0.07, precision = 0.48 -> only 7% of the true 'PD' subsections were predicted as 'PD', and only 48% of the subsections predicted as 'PD' were actually 'PD'
#'lr'  : recall = 0.02, precision = 0.77 -> only 2% of the true 'PD' subsections were predicted as 'PD', while 77% of the subsections predicted as 'PD' were actually 'PD'
#-> precision = correctly predicted positives over everything predicted as positive : TP/(TP+FP)
#-> recall = correctly predicted positives over everything that is actually positive : TP/(TP+FN)

Analyze predictions on paragraphs:

In [16]:
# Work on a copy: a plain assignment would only alias `df_test`, so the prediction
# columns added here (and the columns dropped in the next cell) would also change
# `df_test` and break any re-run of the earlier cells.
df_test_paragraph = df_test.copy()
for mode, predictions in y_pred.items():
    df_test_paragraph['predict_id_by_'+mode] = predictions
# Derive the paragraph id by cutting the subsection id at its last '.'
# (e.g. '2535.2.3' -> '2535.2'); the rows are grouped by it in the next cell.
df_test_paragraph['id_paragraph'] = df_test_paragraph['id_subsection'].apply(lambda x: x[:x.rfind('.')])
df_test_paragraph

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,label_id,predict_id_by_lr,predict_id_by_svc,predict_id_by_mnb,id_paragraph
0,2535.1.1,Abstract,paper address problem track diagnos complex sy...,N_PD,0,0,0,0,2535.1
1,2535.2.1,Introduction,complex sophist current gener industri process...,N_PD,0,0,0,0,2535.2
2,2535.2.2,Introduction,want monitor state system reliabl detect abnor...,N_PD,0,0,0,0,2535.2
3,2535.2.3,Introduction,paper propos differ approach problem model com...,N_PD,0,0,0,0,2535.2
4,2535.2.4,Introduction,express discret failur mode produc discontinuo...,N_PD,0,0,0,0,2535.2
...,...,...,...,...,...,...,...,...,...
3547,2747.6.25,Automated Reasoning About Tests,theorem golog procedur normal form arbitrari c...,N_PD,0,0,0,0,2747.6
3548,2747.7.1,Summary,paper present result formal theori test dynam ...,N_PD,0,0,0,0,2747.7
3549,2747.7.2,Summary,propos specifi complex test logic program lang...,N_PD,0,0,0,0,2747.7
3550,2747.7.3,Summary,sens integr oper autonom agent notion complex ...,N_PD,0,0,0,0,2747.7


In [17]:
# Aggregate the subsections of each paragraph: drop the per-subsection columns and
# sum the true label and the predicted labels per (paragraph id, paragraph name).
# The drop is not done in place, so a frame that shares its data with
# `df_test_paragraph` (e.g. `df_test`) keeps all of its columns.
df_test_paragraph = (
    df_test_paragraph
    .drop(columns=['id_subsection', 'text_subsection', 'label_subsection'])
    .groupby(['id_paragraph', 'paragraph_name'])
    .sum()
)

In [19]:
# Binarize the per-paragraph sums: every value greater than 0 becomes 1,
# i.e. a paragraph counts as positive if at least one of its subsections does.
for column in df_test_paragraph.columns:
    df_test_paragraph[column] = df_test_paragraph[column].mask(df_test_paragraph[column] > 0, 1)
df_test_paragraph

Unnamed: 0_level_0,Unnamed: 1_level_0,label_id,predict_id_by_lr,predict_id_by_svc,predict_id_by_mnb
id_paragraph,paragraph_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2535.1,Abstract,0,0,0,0
2535.2,Introduction,0,0,0,0
2535.3,The framework,1,0,1,0
2535.4,Inference,0,0,1,0
2535.5,Experimental Results,0,0,0,0
...,...,...,...,...,...
2747.4,Testing,0,0,0,0
2747.5,Complex Tests,0,0,1,0
2747.6,Automated Reasoning About Tests,0,0,0,0
2747.7,Summary,0,0,0,0


In [20]:
# Paragraph-level evaluation: compare every prediction column with the true label.
# `labels=[0, 1]` pins the class order so the matrix rows/columns and `target_names`
# always line up; `zero_division=0` reports 0 for the precision of a class that is
# never predicted without emitting a warning.
for column in df_test_paragraph.columns:
    if column != 'label_id':
        # Name the evaluated column so each report can be attributed to its model.
        print("Predictions from column '%s':" % column, '\n')
        print(confusion_matrix(df_test_paragraph['label_id'], df_test_paragraph[column], labels=[0, 1]), '\n')
        print(classification_report(df_test_paragraph['label_id'], df_test_paragraph[column], labels=[0, 1],
                                    target_names=["N_PD", "PD"], zero_division=0), '\n')
        print('-'*80,'\n')

[[294   3]
 [ 59  14]] 

              precision    recall  f1-score   support

        N_PD       0.83      0.99      0.90       297
          PD       0.82      0.19      0.31        73

    accuracy                           0.83       370
   macro avg       0.83      0.59      0.61       370
weighted avg       0.83      0.83      0.79       370
 

-------------------------------------------------------------------------------- 

[[258  39]
 [ 36  37]] 

              precision    recall  f1-score   support

        N_PD       0.88      0.87      0.87       297
          PD       0.49      0.51      0.50        73

    accuracy                           0.80       370
   macro avg       0.68      0.69      0.68       370
weighted avg       0.80      0.80      0.80       370
 

-------------------------------------------------------------------------------- 

[[297   0]
 [ 73   0]] 

              precision    recall  f1-score   support

        N_PD       0.80      1.00      0.89   

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# a paragraph is 'PD' if it has at least one 'PD' subsection
#'svc' : recall = 0.51, precision = 0.49 -> only 51% of the paragraphs with at least one 'PD' subsection were predicted as 'PD' (i.e. contain a subsection predicted as 'PD'),
#                                           and of the paragraphs predicted as 'PD' (i.e. with at least one subsection predicted as 'PD'), only 49% really had a 'PD' subsection.
#'lr'  : recall = 0.19, precision = 0.82 -> only 19% of the paragraphs with at least one 'PD' subsection were predicted as 'PD' (i.e. contain a subsection predicted as 'PD'),
#                                           while of the paragraphs predicted as 'PD' (i.e. with at least one subsection predicted as 'PD'), 82% really had a 'PD' subsection.
#-> precision = correctly predicted positives over everything predicted as positive : TP/(TP+FP)
#-> recall = correctly predicted positives over everything that is actually positive : TP/(TP+FN)

In [65]:
# Since the precision of 'svc' is low: which paragraphs does it predict as 'PD'
# although they are not? Count its false positives by keyword in the paragraph name.
relevant_paragraphs = {'abstract': 0,
                       'introduction': 0,
                       #'description': 0,  # e.g. "model description"
                       'overview': 0,     # e.g. "system overview"
                       'discussion': 0,
                       'work': 0,         # e.g. "future work", "related work"
                       'result': 0,
                       'conclusion': 0,
                       #'experiment': 0
                      }

total = 0    # paragraphs predicted 'PD' by 'svc' whose true label is not 'PD'
matched = 0  # those of them whose name contains at least one keyword
for index, row in df_test_paragraph.iterrows():
    if row['predict_id_by_svc'] == 1 and row['label_id'] == 0:
        total += 1
        # `index` is the (id_paragraph, paragraph_name) pair of the grouped frame.
        paragraph_name = index[1].lower()
        hits = [key for key in relevant_paragraphs if key in paragraph_name]
        for key in hits:
            relevant_paragraphs[key] += 1
        # Count the paragraph once even when its name matches several keywords
        # (e.g. "Discussion and Future Work"), so the ratio below never
        # double-counts a paragraph.
        if hits:
            matched += 1
pprint(relevant_paragraphs)
print(matched, '/', total)

{'abstract': 2,
 'conclusion': 4,
 'discussion': 2,
 'introduction': 10,
 'overview': 0,
 'result': 0,
 'work': 1}
19 / 39


In [None]:
# For building the ontology, I would take all the paragraphs listed in the dictionary (if there is no abstract or introduction, then at least the first paragraph, which gives an idea of what the paper is about; if the PDF is short and has no
# paragraphs, I take it in full). Should we also take all the paragraphs predicted by the SVM? If so, right from the start, or do we extend the ontology a little at a time? Many papers have "dirty" paragraphs,
# in the sense that their text form is hard to understand: they contain many symbols (e.g. arithmetic ones), text fragments from figures or tables, or pseudo-algorithms, which make the paragraph hard to understand
# when it is classified as 'PD'. I think I would start by building an ontology for each paper from the few dictionary paragraphs (so as to capture its intent and context
# and also its results), evaluate that ontology, and then possibly try to expand it with the 'PD' paragraphs.
# How should very long papers (i.e. over 100 pages) be handled?