## Test Set

In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from pprint import pprint

Load Test Set:

In [2]:
# Read the pickled test split into a DataFrame and display it.
# NOTE(review): unpickling can execute arbitrary code; only load this file from a trusted source.
dataset_path = "../data/test_set.pkl"
df_test = pd.read_pickle(dataset_path)
df_test

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
0,2535.1.1,Abstract,\n This paper addresses the problem of track...,N_PD
1,2535.2.1,Introduction,\n The complexity and sophistication of the ...,N_PD
2,2535.2.2,Introduction,\n 2000)). We want to monitor the state of t...,N_PD
3,2535.2.3,Introduction,"\n In this paper, we propose a different app...",N_PD
4,2535.2.4,Introduction,\n which are expressed as discrete failure m...,N_PD
...,...,...,...,...
6173,2930.3.7,Architecture and Decision procedure,"\n 5. Otherwise, the agent computes based on...",PD
6174,2930.3.8,Architecture and Decision procedure,\n Finally choose argument or offer that pro...,PD
6175,2930.3.9,Architecture and Decision procedure,\n We have outlined a framework using which ...,PD
6176,2930.3.10,Architecture and Decision procedure,\n We have presented an asymmetric negotiati...,PD


In [3]:
# Discard every row whose 'text_subsection' value is missing; the other columns are not checked.
df_test = df_test.dropna(subset=['text_subsection'])

In [4]:
# Encode the string labels with an explicit mapping instead of factorize():
# factorize() numbers labels by order of first appearance, so the encoding
# would silently flip if the first row happened to be 'PD'. The evaluation
# cells below assume 0 == 'N_PD' and 1 == 'PD'.
label_to_id = {'N_PD': 0, 'PD': 1}
unknown_labels = set(df_test['label_subsection'].unique()) - set(label_to_id)
assert not unknown_labels, "unexpected labels in test set: %s" % unknown_labels
df_test['label_id'] = df_test['label_subsection'].map(label_to_id)
df_test

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,label_id
0,2535.1.1,Abstract,\n This paper addresses the problem of track...,N_PD,0
1,2535.2.1,Introduction,\n The complexity and sophistication of the ...,N_PD,0
2,2535.2.2,Introduction,\n 2000)). We want to monitor the state of t...,N_PD,0
3,2535.2.3,Introduction,"\n In this paper, we propose a different app...",N_PD,0
4,2535.2.4,Introduction,\n which are expressed as discrete failure m...,N_PD,0
...,...,...,...,...,...
6173,2930.3.7,Architecture and Decision procedure,"\n 5. Otherwise, the agent computes based on...",PD,1
6174,2930.3.8,Architecture and Decision procedure,\n Finally choose argument or offer that pro...,PD,1
6175,2930.3.9,Architecture and Decision procedure,\n We have outlined a framework using which ...,PD,1
6176,2930.3.10,Architecture and Decision procedure,\n We have presented an asymmetric negotiati...,PD,1


Info:

In [5]:
# Size of the test split: number of subsections and number of distinct papers.
# The paper id is the part of 'id_subsection' before the first '.';
# partition() returns the whole id unchanged when there is no dot.
print("Subsections in test set = %s" % len(df_test.id_subsection))
id_paper_set = {subsection_id.partition('.')[0] for subsection_id in df_test.id_subsection}
print("Valid papers in test set = %s" % len(id_paper_set))

Subsections in training set = 6178
Valid papers in training set = 100


In [6]:
# Count the subsections of each class by summing boolean masks over the label column.
is_pd = df_test['label_subsection'] == 'PD'
is_npd = df_test['label_subsection'] == 'N_PD'
num_pd_subsections = int(is_pd.sum())
num_npd_subsections = int(is_npd.sum())
print("Subsections of 'Problem Description/Statement' in test set = %s" % num_pd_subsections)
print("Other subsections in test set = %s" % num_npd_subsections)

Subsections of 'Problem Description/Statement' in test set = 1491
Other subsections in test set = 4687


In [7]:
##check:
#if num_pd_subsections == df_test.loc[df_test['label_id']==1].shape[0] and num_npd_subsections == df_test.loc[df_test['label_id']==0].shape[0]:
#    print('ok')
#else:
#    print('fail. re-check')

Text Cleaning:

In [8]:
def initial_text_cleaning(text):
    """Normalise raw subsection text before stop-word removal and stemming.

    The text is lower-cased, then a fixed sequence of regex substitutions is
    applied in order, and finally leading/trailing whitespace is stripped.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        Text containing only lowercase a-z words of at least two letters.
    """
    # (pattern, replacement) pairs; the order matters.
    substitutions = (
        (r'\n', ''),                                        # drop newlines (no space is inserted)
        (r'(\(|\[|\{)[^(\)|\]|\})]*(\)|\]|\})', ''),        # drop bracketed spans
        (r'http(s)?:\/\/\S+', ''),                          # drop URLs
        (r'[^a-z\s]', ''),                                  # keep only a-z and whitespace
        (r'\b\w\b', ''),                                    # drop single-letter words
        (r'\s{2,}', ' '),                                   # collapse whitespace runs
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


# cleaning text of stop words
from nltk.corpus import stopwords

def remove_stopwords(text, stopwords):
    """Return `text` without the whitespace-separated tokens found in `stopwords`.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace.
    stopwords : container of str
        Tokens to discard (membership is tested with `in`).

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    kept_tokens = filter(lambda token: token not in stopwords, text.split())
    return ' '.join(kept_tokens)

# cleaning text of nonsense words
from nltk.corpus import words
# Vocabulary used to decide which tokens are kept, built once from the nltk `words` corpus.
words_dictionary = set(words.words())
def remove_nonsensewords(text):
    """Keep only the whitespace-separated tokens of `text` that appear in `words_dictionary`.

    Returns the surviving tokens joined by single spaces.
    """
    known_tokens = (token for token in text.split() if token in words_dictionary)
    return ' '.join(known_tokens)


# stemming and lemmatization
from nltk.stem.porter import PorterStemmer
# Single shared stemmer instance reused for every call.
porter = PorterStemmer()
def stemming(text):
    """Apply `porter.stem` to each whitespace-separated token of `text`.

    Returns the stemmed tokens joined by single spaces.
    """
    return ' '.join(map(porter.stem, text.split()))

from nltk.stem.wordnet import WordNetLemmatizer
# Single shared lemmatizer instance reused for every call.
wordnet = WordNetLemmatizer()
def lemmatization(text):
    """Apply `wordnet.lemmatize` to each whitespace-separated token of `text`.

    Returns the lemmatized tokens joined by single spaces.
    """
    return ' '.join(map(wordnet.lemmatize, text.split()))

In [9]:
# Preprocess the test text in place: regex cleaning -> stop-word removal -> stemming.
# NOTE(review): the column is overwritten at each step, so re-running this cell
# re-processes text that has already been cleaned and stemmed.

# text - cleaning:
df_test['text_subsection'] = df_test['text_subsection'].apply(initial_text_cleaning)

# remove stop-words:
# The stop-word list is the nltk English list plus the project file plus a few
# layout words. The resulting set is stored under its own name (`stopwords_set`)
# so the imported `stopwords` corpus reader is not overwritten; rebinding it made
# a second run of this cell fail on `stopwords.words('english')`.
stopwords_file = "./resources/stopwords_list.txt"
stopwords_extended_list = stopwords.words('english')
with open(stopwords_file, 'r') as file:
    stopwords_extended_list.extend(line.replace('\n', '') for line in file)
stopwords_extended_list.extend(['table', 'tab', 'figure', 'fig'])
stopwords_set = set(stopwords_extended_list)
df_test['text_subsection'] = df_test['text_subsection'].apply(lambda x: remove_stopwords(x, stopwords_set))

# stemming and lemmatization:
df_test['text_subsection'] = df_test['text_subsection'].apply(stemming)
#df_test['text_subsection'] = df_test['text_subsection'].apply(lemmatization)

# remove nonsense-words:
#df_test['text_subsection'] = df_test['text_subsection'].apply(remove_nonsensewords)

In [10]:
df_test

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,label_id
0,2535.1.1,Abstract,paper address problem track diagnos complex sy...,N_PD,0
1,2535.2.1,Introduction,complex sophist current gener industri process...,N_PD,0
2,2535.2.2,Introduction,want monitor state system reliabl detect abnor...,N_PD,0
3,2535.2.3,Introduction,paper propos differ approach problem model com...,N_PD,0
4,2535.2.4,Introduction,express discret failur mode produc discontinuo...,N_PD,0
...,...,...,...,...,...
6173,2930.3.7,Architecture and Decision procedure,agent comput base oppon belief model expect ut...,PD,1
6174,2930.3.8,Architecture and Decision procedure,final choos argument offer produc maximum expe...,PD,1
6175,2930.3.9,Architecture and Decision procedure,outlin framework oppon model gener argument co...,PD,1
6176,2930.3.10,Architecture and Decision procedure,present asymmetr negoti model knowledg domain ...,PD,1


Load vectorizer:

In [11]:
# Load the pickled vectorizer from the resources folder.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
vectorizer_path = "./resources/tdidf_bigram_vectorizer.pkl"
with open(vectorizer_path, 'rb') as feature_extractor:
    vectorizer = pickle.load(feature_extractor)

In [12]:
# Turn the preprocessed text into a feature matrix with the loaded vectorizer
# (transform only; the vectorizer is not refitted here), then show its shape.
X_test = vectorizer.transform(df_test['text_subsection'])
X_test.shape

(6178, 40000)

Load classifier:

In [13]:
# Short names of the pickled classifiers to evaluate; each name selects one file in ./resources.
model_mode = ['lr', 'svc', 'mnb']
# Maps classifier short name -> predictions returned by that classifier for X_test.
y_pred = {}

for m in model_mode:
    classifier_path = "./resources/tdidf_bigr-"+m+".pkl"
    # NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(classifier_path, 'rb') as training_model:
        model = pickle.load(training_model)
    y_pred[m] = model.predict(X_test)

Analyze predictions on subsections:

In [14]:
from sklearn.metrics import confusion_matrix, classification_report

# Per-classifier report on subsections: predicted-PD count, confusion matrix, classification report.
for mode, predictions in y_pred.items():
    y_true = df_test['label_id']
    predicted_pd_count = sum(1 for label in predictions if label == 1)
    separator = '-'*80
    print("PD estimated with '%s': %s/%s" %(mode, predicted_pd_count, predictions.shape[0]), '\n')
    print(confusion_matrix(y_true, predictions), '\n')
    print(classification_report(y_true, predictions, target_names=["N_PD", "PD"]), '\n')
    print(separator,'\n')

PD estimated with 'lr': 49/6178 

[[4672   15]
 [1457   34]] 

              precision    recall  f1-score   support

        N_PD       0.76      1.00      0.86      4687
          PD       0.69      0.02      0.04      1491

    accuracy                           0.76      6178
   macro avg       0.73      0.51      0.45      6178
weighted avg       0.75      0.76      0.67      6178
 

-------------------------------------------------------------------------------- 

PD estimated with 'svc': 210/6178 

[[4579  108]
 [1389  102]] 

              precision    recall  f1-score   support

        N_PD       0.77      0.98      0.86      4687
          PD       0.49      0.07      0.12      1491

    accuracy                           0.76      6178
   macro avg       0.63      0.52      0.49      6178
weighted avg       0.70      0.76      0.68      6178
 

-------------------------------------------------------------------------------- 

PD estimated with 'mnb': 0/6178 

[[4687    0]
 

  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
#'svc' : recall = 0.07, precision = 0.49 -> only 7% of the true 'PD' subsections were predicted as 'PD', and only 49% of the subsections predicted as 'PD' were actually 'PD'
#'lr'  : recall = 0.02, precision = 0.69 -> only 2% of the true 'PD' subsections were predicted as 'PD', and only 69% of the subsections predicted as 'PD' were actually 'PD'

Analyze predictions on paragraphs:

In [16]:
# Build the paragraph-level working frame on a COPY of df_test: a plain
# assignment would only alias it, so the columns added here (and any later
# in-place drops) would silently modify df_test as well.
df_test_paragraph = df_test.copy()
for mode, predictions in y_pred.items():
    df_test_paragraph['predict_id_by_'+mode] = predictions
# Derive the paragraph id by cutting the subsection id at its last '.'
# (e.g. '2535.2.4' -> '2535.2'); rows are grouped by this id in the next step.
df_test_paragraph['id_paragraph'] = df_test_paragraph['id_subsection'].apply(lambda x: x[:x.rfind('.')])
df_test_paragraph

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,label_id,predict_id_by_lr,predict_id_by_svc,predict_id_by_mnb,id_paragraph
0,2535.1.1,Abstract,paper address problem track diagnos complex sy...,N_PD,0,0,0,0,2535.1
1,2535.2.1,Introduction,complex sophist current gener industri process...,N_PD,0,0,0,0,2535.2
2,2535.2.2,Introduction,want monitor state system reliabl detect abnor...,N_PD,0,0,0,0,2535.2
3,2535.2.3,Introduction,paper propos differ approach problem model com...,N_PD,0,0,0,0,2535.2
4,2535.2.4,Introduction,express discret failur mode produc discontinuo...,N_PD,0,0,0,0,2535.2
...,...,...,...,...,...,...,...,...,...
6173,2930.3.7,Architecture and Decision procedure,agent comput base oppon belief model expect ut...,PD,1,0,0,0,2930.3
6174,2930.3.8,Architecture and Decision procedure,final choos argument offer produc maximum expe...,PD,1,0,0,0,2930.3
6175,2930.3.9,Architecture and Decision procedure,outlin framework oppon model gener argument co...,PD,1,0,0,0,2930.3
6176,2930.3.10,Architecture and Decision procedure,present asymmetr negoti model knowledg domain ...,PD,1,0,0,0,2930.3


In [17]:
# Drop the subsection-level columns without `inplace=True` (an in-place drop
# would also strip them from any frame aliasing this one), then sum the true
# and predicted label ids of all subsections belonging to the same paragraph.
df_test_paragraph = (
    df_test_paragraph
    .drop(columns=['id_subsection', 'text_subsection', 'label_subsection'])
    .groupby(['id_paragraph', 'paragraph_name'])
    .sum()
)

In [19]:
# Binarise the per-paragraph sums: every value greater than 0 becomes 1,
# all other values are left as they are.
df_test_paragraph = df_test_paragraph.mask(df_test_paragraph > 0, 1)
df_test_paragraph

Unnamed: 0_level_0,Unnamed: 1_level_0,label_id,predict_id_by_lr,predict_id_by_svc,predict_id_by_mnb
id_paragraph,paragraph_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2535.1,Abstract,0,0,0,0
2535.2,Introduction,0,0,0,0
2535.3,The framework,1,0,1,0
2535.4,Inference,0,0,1,0
2535.5,Experimental Results,0,0,0,0
...,...,...,...,...,...
2929.8,Results and Analysis,0,0,0,0
2929.9,Conclusion,0,0,0,0
2930.1,Abstract,0,0,0,0
2930.2,Introduction,0,0,0,0


In [20]:
# Paragraph-level report: compare every prediction column against 'label_id'.
for column in df_test_paragraph.columns:
    if column == 'label_id':
        continue  # the ground-truth column is not evaluated against itself
    paragraph_truth = df_test_paragraph['label_id']
    paragraph_pred = df_test_paragraph[column]
    print(confusion_matrix(paragraph_truth, paragraph_pred), '\n')
    print(classification_report(paragraph_truth, paragraph_pred, target_names=["N_PD", "PD"]), '\n')
    print('-'*80,'\n')

[[538   9]
 [ 96  25]] 

              precision    recall  f1-score   support

        N_PD       0.85      0.98      0.91       547
          PD       0.74      0.21      0.32       121

    accuracy                           0.84       668
   macro avg       0.79      0.60      0.62       668
weighted avg       0.83      0.84      0.80       668
 

-------------------------------------------------------------------------------- 

[[479  68]
 [ 52  69]] 

              precision    recall  f1-score   support

        N_PD       0.90      0.88      0.89       547
          PD       0.50      0.57      0.53       121

    accuracy                           0.82       668
   macro avg       0.70      0.72      0.71       668
weighted avg       0.83      0.82      0.82       668
 

-------------------------------------------------------------------------------- 

[[547   0]
 [121   0]] 

              precision    recall  f1-score   support

        N_PD       0.82      1.00      0.90   

  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# a paragraph is 'PD' if it contains at least one 'PD' subsection
#'svc' : recall = 0.57, precision = 0.50 -> only 57% of the paragraphs with at least one true 'PD' subsection were predicted as 'PD' (i.e. contained at least one subsection predicted as 'PD'),
#                                           while of the paragraphs predicted as 'PD' (i.e. with at least one subsection predicted as 'PD'), only 50% actually contained a true 'PD' subsection.
#'lr'  : recall = 0.21, precision = 0.74 -> only 21% of the paragraphs with at least one true 'PD' subsection were predicted as 'PD' (i.e. contained at least one subsection predicted as 'PD'),
#                                           while of the paragraphs predicted as 'PD' (i.e. with at least one subsection predicted as 'PD'), only 74% actually contained a true 'PD' subsection.

In [23]:
# Since the precision of 'svc' is low: which paragraphs does it flag as 'PD'
# although they are not? Count the false-positive paragraphs whose title
# contains each of the keywords below (case-insensitive substring match).
relevant_paragraphs = {'abstract': 0,
                       'introduction': 0,
                       'background': 0,
                       'preliminaries': 0,
                       'motiv': 0,          # e.g. "motivations", "motivating example"
                       #'description': 0,   # e.g. "model description"
                       'overview': 0,       # e.g. "system overview"
                       'problem': 0,        # e.g. "problem definition", "the ... Problem"
                       'application': 0,
                       'scenario': 0,
                       'goal':0,            # e.g. "design goals"
                       'discussion': 0,
                       'work': 0,           # e.g. "future work", "related work"
                       'result': 0,
                       'conclusion': 0,
                       #'experiment': 0,
                       #'architecture': 0,
                       'domain': 0,        # e.g "domain modelling"
                      }

total = 0     # false-positive paragraphs (predicted 'PD' by svc, true label 'N_PD')
matched = 0   # false-positive paragraphs whose title matches at least one keyword
for index, row in df_test_paragraph.iterrows():
    if row['predict_id_by_svc'] == 1 and row['label_id'] == 0:
        total += 1
        # index is the (id_paragraph, paragraph_name) pair produced by the groupby
        paragraph_title = index[1].lower()
        hits = [key for key in relevant_paragraphs if key in paragraph_title]
        for key in hits:
            relevant_paragraphs[key] += 1
        # A title such as "Background and related work" matches several keywords;
        # count the paragraph only once so that matched <= total always holds.
        if hits:
            matched += 1
pprint(relevant_paragraphs)
print(matched, '/', total)

{'abstract': 3,
 'application': 0,
 'background': 1,
 'conclusion': 4,
 'discussion': 2,
 'domain': 2,
 'goal': 1,
 'introduction': 20,
 'motiv': 1,
 'overview': 1,
 'preliminaries': 1,
 'problem': 0,
 'result': 1,
 'scenario': 0,
 'work': 2}
39 / 68
