## Test Set

In [11]:
import pandas as pd
import numpy as np
import re
import pickle

In [5]:
def initial_text_cleaning(text):
    """Normalize a raw text fragment into lowercase, letters-only, single-spaced text.

    The text is lowercased, then a fixed sequence of regex substitutions is
    applied in order, and finally leading/trailing whitespace is stripped.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs; the order matters because each step works
    # on the output of the previous one.
    substitutions = (
        (r'\n', ''),                                        # drop newlines (adjacent lines are glued together)
        (r'(\(|\[|\{)[^(\)|\]|\})]*(\)|\]|\})', ''),        # drop bracketed spans: (...), [...], {...}
        (r'http(s)?:\/\/\S+', ''),                          # drop urls
        (r'[^a-z\s]', ''),                                  # keep only lowercase letters and whitespace
        (r'\b\w\b', ''),                                    # drop single-letter words
        (r'\s{2,}', ' '),                                   # collapse runs of whitespace
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()


# cleaning text of stop words
from nltk.corpus import stopwords

def remove_stopwords(text, stopwords):
    """Drop every whitespace-separated token of ``text`` that is in ``stopwords``.

    Args:
        text: the string to filter.
        stopwords: a container supporting ``in`` with the tokens to discard.

    Returns:
        The surviving tokens joined by single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# cleaning text of nonsense words
from nltk.corpus import words
# Vocabulary built from the NLTK `words` corpus reader; stored as a set because
# remove_nonsensewords only performs membership tests against it.
# NOTE(review): presumably requires the NLTK "words" corpus to be downloaded beforehand — confirm.
words_dictionary = set(words.words())
def remove_nonsensewords(text):
    """Keep only the tokens of ``text`` that appear in ``words_dictionary``.

    Args:
        text: the string to filter; it is split on whitespace.

    Returns:
        The tokens found in the module-level ``words_dictionary``, joined by
        single spaces.
    """
    known_tokens = filter(lambda token: token in words_dictionary, text.split())
    return ' '.join(known_tokens)


# stemming and lemmatization
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance reused by every call to stemming().
porter = PorterStemmer()
def stemming(text):
    """Stem each whitespace-separated token of ``text`` with the module-level ``porter`` stemmer.

    Args:
        text: the string whose tokens are stemmed.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stemmed_tokens = map(porter.stem, text.split())
    return ' '.join(stemmed_tokens)

from nltk.stem.wordnet import WordNetLemmatizer
# Single WordNet lemmatizer instance reused by every call to lemmatization().
# NOTE(review): presumably needs the NLTK "wordnet" corpus available locally — confirm.
wordnet = WordNetLemmatizer()
def lemmatization(text):
    """Lemmatize each whitespace-separated token of ``text`` with the module-level ``wordnet`` lemmatizer.

    Args:
        text: the string whose tokens are lemmatized.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    lemmas = map(wordnet.lemmatize, text.split())
    return ' '.join(lemmas)

In [7]:
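## create and save test set
## NOTE: the code below is kept commented out only to record how test_set.pkl was built.
## To re-run it, `os` and `xml.etree.ElementTree as ET` must be imported first (they are
## not imported in the cells above), and `DataFrame.append` no longer exists in
## pandas >= 2.0 (collect the rows in a list and build the DataFrame once instead).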
#testset_list = "/Users/maurorondina/Desktop/Tesi - Magistrale/data/testset_list.txt"
#paper_line_list = []
#with open(testset_list, 'r') as in_file:
#    paper_line_list = in_file.readlines()
#
#data_dir = "/Users/maurorondina/Desktop/Tesi - Magistrale/data/papers"  # data/prova data/papers
#df_test = pd.DataFrame(columns=["id_subsection", "paragraph_name", "text_subsection", "label_subsection"])
#for paper_line in paper_line_list:
#    info_paper = paper_line.split("\t\t")
#    path_dir = os.path.join(data_dir, "paper_" + info_paper[0])
#    if os.path.isdir(path_dir):
#        ##analyzable_test_paper_path = os.path.join(path_dir, "paper_" + info_paper[0] + "_analyzable_test.xml")
#        analyzable_paper_path = os.path.join(path_dir, "paper_" + info_paper[0] + "_analyzable.xml")
#        ##if os.path.exists(analyzable_test_paper_path) and os.path.exists(analyzable_paper_path):
#        if os.path.exists(analyzable_paper_path):
#            # 1- parse xml
#            ##tree = ET.parse(analyzable_test_paper_path)
#            tree = ET.parse(analyzable_paper_path)
#            root = tree.getroot()
#            # 2- get all subsections
#            found_subsections = root.findall('.//subsection')
#            for subsection in found_subsections:
#                if '.-1.' in subsection.attrib['id']:
#                    continue
#                else:
#                    paragraph_id = subsection.attrib['id'][:subsection.attrib['id'].rfind('.')]
#                    paragraph = root.find("./paragraph[@id='" + paragraph_id + "']")
#                    if paragraph is None or 'reference' in paragraph.attrib['name'].lower():
#                        continue
#                    # add to dataset
#                    df_test = df_test.append({"id_subsection": subsection.attrib['id'],
#                                            "paragraph_name": paragraph.attrib['name'],
#                                            "text_subsection": subsection.text,
#                                            "label_subsection": np.nan
#                                            }, ignore_index=True)
##print(df_test)
#dataset_path = "/Users/maurorondina/Desktop/Tesi - Magistrale/data/test_set.pkl"  # data/prova data/papers
#df_test.to_pickle(dataset_path)  # to save it


## load test set
dataset_path = "/Users/maurorondina/Desktop/Tesi - Magistrale/data/test_set.pkl"
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
df_test = pd.read_pickle(dataset_path)   # to read it

# rows without text cannot be cleaned, so they are dropped first
df_test = df_test.dropna(subset=['text_subsection'])

# text - cleaning:
df_test['text_subsection'] = df_test['text_subsection'].apply(initial_text_cleaning)

# remove stop-words:
# The last step of this section rebinds the name `stopwords` to a plain set, which
# hides the NLTK corpus reader imported at the top of the notebook. The reader is
# therefore re-imported here under an alias, so that this cell can be executed more
# than once without failing on `stopwords.words(...)`.
from nltk.corpus import stopwords as nltk_stopwords
stopwords_file = "/Users/maurorondina/Desktop/Tesi - Magistrale/data/stopwords_list.txt"
stopwords_extended_list = nltk_stopwords.words('english')
with open(stopwords_file, 'r') as file:
    # one extra stop-word per line of the file
    stopwords_extended_list.extend([line.replace('\n', '') for line in file.readlines()])
# captions/references to floats that carry no content
stopwords_extended_list.extend(['table', 'tab', 'figure', 'fig'])
stopwords = set(stopwords_extended_list)
df_test['text_subsection'] = df_test['text_subsection'].apply(lambda x: remove_stopwords(x, stopwords))

# stemming and lemmatization:
df_test['text_subsection'] = df_test['text_subsection'].apply(stemming)
#df_test['text_subsection'] = df_test['text_subsection'].apply(lemmatization)

# remove nonsense-words:
#df_test['text_subsection'] = df_test['text_subsection'].apply(remove_nonsensewords)

In [8]:
# Sanity check: the frame displayed here must be empty (no missing text may remain).
df_test.loc[df_test['text_subsection'].isna()]

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection


In [9]:
# Encode label_subsection as integer codes; missing labels show up as -1 in the output.
df_test['label_id'] = pd.factorize(df_test['label_subsection'])[0]
df_test

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,label_id
0,2535.1.1,Abstract,paper address problem track diagnos complex sy...,,-1
1,2535.2.1,Introduction,complex sophist current gener industri process...,,-1
2,2535.2.2,Introduction,want monitor state system reliabl detect abnor...,,-1
3,2535.2.3,Introduction,paper propos differ approach problem model com...,,-1
4,2535.2.4,Introduction,express discret failur mode produc discontinuo...,,-1
...,...,...,...,...,...
27114,3050.6.15,Experimental Results,evolut annular prior input imag thumbnail illu...,,-1
27115,3050.6.16,Experimental Results,discuss demonstr approach spatial organ imag u...,,-1
27116,3050.6.17,Experimental Results,author consid system interleav map posit estim...,,-1
27117,3050.6.18,Experimental Results,believ absenc requir posit prior approach suit...,,-1


In [26]:
print("Subsections in test set = %s" % len(df_test.id_subsection))
# The paper id is the part of id_subsection before the first dot.
# str.partition returns the whole id when it contains no dot, whereas slicing
# with str.find would silently drop the last character in that case.
id_paper_set = {subsection_id.partition('.')[0] for subsection_id in df_test['id_subsection']}
print("Valid papers in test set = %s" % len(id_paper_set))

Subsections in test set = 27118
Valid papers in test set = 405


Load vectorizer:

In [13]:
# Load the pickled feature extractor from disk.
# NOTE(review): pickle.load can execute arbitrary code; only unpickle files produced by this project.
# NOTE(review): presumably this is the vectorizer fitted on the training set — confirm.
vectorizer_path = "/Users/maurorondina/Desktop/Tesi - Magistrale/data/tdidf_bigr"
with open(vectorizer_path, 'rb') as feature_extractor:
    vectorizer = pickle.load(feature_extractor)

In [14]:
# Only transform() is called here: the loaded vectorizer is applied as-is to the
# cleaned test text and is not re-fitted on it.
X_test = vectorizer.transform(df_test['text_subsection'])
X_test.shape

(27118, 40000)

Load classifier:

In [12]:
# Load the pickled classifier from disk.
# NOTE(review): pickle.load can execute arbitrary code; only unpickle files produced by this project.
classifier_path = "/Users/maurorondina/Desktop/Tesi - Magistrale/data/tdidf_bigr-lr"
with open(classifier_path, 'rb') as training_model:
    model = pickle.load(training_model)

Predict Test Set:

In [24]:
# NOTE(review): label_subsection is NaN in every row displayed above, i.e. the test set
# has no ground-truth labels. Every row is given the constant placeholder label 1 here,
# which also overwrites the factorized label_id computed earlier. Any metric computed
# against y_test is therefore not a real evaluation — confirm the intent.
df_test['label_id'] = 1
y_test = df_test['label_id']
print(X_test.shape, y_test.shape)

(27118, 40000) (27118,)


In [19]:
%%time
# Predict a label id for every row of the vectorized test set (timed by the cell magic above).
y_pred = model.predict(X_test)

CPU times: user 3.51 ms, sys: 1.78 ms, total: 5.29 ms
Wall time: 9.01 ms


In [23]:
# Show the raw predictions with their total count, then how many rows were assigned label 1.
print(y_pred, len(y_pred))
print(np.count_nonzero(y_pred == 1))

[0 0 0 ... 0 0 0] 27118
224


Results:

In [25]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# NOTE(review): y_test is the constant placeholder label 1 assigned in the "Predict Test Set"
# cell, not real ground truth. The confusion matrix, report and accuracy below therefore only
# reflect the share of rows predicted as 1; they do not measure model quality.
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[    0     0]
 [26894   224]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.01      0.02     27118

    accuracy                           0.01     27118
   macro avg       0.50      0.00      0.01     27118
weighted avg       1.00      0.01      0.02     27118

0.008260196179659268


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# see also:
# - https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/Consumer_complaints.ipynb
# - https://stackabuse.com/text-classification-with-python-and-scikit-learn/

In [28]:
# Store the predicted label id next to each subsection so the rows can be inspected.
df_test['label_id_predict'] = y_pred
df_test

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,label_id,label_id_predict
0,2535.1.1,Abstract,paper address problem track diagnos complex sy...,,1,0
1,2535.2.1,Introduction,complex sophist current gener industri process...,,1,0
2,2535.2.2,Introduction,want monitor state system reliabl detect abnor...,,1,0
3,2535.2.3,Introduction,paper propos differ approach problem model com...,,1,0
4,2535.2.4,Introduction,express discret failur mode produc discontinuo...,,1,0
...,...,...,...,...,...,...
27114,3050.6.15,Experimental Results,evolut annular prior input imag thumbnail illu...,,1,0
27115,3050.6.16,Experimental Results,discuss demonstr approach spatial organ imag u...,,1,0
27116,3050.6.17,Experimental Results,author consid system interleav map posit estim...,,1,0
27117,3050.6.18,Experimental Results,believ absenc requir posit prior approach suit...,,1,0


In [31]:
# Keep only the subsections whose predicted label id is 1.
df_pd_pred = df_test[df_test['label_id_predict'].eq(1)]
df_pd_pred

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection,label_id,label_id_predict
26,2535.3.18,The framework,dbn diagnosi goal repres diagnost system type ...,,1,1
258,2539.5.4,Synthesis of Invariants,function extend updat preserv weaken describ f...,,1,1
606,2543.3.9,Anchoring in the literature,problem connect linguist descript object physi...,,1,1
622,2543.4.14,A computational theory of anchoring,definit symbol descript set unari predic,,1,1
631,2543.4.23,A computational theory of anchoring,descript state associ individu symbol descript...,,1,1
...,...,...,...,...,...,...
26371,3040.3.3,Preferred Explanations and Relaxations,defin relax problem definit subset relax probl...,,1,1
26536,3042.3.8,Background,qk consist statement node connect direct arc n...,,1,1
26537,3042.3.9,Background,statement node contain individu piec data qk s...,,1,1
26600,3043.3.1,2 Background and Notation,pomdp defin set state set action set observ tr...,,1,1


In [60]:
# Print the id and cleaned text of every positively predicted subsection
# that has more than 20 space-separated tokens.
for subsection_id, subsection_text in zip(df_pd_pred['id_subsection'], df_pd_pred['text_subsection']):
    if len(subsection_text.split(' ')) <= 20:
        continue
    print(subsection_id + " :", subsection_text, '\n')

2535.3.18 : dbn diagnosi goal repres diagnost system type describ dbn turn use tcg system blueprintfor skeleton dbn tcg schema system equat describ continu system dynam distinguish type arc tcg tempor arc annot dt nontempor arc variabl incom tempor arc tcg express instantan constraint function predecessor variabl incom tempor arc tcg express tempor constraint 

2539.5.4 : function extend updat preserv weaken describ follow section algorithm identifi candid invari ground instanc true initi state comput start atom schemata predic pi occur problem instanc xi sequenc distinct variabl algorithm goe stage consid oper stage 

2543.3.9 : problem connect linguist descript object physic refer larg studi philosoph linguist tradit fact borrow term anchor situat semant term denot assign variabl individu relat locat tradit provid sourc inspir conceptu anchor problem typic disregard formal comput aspect necessari turn idea techniqu 

2543.5.5 : term framework symbol system given planner individu symb