## Undersample Training Set

In [1]:
import pandas as pd
import re
import numpy as np

Load Training Set:

In [2]:
dataset_path = "../data/training_set.pkl"
df = pd.read_pickle(dataset_path)
df.head()

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
0,2549.1.1,Abstract,\n Recent work in machine learning for infor...,N_PD
1,2549.2.1,Introduction,\n Information extraction (IE) is the proble...,N_PD
2,2549.2.2,Introduction,\n The increasing importance of the Internet...,N_PD
3,2549.2.3,Introduction,"\n recent work in IE, therefore, has focused...",N_PD
4,2549.2.4,Introduction,"\n At the same time, work on information int...",N_PD


Remove empty text_subsection:

In [3]:
df[df['text_subsection'].isna()]

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
71228,68489.11.57,3.2 Approximate message passing for Gaussian MRF,,N_PD
80133,72319.22.29,Appendix C. Proof of Theorem 4.,,N_PD


In [4]:
df = df.dropna(axis=0, subset=['text_subsection'])
df[df['text_subsection'].isna()]

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection


Info:

In [5]:
print("Subsections in training set = %s" % len(df.id_subsection))
id_paper_set = set()
df.id_subsection.apply(lambda x: id_paper_set.add(x[:x.find('.')]))
print("Valid papers in training set = %s" % len(id_paper_set))

Subsections in training set = 132741
Valid papers in training set = 1433


In [6]:
print("Subsections 'N_PD' in training set = %s" % len(df.loc[df.label_subsection == 'N_PD']))
print("Subsections 'PD' in training set = %s" % len(df.loc[df.label_subsection == 'PD']))

Subsections 'N_PD' in training set = 124727
Subsections 'PD' in training set = 8014


We can see that the training set is not balanced. I apply an undersampling for some subsections of which it is not certain that they are not necessarily 'N_PD'.

In [7]:
%%time
for id_paper in id_paper_set:
    df_subsections_of_paper = df.loc[df['id_subsection'].str.startswith(id_paper+'.')]
    undersampling = False
    for index, row in df_subsections_of_paper.iterrows():
        if row['label_subsection']=='PD':
            if 'problem description' in row['paragraph_name'].lower() or\
            'problem statement' in row['paragraph_name'].lower(): # in paragraph name
                undersampling = False
                break
            else:
                sub_subsection = False
                for match in re.finditer('problem (description|statement)', row['text_subsection'], re.IGNORECASE):
                    if 'roblem description' in match.group(0) or 'roblem statement' in match.group(0): # in text
                        undersampling = True
                    else: # in sub_subsection
                        sub_subsection = True
                        break
                if sub_subsection is True:
                    undersampling = False
                    break
    if undersampling is True:
        indeces_npd = (df_subsections_of_paper.loc[df_subsections_of_paper['label_subsection']=='N_PD']).index
        df.loc[indeces_npd, 'label_subsection'] = np.nan

CPU times: user 1min 3s, sys: 310 ms, total: 1min 3s
Wall time: 1min 3s


In [8]:
df[df['label_subsection'].isna()]

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
85,2625.1.1,Abstract,\n Gridworlds are popular testbeds for plann...,
86,2625.1.2,Abstract,"\n We study a fundamental planning problem, ...",
87,2625.2.1,Introduction,\n Testbeds (prototypical test domains) are ...,
88,2625.2.2,Introduction,"\n In recent years, planning researchers hav...",
89,2625.2.3,Introduction,"\n Copyright c 2000, American Association f...",
...,...,...,...,...
132737,101144.5.1,4. Results and Discussion,"\n For hyper-parameter tuning, the model is ...",
132738,101144.5.2,4. Results and Discussion,"\n Therefore, the value of w is restricted t...",
132739,101144.5.3,4. Results and Discussion,"\n whereas, to ensure that the value of p st...",
132740,101144.6.1,5. Conclusion and Future work,\n A simple binary logistic regression class...,


In [9]:
df = df.dropna(axis=0, subset=['label_subsection'])
df[df['label_subsection'].isna()]

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection


In [10]:
num_pd_subsections = df.loc[df['label_subsection']=='PD'].shape[0]
num_npd_subsections = df.loc[df['label_subsection']=='N_PD'].shape[0]
print("Subsections of 'Problem Description/Statement' in training set = %s" % num_pd_subsections)
print("Other subsections in training set = %s" % num_npd_subsections)

Subsections of 'Problem Description/Statement' in training set = 8014
Other subsections in training set = 88366


Save Training Set:

In [11]:
dataset_path = "./resources/undersampled_training_set.pkl"
df.to_pickle(dataset_path, protocol=4)

In [None]:
#%%time
##count = 0
#for id_paper in id_paper_set:
#    #print("-"*40)
#    #print(id_paper)
#    df_subsections_of_paper = df.loc[df['id_subsection'].str.startswith(id_paper+'.')]
#    #if count <4:
#    #print(df_subsections_of_paper.shape)
#    undersampling = False
#    for index, row in df_subsections_of_paper.iterrows():
#        if row['label_subsection']=='PD':
#            if 'problem description' in row['paragraph_name'].lower() or\
#            'problem statement' in row['paragraph_name'].lower(): # in paragraph name
#                undersampling = False
#                #print("exit1")
#                break
#            else:
#                #print(row['id_subsection'])
#                sub_subsection = False
#                for match in re.finditer('problem (description|statement)', row['text_subsection'], re.IGNORECASE):
#                    if 'roblem description' in match.group(0) or 'roblem statement' in match.group(0): # in text
#                        undersampling = True
#                        #print("-->", match.group(0))
#                    else: # in sub_subsection
#                        #print("-->", match.group(0))
#                        sub_subsection = True
#                        break
#                if sub_subsection is True:
#                    #print("exit2")
#                    undersampling = False
#                    break
#    if undersampling is True:
#        #(df_train.loc[df_subsections_of_paper['label_subsection']=='N_PD'])['label_subsection'] = np.Nan
#        #print('qui:')
#        #print(df_subsections_of_paper.loc[df_subsections_of_paper['label_subsection']=='N_PD'])
#        indeces_npd = (df_subsections_of_paper.loc[df_subsections_of_paper['label_subsection']=='N_PD']).index
#        #print(list(indeces_npd))
#        #print(df.iloc[indeces_npd].shape)
#        #print(df.loc[indeces_npd])
#        #print(len([r['label_subsection'] for _,r in df.loc[indeces_npd].iterrows() if r['label_subsection']=='N_PD']))
#        #print(len([r['label_subsection'] for _,r in df.loc[indeces_npd].iterrows() if r['label_subsection']=='PD']))
#        #print(list(df.loc[indeces_npd][95239]))
#        #df.loc[indeces_npd].label_subsection = df.loc[indeces_npd].label_subsection.replace('N_PD',np.nan)
#        df.loc[indeces_npd, 'label_subsection'] = np.nan
#        #print(df.loc[indeces_npd])
#        #count += 1
#    #else:
#        #break
#print()