## Text Cleaning of the Training Set

In [1]:
import pandas as pd
import nltk
import re

Load Training Set:

In [2]:
# Location of the pickled training set, given relative to the working directory.
dataset_path = "./resources/undersampled_training_set.pkl"
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source.
df = pd.read_pickle(dataset_path)
# Show the first rows as a quick sanity check of the loaded columns.
df.head()

Unnamed: 0,id_subsection,paragraph_name,text_subsection,label_subsection
0,2549.1.1,Abstract,\n Recent work in machine learning for infor...,N_PD
1,2549.2.1,Introduction,\n Information extraction (IE) is the proble...,N_PD
2,2549.2.2,Introduction,\n The increasing importance of the Internet...,N_PD
3,2549.2.3,Introduction,"\n recent work in IE, therefore, has focused...",N_PD
4,2549.2.4,Introduction,"\n At the same time, work on information int...",N_PD


In [3]:
# Count the rows of each class in the 'label_subsection' column and report them.
num_pd_subsections = len(df[df['label_subsection'] == 'PD'])
num_npd_subsections = len(df[df['label_subsection'] == 'N_PD'])
print(
    "Subsections of 'Problem Description/Statement' in training set = %s"
    % num_pd_subsections
)
print(
    "Other subsections in training set = %s"
    % num_npd_subsections
)

Subsections of 'Problem Description/Statement' in training set = 8014
Other subsections in training set = 88366


Text Cleaning:

In [4]:
def initial_text_cleaning(text):
    """Normalize raw text into lowercase, letters-only, single-space-separated words.

    Steps, in order: lowercase; turn line breaks into spaces; drop bracketed
    spans ``(...)``, ``[...]``, ``{...}``; drop http(s) URLs; drop every
    character that is not a-z or whitespace; drop one-letter words; collapse
    each whitespace run into one space and trim both ends.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    text = text.lower()
    # A line break right after a hyphen is dropped so the two word pieces stay
    # adjacent (the hyphen itself is deleted further down, joining them).
    text = re.sub(r'-\n', '-', text)
    # Every other line break becomes a space. Deleting it would glue the last
    # word of one line to the first word of the next, and would also let the URL
    # pattern below swallow the word that follows a URL at the end of a line.
    text = re.sub(r'\n', ' ', text)
    # Bracketed span: an opener, any characters other than ( ) ] }, then a closer.
    # Written as plain character classes so that '|' is an ordinary character
    # and spans such as "(a|b)" are removed too.
    text = re.sub(r'[(\[{][^()\]}]*[)\]}]', '', text)
    text = re.sub(r'http(s)?:\/\/\S+', '', text)    # remove url
    # Keep letters and whitespace only (digits and punctuation are deleted).
    text = re.sub(r'[^a-z\s]', '', text)
    text = re.sub(r'\b\w\b', '', text)              # remove all single letters
    # Collapse every whitespace run, including a lone tab, to a single space.
    text = re.sub(r'\s+', ' ', text).strip()
    return text


# cleaning text of stop words
from nltk.corpus import stopwords
def remove_stopwords(text, stopwords):
    """Return `text` without the whitespace-separated tokens contained in `stopwords`.

    The remaining tokens keep their order and are joined with single spaces.
    """
    kept_tokens = filter(lambda token: token not in stopwords, text.split())
    return ' '.join(kept_tokens)

# cleaning text of nonsense words
from nltk.corpus import words
# Reference vocabulary built from the imported `words` corpus; stored as a set
# for fast membership tests.
words_dictionary = set(words.words())
def remove_nonsensewords(text):
    """Return `text` keeping only the tokens that appear in `words_dictionary`.

    Tokens are split on whitespace, compared as-is, and re-joined with single
    spaces in their original order.
    """
    return ' '.join(token for token in text.split() if token in words_dictionary)


# stemming and lemmatization
from nltk.stem.porter import PorterStemmer
# Single shared stemmer instance reused for every call.
porter = PorterStemmer()
def stemming(text):
    """Return `text` with each whitespace-separated token replaced by `porter.stem(token)`.

    The stemmed tokens are joined back with single spaces.
    """
    return ' '.join(map(porter.stem, text.split()))

from nltk.stem.wordnet import WordNetLemmatizer
# Single shared lemmatizer instance reused for every call.
wordnet = WordNetLemmatizer()
def lemmatization(text):
    """Return `text` with each whitespace-separated token passed through `wordnet.lemmatize`.

    The lemmatizer is called with its default arguments; results are joined
    back with single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(wordnet.lemmatize(token))
    return ' '.join(lemmas)

In [5]:
%%time
#ex:
#print(df['text_subsection'][0], '\n'+'-'*80+'\n', df['text_subsection'][30])

## text - cleaning:
df['text_subsection'] = df['text_subsection'].apply(initial_text_cleaning)

## remove stop-words:
stopwords_file = "./resources/stopwords_list.txt"
stopwords_extended_list = stopwords.words('english')
with open(stopwords_file, 'r') as file:
    stopwords_extended_list.extend([line.replace('\n', '') for line in file.readlines()])
stopwords_extended_list.extend(['table', 'tab', 'figure', 'fig'])
stopwords = set(stopwords_extended_list)
df['text_subsection'] = df['text_subsection'].apply(lambda x: remove_stopwords(x, stopwords))

#ex:
#print(df['text_subsection'][0], '\n'+'-'*80+'\n', df['text_subsection'][30], '\n'+'='*80+'\n')

## stemming and lemmatization:
df['text_subsection'] = df['text_subsection'].apply(stemming)
#df['text_subsection'] = df['text_subsection'].apply(lemmatization)

#ex:
#print(df['text_subsection'][0], '\n'+'-'*80+'\n', df['text_subsection'][30], '\n'+'='*80+'\n')

## remove nonsense-words:
#df['text_subsection'] = df['text_subsection'].apply(remove_nonsensewords)

#ex:
#print(df['text_subsection'][0], '\n'+'-'*80+'\n', df['text_subsection'][30])

CPU times: user 1min 8s, sys: 419 ms, total: 1min 8s
Wall time: 1min 10s


Save the Cleaned Training Set:

In [6]:
# Output path for the cleaned frame. The commented-out path is the alternative
# target for the lemmatized variant of the pipeline; the stemmed one is active.
#dataset_path = "./resources/lemmatized_training_set.pkl"
dataset_path = "./resources/stemmed_training_set.pkl"
# Write the frame with pickle protocol 4.
# NOTE(review): presumably pinned so older Python readers can load the file; confirm.
df.to_pickle(dataset_path, protocol=4)