In [2]:
import os
import re
import nltk
from pprint import pprint
from pathlib import Path
import numpy as np

import pandas as pd
from pprint import pprint

# Print each row of a wide DataFrame on a single line instead of wrapping the
# columns across several stacked blocks of output.
pd.set_option('display.expand_frame_repr', False)

In [3]:
class SGDParser:
    """Turn the ``.txt`` files of the SDG test set into title/abstract records.

    Every file is read into stripped lines, labelled with an SDG derived from
    its file name and split into a title part and an abstract part by a
    cascade of heuristics (see ``_split_title_abstract``).
    """

    # Code found in the second "_"-separated field of a file name -> SDG label.
    _SDG_BY_CODE = {
        'G1B1': 'SDG7',
        'G1B2': 'SDG10',
        'G1B3': 'SDG12',
        'G2B1': 'SDG5', 'G2B2': 'SDG5', 'G2B3': 'SDG5',
        'G4B1': 'SDG5', 'G4B2': 'SDG5', 'G4B3': 'SDG5',
        'G3B1': 'SDG3', 'G3B2': 'SDG3', 'G3B3': 'SDG3', 'G3B4': 'SDG3',
        'G6B1': 'SDG3', 'G6B2': 'SDG3', 'G6B3': 'SDG3', 'G6B4': 'SDG3',
    }

    # Label given to every code that is not listed in ``_SDG_BY_CODE``.
    _DEFAULT_SDG = 'SDG13'

    # Lower-case line openers taken to mark the first line of the abstract
    # (the list deliberately contains misspellings of "abstract").
    _KEYWORDS = ('abstarct', 'abstract', 'abstratc', 'aim',
                 'background', 'introduction', 'objective',
                 'purpose')

    # Lower-case sentence openers tried when no keyword line is found.
    _KEY_PHRASES = ('this study', 'in this study', 'the study',
                    'this paper', 'in this paper', 'this research',
                    'the objective', 'the aim', 'this report',
                    'the report', 'the purpose of', 'the paper')

    #constructor class
    def __init__(self):
        """The parser keeps no per-instance state."""
        pass

    #parse all the txt files - static function
    @staticmethod
    def get_sdg_files(root='test_set'):
        """
        Collect every ``.txt`` file found below ``root`` (searched recursively).

        :param root: directory to search; defaults to ``'test_set'``
        :return: list of ``Path`` objects, sorted so that the order of the
                 resulting records does not depend on the filesystem
        """
        return sorted(Path(root).glob('**/*.txt'))

    #reads the file and insert its lines into a list
    @staticmethod
    def get_lines(fname: Path):
        """
        Read a text file into a list of whitespace-stripped lines.

        The file is decoded as ``utf-8-sig``, so a leading byte-order mark is
        dropped. Blank lines are kept (as empty strings).

        :param fname: path of the file to read
        :return: list with one stripped string per line of the file
        """
        with open(fname, 'r', encoding='utf-8-sig') as infile:
            return [line.strip() for line in infile]

    #extracts the sdg from the second "_"-separated field of the file name
    @staticmethod
    def extract_sdg(fpath):
        """
        Map the code embedded in a file name to its SDG label.

        Only the final path component is inspected. A name without a second
        ``_``-separated field, or with a code missing from ``_SDG_BY_CODE``,
        receives the default label ``'SDG13'``.

        :param fpath: path (``str`` or ``Path``) of the file
        :return: SDG label such as ``'SDG7'``
        """
        # Path.name copes with either path separator, unlike a manual split.
        fields = Path(fpath).name.split('_')
        code = fields[1] if len(fields) > 1 else None
        return SGDParser._SDG_BY_CODE.get(code, SGDParser._DEFAULT_SDG)

    #function that returns keywords related with the abstract
    @property
    def keywords(self):
        """
        Sorted, lower-case keywords that may open the abstract.

        :return: a new list on every access, so callers may modify it freely
        """
        return list(self._KEYWORDS)

    #function used to lowercase the words in the txt lines
    @staticmethod
    def lines_2_lowercase(lines):
        """
        Lower-case every line.

        :param lines: list of strings
        :return: new list with each string lower-cased
        """
        return [line.lower() for line in lines]

    #function to remove the 'Title:' in the txt files
    @staticmethod
    def remove_title_prefix(title):
        """
        Strip a leading ``'Title:'`` marker and the surrounding whitespace.

        Only the prefix is removed; a later ``'Title:'`` inside the text is
        left alone. Values without the prefix (including ``None`` and ``''``)
        are returned unchanged.

        :param title: extracted title
        :return: title without the leading marker
        """
        prefix = 'Title:'
        if title and title.startswith(prefix):
            return title[len(prefix):].strip()

        return title

    #function to remove the 'Abstract: ' in the txt files
    @staticmethod
    def remove_abstract_prefix(abstract):
        """
        Strip a leading ``'Abstract: '`` marker and the surrounding whitespace.

        Only the prefix is removed; later occurrences inside the text are left
        alone. Values without the prefix (including ``None`` and ``''``) are
        returned unchanged.

        :param abstract: extracted abstract
        :return: abstract without the leading marker
        """
        prefix = 'Abstract: '
        if abstract and abstract.startswith(prefix):
            return abstract[len(prefix):].strip()

        return abstract

    #function to split title and abstract at a line that starts with a given term
    def search_with_keyword(self, term, lowered_lines, lines, index):
        """
        Split ``lines`` at ``index`` when that line starts with ``term``.

        :param term: lower-case prefix to look for; a tuple of prefixes is
                     accepted as well (any of them may match)
        :param lowered_lines: lower-cased copy of ``lines``
        :param lines: original lines of the file
        :param index: position of the candidate first line of the abstract
        :return: ``(doc, matched)``; on a match ``doc`` holds
                 ``'extracted_title'`` (lines before ``index``) and
                 ``'extracted_abstract'`` (lines from ``index`` on), otherwise
                 ``doc`` is empty. An ``index`` outside the file counts as
                 "no match" instead of raising ``IndexError``.
        """
        in_range = -len(lowered_lines) <= index < len(lowered_lines)
        if not in_range or not lowered_lines[index].startswith(term):
            return dict(), False

        doc = {'extracted_title': self.join_lines(lines[:index]),
               'extracted_abstract': self.join_lines(lines[index:])}
        return doc, True

    #function to separate each line with ||
    @staticmethod
    def join_lines(lines):
        """
        Join lines into one string separated by ``' || '``.

        :param lines: list of strings
        :return: the joined string (``''`` for an empty list)
        """
        return ' || '.join(lines)

    #function to split title and abstract at a line that starts with one of the keywords
    def search_in_lines(self, lowered_lines, lines, index):
        """
        Split ``lines`` at ``index`` when that line starts with any keyword.

        :param lowered_lines: lower-cased copy of ``lines``
        :param lines: original lines of the file
        :param index: position of the candidate first line of the abstract
        :return: ``(doc, matched)`` exactly as for ``search_with_keyword``
        """
        # str.startswith accepts a tuple, so one call covers every keyword.
        return self.search_with_keyword(
            tuple(self.keywords), lowered_lines, lines, index)

    def _split_title_abstract(self, lines):
        """Return ``(title, abstract)`` for ``lines`` using the first heuristic that applies."""
        lowered_lines = self.lines_2_lowercase(lines)

        # 1. A line consisting solely of "abstract" separates the two parts
        #    and is itself dropped.
        if 'abstract' in lowered_lines:
            abs_index = lowered_lines.index('abstract')
            return (self.join_lines(lines[:abs_index]),
                    self.join_lines(lines[abs_index + 1:]))

        # 2. One of lines 1-3 starts with a keyword.
        for index in (1, 2, 3):
            found, matched = self.search_in_lines(lowered_lines, lines, index)
            if matched:
                return found['extracted_title'], found['extracted_abstract']

        # 3. Line 1, then line 2, starts with a typical opening phrase.
        for index in (1, 2):
            found, matched = self.search_with_keyword(
                self._KEY_PHRASES, lowered_lines, lines, index)
            if matched:
                return found['extracted_title'], found['extracted_abstract']

        # 4. Fallback: the first line is the title and the rest the abstract.
        #    An empty file yields two empty strings.
        title = lines[0] if lines else ''
        return title, self.join_lines(lines[1:])

    #main function of the class - create the dataset by using the above functions
    def create_sdg_dataset(self, files=None):
        """
        Build one record per file.

        :param files: iterable of file paths to parse; when omitted, the
                      files returned by ``get_sdg_files()`` are used
        :return: list of dicts with the keys ``'sdg'``, ``'extracted_title'``,
                 ``'extracted_abstract'`` and ``'initial_text'`` (all lines of
                 the file joined with ``' || '``)
        """
        if files is None:
            files = self.get_sdg_files()

        data = list()

        for file in files:
            lines = self.get_lines(file)
            title, abstract = self._split_title_abstract(lines)

            data.append({'sdg': self.extract_sdg(file),
                         'extracted_title': self.remove_title_prefix(title),
                         'extracted_abstract': self.remove_abstract_prefix(abstract),
                         'initial_text': self.join_lines(lines)})

        return data


if __name__ == "__main__":
    # Parse every test-set file, then tabulate the records (one row per file).
    parser = SGDParser()
    sdg_records = parser.create_sdg_dataset()
    validation_df = pd.DataFrame(sdg_records)
    print(validation_df)


      sdg                                    extracted_title                                 extracted_abstract                                       initial_text
0    SDG7  Solid oxide fuel cell hybrid system: A detaile...  This paper reports a review of an environmenta...  Solid oxide fuel cell hybrid system: A detaile...
1   SDG10  How do oil prices, macroeconomic factors and p...  The aim of this study is to determine the natu...  How do oil prices, macroeconomic factors and p...
2   SDG10  A comprehensive indicator set for measuring mu...  In this paper, we develop a quantitative indic...  A comprehensive indicator set for measuring mu...
3   SDG10  Foreign direct investment and renewable energy...  Climate change mitigation is a topical issue w...  Foreign direct investment and renewable energy...
4   SDG10  Energy storage and multi energy systems in loc...  This study investigates how a district with a ...  Energy storage and multi energy systems in loc...
..    ...             

In [4]:
# Preview the first 20 parsed records.
validation_df.head(20)

Unnamed: 0,sdg,extracted_title,extracted_abstract,initial_text
0,SDG7,Solid oxide fuel cell hybrid system: A detaile...,This paper reports a review of an environmenta...,Solid oxide fuel cell hybrid system: A detaile...
1,SDG10,"How do oil prices, macroeconomic factors and p...",The aim of this study is to determine the natu...,"How do oil prices, macroeconomic factors and p..."
2,SDG10,A comprehensive indicator set for measuring mu...,"In this paper, we develop a quantitative indic...",A comprehensive indicator set for measuring mu...
3,SDG10,Foreign direct investment and renewable energy...,Climate change mitigation is a topical issue w...,Foreign direct investment and renewable energy...
4,SDG10,Energy storage and multi energy systems in loc...,This study investigates how a district with a ...,Energy storage and multi energy systems in loc...
5,SDG10,A review on alternative fuels in future energy...,Transition and decarbonization of the energy s...,A review on alternative fuels in future energy...
6,SDG12,Resistance in rejecting solid fuels: Beyond av...,Solid fuels are the primary cooking fuels in a...,Resistance in rejecting solid fuels: Beyond av...
7,SDG12,Perpetuating energy poverty: Assessing roadmap...,A growing number of people in Africa still do ...,Perpetuating energy poverty: Assessing roadmap...
8,SDG12,Holistic approach for sustainability enhancing...,A novel approach for sustainability enhancemen...,Holistic approach for sustainability enhancing...
9,SDG12,Increasing self-consumption of renewable energ...,This paper focuses on a novel energy managemen...,Increasing self-consumption of renewable energ...


In [5]:
# Persist the extracted records; the positional row index is not written.
validation_df.to_csv(
    'validation_dataset.csv',
    index=False,
    encoding='utf-8',
)

### Prepare the dataset for evaluation with the created methods  

In [6]:
#import english stopwords - code retrieved from Lab 1
#download the NLTK stopwords corpus into the local nltk_data directory
nltk.download(u'stopwords')

from nltk.corpus import stopwords
#list of English stopwords; read later by preprocess_df to filter tokens
en_stop = stopwords.words('english')

#show the loaded list as the cell output
en_stop

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arisp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [7]:
# Frequent words that do not contribute to classification; they are matched
# as whole words only.
remove_words = [
    'also', 'use', 'using', 'uses', 'used',
    'among', 'analysis', 'data', 'study', 'results',
]
# Non-capturing alternation of the words above, wrapped in word boundaries.
pat = r'\b(?:' + '|'.join(remove_words) + r')\b'

In [8]:
#create a function for preprocessing our data
def preprocess_df(df_col, stop_words=None, remove_pattern=None):
    """Normalise one text value for classification.

    Steps, in order: lower-case the text; delete every character that is not
    ``a``-``z`` or whitespace (this also removes digits and punctuation);
    delete the matches of ``remove_pattern``; drop the tokens listed in
    ``stop_words``; join the remaining tokens with single spaces.

    :param df_col: the text of a single cell (a ``str``), as passed by
                   ``Series.apply``
    :param stop_words: iterable of tokens to drop; defaults to the
                       module-level ``en_stop`` list
    :param remove_pattern: regular expression whose matches are deleted;
                           defaults to the module-level ``pat``
    :return: the cleaned text
    """
    if stop_words is None:
        stop_words = en_stop
    if remove_pattern is None:
        remove_pattern = pat

    # Build the lookup set once per call rather than once per token.
    stop_set = set(stop_words)

    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    text = re.sub(r'[^a-z\s]', '', df_col.lower())
    text = re.sub(remove_pattern, '', text)
    return ' '.join(word for word in text.split() if word not in stop_set)

# Run preprocess_df over each text column of the dataframe, replacing the
# column with its cleaned version, then display the result.
for text_column in ('extracted_title', 'extracted_abstract', 'initial_text'):
    validation_df[text_column] = validation_df[text_column].apply(preprocess_df)
validation_df

Unnamed: 0,sdg,extracted_title,extracted_abstract,initial_text
0,SDG7,solid oxide fuel cell hybrid system detailed r...,paper reports review environmentally clean eff...,solid oxide fuel cell hybrid system detailed r...
1,SDG10,oil prices macroeconomic factors policies affe...,aim determine nature relationship renewable en...,oil prices macroeconomic factors policies affe...
2,SDG10,comprehensive indicator set measuring multiple...,paper develop quantitative indicator approach ...,comprehensive indicator set measuring multiple...
3,SDG10,foreign direct investment renewable energy cli...,climate change mitigation topical issue growin...,foreign direct investment renewable energy cli...
4,SDG10,energy storage multi energy systems local ener...,investigates district high capacity noncontrol...,energy storage multi energy systems local ener...
...,...,...,...,...
94,SDG13,dual stresses flooding agricultural land reduc...,global climate change leading significant incr...,dual stresses flooding agricultural land reduc...
95,SDG13,trend extreme rainfall events suitable global ...,kolkata metropolitan area kma one densely popu...,trend extreme rainfall events suitable global ...
96,SDG13,grounding line ice frontal position coastal ic...,past years satellite remote sensing captured s...,grounding line ice frontal position coastal ic...
97,SDG13,environment stupid values motivations routes e...,exploratory mixedmethods indepth interviews in...,environment stupid values motivations routes e...


In [9]:
# Persist the preprocessed records; the positional row index is not written.
validation_df.to_csv(
    'validation_dataset_v2.csv',
    index=False,
    encoding='utf-8',
)