In [1]:
import pandas as pd
import re
import unicodedata

In [2]:
# Load the raw export; the unfiltered frame stays available as `df`.
df = pd.read_csv('all_papers_data.csv', encoding='unicode_escape')

# Build the working frame in one chain: pick columns, filter rows, renumber.
df_selected = (
    df[['Id', 'Year', 'Title', 'journal_abbr', 'Abstract', 'Author Keywords', 'Conference name']]
    # keep only rows that have author keywords
    .loc[lambda papers: papers['Author Keywords'].notna()]
    # remove Geographical Analysis due to limited data
    .loc[lambda papers: papers['journal_abbr'] != "GA"]
    # drop rows whose abstract is the '[No abstract available]' placeholder
    .loc[lambda papers: papers['Abstract'] != '[No abstract available]']
    .reset_index(drop=True)
)

In [3]:
# Display the filtered frame as the cell output for a quick visual check.
df_selected

Unnamed: 0,Id,Year,Title,journal_abbr,Abstract,Author Keywords,Conference name
0,273,1999,Multisensor remote sensing data for land use/c...,CEUS,Spaceborne radar data have only recently been ...,Land use/cover mapping; Multisensor; Radar; Se...,
1,274,1999,Acquiring transition rules between multiple re...,CEUS,Multi-scale representation is an issue of grow...,Database generalization; Machine learning; Mod...,
2,275,1999,Accessibility analysis and spatial competition...,CEUS,In this article we discuss different methods t...,Accessibility; Facility siting; Geographic inf...,
3,276,1999,Using a genetic algorithm to generate alternat...,CEUS,This paper describes a tool that can assist th...,Genetic algorithm; Multiobjective programming;...,
4,277,1999,Pathways of smart metering development: Shapin...,CEUS,Utility meters are being transformed from simp...,Energy; Environmental innovation; Pathways; Pr...,
...,...,...,...,...,...,...,...
5472,9172,2017,Spheroidal equal angular DEMs: The specificity...,TGIS,Digital elevation models (DEMs) are commonly c...,computation; Digital elevation model; error; g...,
5473,9173,2017,Context inference and prediction modeling in u...,TGIS,The ever-increasing population in cities inten...,asthma; GIS; prediction; reasoning; ubiquitous...,
5474,9260,2019,A web-based visualization tool for exploring s...,TGIS,This article describes an open source web-base...,conflict mapping; land-use planning; multi-cri...,
5475,9322,2020,Extending Processing Toolbox for assessing the...,TGIS,OpenStreetMap (OSM) produces a huge amount of ...,Logical consistency; OpenStreetMap; PyQGIS; QGIS,


In [4]:
def clean_abstract(abstract):
    """Strip trailing publisher/copyright boilerplate from an abstract.

    The text is split on '.', and the tail is trimmed repeatedly for as long
    as one of the last three fragments either starts with '?' (presumably a
    copyright sign that was lost during decoding -- verify against the data)
    or contains one of the publisher/copyright markers listed below. The
    first fragment is never removed.

    Fragments are kept verbatim, so re-joining them with '.' restores the
    original text: the space after each full stop is preserved and decimals
    such as "0.5" stay intact.

    Parameters
    ----------
    abstract : str
        Raw abstract text. Non-string values (e.g. NaN for a missing
        abstract) are returned unchanged so missing data stays missing.

    Returns
    -------
    str
        The abstract without trailing boilerplate, terminated by a single
        '.'. Empty or whitespace-only input yields ''.
    """
    if not isinstance(abstract, str):
        return abstract

    # Substrings that mark a fragment as publisher/copyright boilerplate.
    markers = ('Elsevier', 'Springer', 'Taylor & Francis', 'Blackwell',
               'John Wiley & Sons', 'Copyright', 'All rights reserved',
               'OPA', 'Gordon and Breach')

    def is_boilerplate(fragment):
        stripped = fragment.strip()
        return stripped.startswith('?') or any(marker in stripped for marker in markers)

    # Drop whitespace-only fragments (e.g. the empty piece after the final
    # full stop) but keep the others untouched, including leading spaces.
    fragments = [part for part in abstract.split('.') if part.strip()]
    if not fragments:
        return ''

    while True:
        # Look at the last fragment first, then the second- and third-to-last.
        # `len(fragments) > depth` guarantees at least one fragment survives.
        for depth in (1, 2, 3):
            if len(fragments) > depth and is_boilerplate(fragments[-depth]):
                # Everything from the matching fragment to the end is boilerplate.
                del fragments[-depth:]
                break
        else:
            # None of the last three fragments matched: the tail is clean.
            break

    # Reconstruct the abstract with its original inter-sentence spacing.
    return '.'.join(fragments).strip() + '.'

In [5]:
# Run the boilerplate stripper over every abstract, then preview the frame.
df_selected['Abstract_Clean'] = df_selected['Abstract'].map(clean_abstract)
df_selected

Unnamed: 0,Id,Year,Title,journal_abbr,Abstract,Author Keywords,Conference name,Abstract_Clean
0,273,1999,Multisensor remote sensing data for land use/c...,CEUS,Spaceborne radar data have only recently been ...,Land use/cover mapping; Multisensor; Radar; Se...,,Spaceborne radar data have only recently been ...
1,274,1999,Acquiring transition rules between multiple re...,CEUS,Multi-scale representation is an issue of grow...,Database generalization; Machine learning; Mod...,,Multi-scale representation is an issue of grow...
2,275,1999,Accessibility analysis and spatial competition...,CEUS,In this article we discuss different methods t...,Accessibility; Facility siting; Geographic inf...,,In this article we discuss different methods t...
3,276,1999,Using a genetic algorithm to generate alternat...,CEUS,This paper describes a tool that can assist th...,Genetic algorithm; Multiobjective programming;...,,This paper describes a tool that can assist th...
4,277,1999,Pathways of smart metering development: Shapin...,CEUS,Utility meters are being transformed from simp...,Energy; Environmental innovation; Pathways; Pr...,,Utility meters are being transformed from simp...
...,...,...,...,...,...,...,...,...
5472,9172,2017,Spheroidal equal angular DEMs: The specificity...,TGIS,Digital elevation models (DEMs) are commonly c...,computation; Digital elevation model; error; g...,,Digital elevation models (DEMs) are commonly c...
5473,9173,2017,Context inference and prediction modeling in u...,TGIS,The ever-increasing population in cities inten...,asthma; GIS; prediction; reasoning; ubiquitous...,,The ever-increasing population in cities inten...
5474,9260,2019,A web-based visualization tool for exploring s...,TGIS,This article describes an open source web-base...,conflict mapping; land-use planning; multi-cri...,,This article describes an open source web-base...
5475,9322,2020,Extending Processing Toolbox for assessing the...,TGIS,OpenStreetMap (OSM) produces a huge amount of ...,Logical consistency; OpenStreetMap; PyQGIS; QGIS,,OpenStreetMap (OSM) produces a huge amount of ...


In [6]:
def clean_special_ch(text: str) -> str:
    """Remove legal symbols, diacritics and invisible characters from text.

    Steps, in order:
      1. delete the registered, trademark and copyright signs;
      2. apply NFKD normalisation, which splits accented letters into a base
         letter plus combining marks;
      3. delete combining diacritical marks and zero-width/invisible
         formatting characters;
      4. collapse whitespace runs to a single space;
      5. delete whitespace that precedes . , ! ? ; :

    Parameters
    ----------
    text : str
        Text to clean. Non-string values (e.g. NaN for a missing abstract)
        are returned unchanged.

    Returns
    -------
    str
        The cleaned text with leading and trailing whitespace removed.
    """
    if not isinstance(text, str):
        return text

    # Strip the legal symbols BEFORE normalising: NFKD expands U+2122 (the
    # trademark sign) into the two letters "TM", which a later symbol filter
    # can no longer recognise and would leave behind in the text.
    text = re.sub(r'[\u00AE\u2122\u00A9]', '', text)

    # Normalize unicode characters
    text = unicodedata.normalize('NFKD', text)

    # Remove combining diacritical marks and invisible characters
    text = re.sub(r'[\u0300-\u036f\u200b-\u200f\u2060-\u2069]', '', text)

    # Remove multiple spaces and clean up
    text = re.sub(r'\s+', ' ', text)

    # Remove spaces before punctuation
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)

    return text.strip()

In [7]:
# Second pass: strip symbols, diacritics and stray whitespace from the cleaned abstracts.
df_selected['Abstract_Clean_CH'] = df_selected['Abstract_Clean'].map(clean_special_ch)

In [8]:
# Write the frame, including the cleaned abstract columns, to CSV without the row index.
df_selected.to_csv('all_papers_clean_abstract.csv',index=False)