import libraries

In [2]:
import json
import re
import collections
import typing
import pandas

configure paths

In [3]:
DATA_PATH = "../data/task2/Sustainability_sentences_train.json"

helper functions

In [4]:
def label_str_to_int(text):
    """
    Convet `str` labels to `int`
    """
    if text == "sustainable":
        return 1
    elif text == "unsustainable":
        return 0
    else:
        raise(f"Label `{text}` is nor recognized.")
        
def count_characters(text, char):
    """
    Count how many times a character appears in a text
    """
    counter = 0
    for c in text:
        if c == char:
            counter += 1
    return counter

read in dataset

In [5]:
with open(DATA_PATH, "r") as json_in:
    records = [(ex["sentence"], label_str_to_int(ex["label"])) for ex in json.load(json_in)]

data = pandas.DataFrame.from_records(data=records, columns=["text", "target"])

In [27]:
data.to_csv(
    path_or_buf="../data/task2/data.tsv",
    sep="\t",
    index=False
)

In [6]:
data.shape

(2265, 2)

In [7]:
data["target"].value_counts(normalize=True)

1    0.539956
0    0.460044
Name: target, dtype: float64

In [8]:
data.head()

Unnamed: 0,text,target
0,"For example, our portfolio companies have set ...",1
1,We continue to increase the percentage of rene...,1
2,"Acea Ambiente, the Group Company operating in ...",1
3,Carbon intensity is measured as tons of CO2 pe...,0
4,"More circular, efficient production and transp...",0


In [9]:
data["text"].apply(lambda x: len(x.split())).describe()

count    2265.000000
mean       24.874172
std        12.816577
min         6.000000
25%        17.000000
50%        22.000000
75%        30.000000
max       138.000000
Name: text, dtype: float64

In [10]:
[record[0] for record in records if " & " in record[0].lower()]

['To start, we have developed intermediate Paris-aligned targets to reduce the carbon intensity in our Oil & Gas, Electric Power and Auto Manufacturing portfolios by 2030.',
 'VGL is a Polish asset‑light, freight forwarding and logistics holding comprising of a number of entities operating in several segments of transport & logistic services.',
 'Studies supporting this view include not only the EAT-Lancet study, but also others such as Rockstrom et al., 2020, “Planet- proofing the global food system;” Searchinger et al, 2018, “Creating a sustainable food future: a menu of solutions to feed nearly ten billion people by 2050;” and Gao & Bryan, 2017, “Finding Pathways to national-scale land-sector sustainability.”',
 'As an office-based business, Rothschild & Co does not consider environmental discharges to air, water, soil or indeed noise pollution to be of material environmental risk.',
 'The top social issues of concern were water & food security, inequality and employment security.',

In [11]:
vocab = collections.defaultdict(int)
for sentence in data["text"].values:
    for token in sentence.split():
        vocab[token] += 1

In [12]:
len(vocab)

9353

text cleaning methods
- [x] add space before/after special characters
- [x] add space before/after comma, except when surrounded by digits
- [x] add space before/after period, if it end of string
- [x] lowercasing
- dash surrounded by spaces: remove it
- dash preceeded by token but followed by space: elimnate space
- split possessinve to word + possessive

In [13]:
def remove_final_period(doc: str) -> str: 
    """
    Remove final period character from string
    """
    return doc[:-1] if doc[-1] == "." else doc

def lowercase_text(doc: str) -> str:
    """
    Convert text to lowercase
    """
    return doc.lower()

def add_space_around_string(doc: str, string_list: typing.List[str]) -> str:
    """
    Add a space character around each string in a list of strings
    """
    match = re.compile("(" + "|\\".join(string_list) + ")")
    return match.sub(r" \1 ", doc)

def add_space_around_comma(doc: str) -> str:
    """
    Add a space around commas, unless they are digit separators
    """
    n = len(doc)
    new_doc = ""
    for i, c in enumerate(doc):
        if c != ",":
            new_doc += c
        else:
            if i == 0 or i == n -1 :
                new_doc += ""
            else:
                if doc[i-1].isdigit() and doc[i+1].isdigit():
                    new_doc += c
                else:
                    new_doc += " " + c + " " 
    return new_doc

def transform_doc_with_mapper(doc: str, mapper: typing.Dict[str, str]) -> str:
    """
    Replace substrings in a document according to a predetermined mapping
    """
    for k in mapper:
        match = re.compile(k)
        doc = match.sub(mapper[k], doc)
    return doc

In [25]:
ex = "John - Chris are \xad the best of friends Climate Action 100+"
transform_doc_with_mapper(ex, string_map)

'John Chris are the best of friends CA100++'

In [14]:
add_space_around_comma(", John, our professor, is a rockstar")

' John ,  our professor ,  is a rockstar'

In [192]:
sorted_vocab = sorted([(w, c) for (w,c) in vocab.items()], key=lambda x: x[1], reverse=True)

In [24]:
special_characters = [
    "“",
    "”",
    ":",
    "(",
    ")",
    "[",
    "]",
    ";",
    "?",
    "‘",
    "’",
    ">",
    "\"",
    "€",
    "$"
]

possessive_chars = [
    "’" # multiple special cases here
]

character_map = {
    "–" : "-",
    "§" : "",
    "*" : "",
    "®" : "",
    "–" : "-",
    "—" : "-",
    "‑" : "-",
    "‐" : "-",
}

string_map = {
    "\xad" : "-",
    "Climate Action 100+" : "CA100+",
    " & ": " and ",
    " - " : " ",
    "°C" : " °C ",
}

In [158]:
[
    (w,c) for (w,c) in sorted_vocab 
    if not w.isalnum() 
    and "," not in w # done
    and "." not in w # done
    and "(" not in w # done 
    and ")" not in w # done
    and "%" not in w # done
    and "$" not in w # done
    and "-" not in w
    and "\'" not in w
    and "/" not in w
    and "[" not in w # done
    and "“" not in w # done
    and ":" not in w # done
    and ";" not in w # done
    and "’" not in w # done
    and "–" not in w # done
    and "—" not in w # done
    and "‑" not in w # done
    and "‐" not in w # done
    and "‘" not in w # done
    and "”" not in w # done
    and "?" not in w # done
    and "€" not in w # done
    and "]" not in w # done
    and "\"" not in w # done
    and "\xad" not in w # done
    and "®" not in w # done
    and "&" not in w # done
    and "+" not in w # done
    and "°C" not in w # done
    and ">" not in w # done
]

[('t*km', 1), ('§', 1)]

In [213]:
[
    (w,c) for (w,c) in sorted_vocab 
    if w.isdigit()
][-300:]

[('2', 118),
 ('3', 113),
 ('1', 86),
 ('2020', 50),
 ('2019', 33),
 ('2018', 14),
 ('100', 11),
 ('2030', 11),
 ('2016', 10),
 ('2015', 10),
 ('20', 9),
 ('2021', 9),
 ('5', 9),
 ('15', 9),
 ('2025', 8),
 ('10', 8),
 ('2017', 7),
 ('30', 7),
 ('2014', 6),
 ('70', 5),
 ('21', 5),
 ('4', 5),
 ('80', 5),
 ('60', 5),
 ('2024', 5),
 ('50', 5),
 ('12', 5),
 ('2050', 5),
 ('19', 4),
 ('40', 4),
 ('000', 4),
 ('25', 4),
 ('17', 4),
 ('2013', 4),
 ('2004', 4),
 ('90', 3),
 ('14', 3),
 ('245', 3),
 ('2002', 3),
 ('8', 3),
 ('6', 3),
 ('52', 3),
 ('2010', 3),
 ('2008', 2),
 ('2040', 2),
 ('11', 2),
 ('300', 2),
 ('16', 2),
 ('31', 2),
 ('555', 2),
 ('117', 2),
 ('634', 2),
 ('2006', 2),
 ('2005', 2),
 ('179', 2),
 ('2060', 2),
 ('800', 2),
 ('216', 2),
 ('500', 2),
 ('250', 2),
 ('146', 2),
 ('95', 2),
 ('38', 1),
 ('2007', 1),
 ('29', 1),
 ('67', 1),
 ('109', 1),
 ('77', 1),
 ('439', 1),
 ('84', 1),
 ('2023', 1),
 ('1985', 1),
 ('284', 1),
 ('777', 1),
 ('75', 1),
 ('700', 1),
 ('50001', 1),
 (