In [1]:
# packages needed
# !pip install nltk
# !pip install stanfordnlp
# !pip install --upgrade bleu
import nltk
from nltk.tokenize import sent_tokenize
import re
import stanfordnlp
from bleu import list_bleu

import nltk
from nltk.corpus import brown
from nltk.tokenize import sent_tokenize, word_tokenize

nltk.download("brown")
nltk.download("punkt")

[nltk_data] Downloading package brown to /Users/lukas/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lukas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def test_accuracy(truecased_words, original_words, verbose=True):
    correct = sum(
        1 for orig, true in zip(original_words, truecased_words) if orig == true
    )
    accuracy = correct / len(original_words)
    if verbose:
        print(f"Accuracy of casing: {accuracy}")
    return accuracy

In [3]:
# init packages
# nltk.download("punkt")
# nltk.download("averaged_perceptron_tagger")
# stanfordnlp.download("en")
stf_nlp = stanfordnlp.Pipeline(processors="tokenize,mwt,pos")


# function for restoring capitalization
def truecasing(input_text):
    """
    Achieves 86.71% as a measure of the quality of truecasing repair on YELP Kagggle dataset
    """
    # split the text into sentences
    sentences = sent_tokenize(input_text, language="english")
    # capitalize the sentences
    sentences_capitalized = [s.capitalize() for s in sentences]
    # join the capitalized sentences
    text_truecase = re.sub(" (?=[\.,'!?:;])", "", " ".join(sentences_capitalized))
    # capitalize words according to part-of-speech tagging (POS)
    doc = stf_nlp(text_truecase)
    text_truecase = " ".join(
        [
            w.text.capitalize() if w.upos in ["PROPN", "NNS"] else w.text
            for sent in doc.sentences
            for w in sent.words
        ]
    )
    text_truecase = re.sub(r'\s([?.!"](?:\s|$))', r"\1", text_truecase)
    # Remove extra spaces before punctuation marks
    text_truecase = re.sub(r'\s+([?.!-,:;"])', r"\1", text_truecase)
    return text_truecase

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/lukas/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/lukas/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/lukas/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}


Exception: Vector file is not provided.

In [None]:
text = "Some of the banking sector tumult stems from the Fed’s rapid interest rate increases over the past year. Central bankers are expected to lift rates to just above 5 percent this week, up from near-zero as recently as March 2022. After that quick series of adjustments, many lenders are facing losses on older securities and loans, which pay relatively low interest rates compared with newer securities issued in a higher-rate world."
lower = text.lower()

pred = truecasing(text)

print(text)
print(pred)

Some of the banking sector tumult stems from the Fed’s rapid interest rate increases over the past year. Central bankers are expected to lift rates to just above 5 percent this week, up from near-zero as recently as March 2022. After that quick series of adjustments, many lenders are facing losses on older securities and loans, which pay relatively low interest rates compared with newer securities issued in a higher-rate world.
Some of the banking sector tumult stems from the fed ’s rapid interest rate increases over the past year. Central bankers are expected to lift rates to just above 5 percent this week, up from near - zero as recently as March 2022. After that quick series of adjustments, many lenders are facing losses on older securities and loans, which pay relatively low interest rates compared with newer securities issued in a higher - rate world.


In [None]:
import truecase


def truecase_truecase(input_text):
    return truecase.get_true_case(input_text)

In [None]:
truecase_truecase(text)

"I think that John stone is a nice guy . There is a stone on the grass . I'm fat . are you welcome and smart in London? is this Martin's dog?"

In [None]:
text = "Mr. Powell could offer some signal during his news conference, or he could opt to leave the Fed’s options open — which is what some economists expect."
lower = text.lower()
pred = truecase_truecase(lower)
print(pred)

test_accuracy(pred.split(), text.split())

Mr. Powell could offer some signal during his news conference, or he could opt to leave the Fed ’ s options open — which is what some economists expect.
Accuracy of casing: 0.6296296296296297


0.6296296296296297

In [None]:
pred2 = truecasing(lower)
test_accuracy(pred2.split(), text.split())

text truecase is Mr. Powell could offer some signal during his news conference , or he could opt to leave the fed ’s options open — which is what some economists expect .
after sub is Mr. Powell could offer some signal during his news conference , or he could opt to leave the fed ’s options open — which is what some economists expect.
Accuracy of casing: 0.3333333333333333


0.3333333333333333

In [None]:
from nltk.tokenize import sent_tokenize
import re


def truecasing_by_sentence_segmentation(input_text):
    # split the text into sentences
    sentences = sent_tokenize(input_text, language="english")
    # capitalize the sentences
    sentences_capitalized = [s.capitalize() for s in sentences]
    # join the capitalized sentences
    text_truecase = re.sub(" (?=[\.,'!?:;])", "", " ".join(sentences_capitalized))
    return text_truecase


truecasing_by_sentence_segmentation(text)
"I think that john stone is a nice guy. There is a stone on the grass. I'm fat. Are you welcome and smart in london? Is this martin's dog?"

"I think that john stone is a nice guy. There is a stone on the grass. I'm fat. Are you welcome and smart in london? Is this martin's dog?"

In [None]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag


def truecasing_by_pos(input_text):
    # tokenize the text into words
    words = nltk.word_tokenize(text)
    # apply POS-tagging on words
    tagged_words = nltk.pos_tag([word.lower() for word in words])
    # apply capitalization based on POS tags
    capitalized_words = [
        w.capitalize() if t in ["NN", "NNS"] else w for (w, t) in tagged_words
    ]
    # capitalize first word in sentence
    capitalized_words[0] = capitalized_words[0].capitalize()
    # join capitalized words
    text_truecase = re.sub(" (?=[\.,'!?:;])", "", " ".join(capitalized_words))
    return text_truecase


truecasing_by_pos(text)

'Mr. Powell could offer some Signal during his News Conference, or he could opt to leave the Fed ’ S Options open — which is what some Economists expect.'