In [4]:
!pip install stanfordnlp

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanfordnlp
  Downloading stanfordnlp-0.2.0-py3-none-any.whl (158 kB)
[K     |████████████████████████████████| 158 kB 5.3 MB/s 
Installing collected packages: stanfordnlp
Successfully installed stanfordnlp-0.2.0


In [5]:
!pip install  bleu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import nltk
from nltk.tokenize import sent_tokenize
import re
import stanfordnlp
from bleu import list_bleu

In [10]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [11]:
stanfordnlp.download('en')

Using the default treebank "en_ewt" for language "en".
Would you like to download the models for: en_ewt now? (Y/n)
Y

Default download directory: /root/stanfordnlp_resources
Hit enter to continue or type an alternate directory.


Downloading models for: en_ewt
Download location: /root/stanfordnlp_resources/en_ewt_models.zip


100%|██████████| 235M/235M [00:40<00:00, 5.84MB/s]



Download complete.  Models saved to: /root/stanfordnlp_resources/en_ewt_models.zip
Extracting models file for: en_ewt
Cleaning up...Done.


In [13]:
# Pipeline for the standford model
# We will be using mwt model

stan_nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos')

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/root/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Done loading processors!
---


In [15]:
# Input text that we need to make proper with correct capitalization and punctuation.

text = "i think that john stone is a nice guy. there is a stone on the grass. i'm fat. are you welcome and smart in london? is this martin's dog?"

In [19]:
# Splitting of paragraph into sentences by using NLTK tokenize function using punkt

sentences = sent_tokenize(text, language='english')
sentences

['i think that john stone is a nice guy.',
 'there is a stone on the grass.',
 "i'm fat.",
 'are you welcome and smart in london?',
 "is this martin's dog?"]

In [21]:
# Capitalizing the first word for every tokenized sentence

capitalized_sentences = []
for sentence in sentences:
  capitalized_sentences.append(sentence.capitalize())

capitalized_sentences

['I think that john stone is a nice guy.',
 'There is a stone on the grass.',
 "I'm fat.",
 'Are you welcome and smart in london?',
 "Is this martin's dog?"]

In [24]:
#text_true = ' '.join(capitalized_sentences)
#text_true

"I think that john stone is a nice guy. There is a stone on the grass. I'm fat. Are you welcome and smart in london? Is this martin's dog?"

In [22]:
# join the capitalized sentences
text_truecase = re.sub(" (?=[\.,'!?:;])", "", ' '.join(capitalized_sentences))
text_truecase

"I think that john stone is a nice guy. There is a stone on the grass. I'm fat. Are you welcome and smart in london? Is this martin's dog?"

In [28]:
# Capitalizing the words according to part of speech using Stanfordnlp Pipelines

text_doc = stan_nlp(text_truecase)
text_doc

<stanfordnlp.pipeline.doc.Document at 0x7ff9c25b80d0>

In [34]:
# What stanford pipeline does is that split the sentences based on POS (Verb, Proper Noun, etc.)
# https://universaldependencies.org/u/pos/

for sent in text_doc.sentences:
  for token in sent.tokens:
    print(token)

<Token index=1;words=[<Word index=1;text=I;upos=PRON;xpos=PRP;feats=Case=Nom|Number=Sing|Person=1|PronType=Prs>]>
<Token index=2;words=[<Word index=2;text=think;upos=VERB;xpos=VBP;feats=Mood=Ind|Tense=Pres|VerbForm=Fin>]>
<Token index=3;words=[<Word index=3;text=that;upos=SCONJ;xpos=IN;feats=_>]>
<Token index=4;words=[<Word index=4;text=john;upos=PROPN;xpos=NNP;feats=Number=Sing>]>
<Token index=5;words=[<Word index=5;text=stone;upos=PROPN;xpos=NNP;feats=Number=Sing>]>
<Token index=6;words=[<Word index=6;text=is;upos=AUX;xpos=VBZ;feats=Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin>]>
<Token index=7;words=[<Word index=7;text=a;upos=DET;xpos=DT;feats=Definite=Ind|PronType=Art>]>
<Token index=8;words=[<Word index=8;text=nice;upos=ADJ;xpos=JJ;feats=Degree=Pos>]>
<Token index=9;words=[<Word index=9;text=guy;upos=NOUN;xpos=NN;feats=Number=Sing>]>
<Token index=10;words=[<Word index=10;text=.;upos=PUNCT;xpos=.;feats=_>]>
<Token index=1;words=[<Word index=1;text=There;upos=PRON;xpos=EX;f

In [41]:
# Now we need to capitalize the all the Proper Nouns.
# NNS ?

final_words = []
for sent in text_doc.sentences:
  for word in sent.words:
    if word.upos in ['PROPN','NNS']:
      final_words.append(word.text.capitalize())
    else:
      final_words.append(word.text)

final_words

['I',
 'think',
 'that',
 'John',
 'Stone',
 'is',
 'a',
 'nice',
 'guy',
 '.',
 'There',
 'is',
 'a',
 'stone',
 'on',
 'the',
 'grass',
 '.',
 'I',
 "'m",
 'fat',
 '.',
 'Are',
 'you',
 'welcome',
 'and',
 'smart',
 'in',
 'London',
 '?',
 'Is',
 'this',
 'Martin',
 "'s",
 'dog',
 '?']

In [43]:
# Now new can join the word to get the final string

output_sentence = ' '.join(final_words)
output_sentence

"I think that John Stone is a nice guy . There is a stone on the grass . I 'm fat . Are you welcome and smart in London ? Is this Martin 's dog ?"