In [1]:
import os
os.chdir(os.getcwd() + '/../../')

In [2]:
import nltk

from scripts.utils import *

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\PendragonS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
words = 'The Brown fox jumps over the lazy dog'.split()
nltk.pos_tag(words)

[('The', 'DT'),
 ('Brown', 'NNP'),
 ('fox', 'NN'),
 ('jumps', 'NNS'),
 ('over', 'IN'),
 ('the', 'DT'),
 ('lazy', 'JJ'),
 ('dog', 'NN')]

### Splitting into sentences

In [6]:
text = 'The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress.'

In [7]:
nltk.download('punkt')
splitter = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PendragonS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [8]:
sentences = splitter.tokenize(text)
sentences

['The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018.',
 'In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January.',
 'When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress.']

### Word tokenizing

In [14]:
from nltk import word_tokenize

tokens_all = [word_tokenize(sentence) for sentence in sentences]

print(tokens_all[0])

['The', 'head', 'of', 'a', 'conservative', 'Republican', 'faction', 'in', 'the', 'U.S.', 'Congress', ',', 'who', 'voted', 'this', 'month', 'for', 'a', 'huge', 'expansion', 'of', 'the', 'national', 'debt', 'to', 'pay', 'for', 'tax', 'cuts', ',', 'called', 'himself', 'a', '“', 'fiscal', 'conservative', '”', 'on', 'Sunday', 'and', 'urged', 'budget', 'restraint', 'in', '2018', '.']


### WordNet  tagging

In [15]:
tags_all = []
for tokens in tokens_all:
    sentence_tags = nltk.pos_tag([token.lower() for token in tokens])
    tags_all.append(sentence_tags)

print(tags_all[0])

[('the', 'DT'), ('head', 'NN'), ('of', 'IN'), ('a', 'DT'), ('conservative', 'JJ'), ('republican', 'JJ'), ('faction', 'NN'), ('in', 'IN'), ('the', 'DT'), ('u.s.', 'JJ'), ('congress', 'NN'), (',', ','), ('who', 'WP'), ('voted', 'VBD'), ('this', 'DT'), ('month', 'NN'), ('for', 'IN'), ('a', 'DT'), ('huge', 'JJ'), ('expansion', 'NN'), ('of', 'IN'), ('the', 'DT'), ('national', 'JJ'), ('debt', 'NN'), ('to', 'TO'), ('pay', 'VB'), ('for', 'IN'), ('tax', 'NN'), ('cuts', 'NNS'), (',', ','), ('called', 'VBD'), ('himself', 'PRP'), ('a', 'DT'), ('“', 'JJ'), ('fiscal', 'JJ'), ('conservative', 'JJ'), ('”', 'NN'), ('on', 'IN'), ('sunday', 'NN'), ('and', 'CC'), ('urged', 'VBD'), ('budget', 'NN'), ('restraint', 'NN'), ('in', 'IN'), ('2018', 'CD'), ('.', '.')]


In [16]:
wordnet_tags_all = []
for tags in tags_all:
    wordnet_tags = [(tag[0], convert_to_wordnet_tag(tag[1])) for tag in tags]
    wordnet_tags_all.append(wordnet_tags)

print(wordnet_tags_all[0])

[('the', 'n'), ('head', 'n'), ('of', 'n'), ('a', 'n'), ('conservative', 'a'), ('republican', 'a'), ('faction', 'n'), ('in', 'n'), ('the', 'n'), ('u.s.', 'a'), ('congress', 'n'), (',', 'n'), ('who', 'n'), ('voted', 'v'), ('this', 'n'), ('month', 'n'), ('for', 'n'), ('a', 'n'), ('huge', 'a'), ('expansion', 'n'), ('of', 'n'), ('the', 'n'), ('national', 'a'), ('debt', 'n'), ('to', 'n'), ('pay', 'v'), ('for', 'n'), ('tax', 'n'), ('cuts', 'n'), (',', 'n'), ('called', 'v'), ('himself', 'n'), ('a', 'n'), ('“', 'a'), ('fiscal', 'a'), ('conservative', 'a'), ('”', 'n'), ('on', 'n'), ('sunday', 'n'), ('and', 'n'), ('urged', 'v'), ('budget', 'n'), ('restraint', 'n'), ('in', 'n'), ('2018', 'n'), ('.', 'n')]


### Lemmatization

In [20]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatized_sentences = []
for tags in wordnet_tags_all:
    lemmatized_sentence = ' '.join([lemmatizer.lemmatize(tag[0], pos=tag[1]) for tag in tags])
    lemmatized_sentences.append(lemmatized_sentence)
    
lemmatized_sentences[0]

'the head of a conservative republican faction in the u.s. congress , who vote this month for a huge expansion of the national debt to pay for tax cut , call himself a “ fiscal conservative ” on sunday and urge budget restraint in 2018 .'

### Sentences to document

In [22]:
lemmatized_doc = ' '.join(lemmatized_sentences)

lemmatized_doc

'the head of a conservative republican faction in the u.s. congress , who vote this month for a huge expansion of the national debt to pay for tax cut , call himself a “ fiscal conservative ” on sunday and urge budget restraint in 2018 . in keep with a sharp pivot under way among republican , u.s. representative mark meadow , speak on cbs ’ “ face the nation , ” draw a hard line on federal spending , which lawmaker be brace to do battle over in january . when they return from the holiday on wednesday , lawmaker will begin try to pass a federal budget in a fight likely to be link to other issue , such a immigration policy , even a the november congressional election campaign approach in which republican will seek to keep control of congress .'