# NLTK Examples

## Imports

In [1]:
import nltk

## Utility Functions

In [2]:
def apply_in_list(operation, target_list):
    return list(map(operation,target_list))

def load_file(path):
    with open(path, 'rt') as file_pointer:
        lines = file_pointer.readlines()
        lines = apply_in_list(lambda x: x.strip(), lines)
        return lines

## Reading Data

In [3]:
poems_lines = load_file("../data/raw/poems")
poems_lines

['Theme in Yellow by Carl Sandburg',
 'I spot the hills',
 'With yellow balls in autumn.',
 'I light the prairie cornfields',
 'Orange and tawny gold clusters',
 'And I am called pumpkins.',
 'On the last of October',
 'When dusk is fallen',
 'Children join hands',
 'And circle round me',
 'Singing ghost songs',
 'And love to the harvest moon;',
 'I am...',
 '',
 'October by May Swenson',
 'A smudge for the horizon',
 'that, on a clear day, shows',
 'the hard edge of hills and',
 'buildings on the other coast.',
 'Anchored boats all head one way:',
 'north, where the wind comes from.',
 'You can see the storm inflating',
 'out of the west. A dark hole',
 'in...']

## Tokenizer

### Example 1

In [4]:
tokenizer = nltk.tokenize.RegexpTokenizer("[a-z]")

tokenized_lines_1 = apply_in_list(lambda x: tokenizer.tokenize(x), poems_lines)
tokenized_lines_1

[['h',
  'e',
  'm',
  'e',
  'i',
  'n',
  'e',
  'l',
  'l',
  'o',
  'w',
  'b',
  'y',
  'a',
  'r',
  'l',
  'a',
  'n',
  'd',
  'b',
  'u',
  'r',
  'g'],
 ['s', 'p', 'o', 't', 't', 'h', 'e', 'h', 'i', 'l', 'l', 's'],
 ['i',
  't',
  'h',
  'y',
  'e',
  'l',
  'l',
  'o',
  'w',
  'b',
  'a',
  'l',
  'l',
  's',
  'i',
  'n',
  'a',
  'u',
  't',
  'u',
  'm',
  'n'],
 ['l',
  'i',
  'g',
  'h',
  't',
  't',
  'h',
  'e',
  'p',
  'r',
  'a',
  'i',
  'r',
  'i',
  'e',
  'c',
  'o',
  'r',
  'n',
  'f',
  'i',
  'e',
  'l',
  'd',
  's'],
 ['r',
  'a',
  'n',
  'g',
  'e',
  'a',
  'n',
  'd',
  't',
  'a',
  'w',
  'n',
  'y',
  'g',
  'o',
  'l',
  'd',
  'c',
  'l',
  'u',
  's',
  't',
  'e',
  'r',
  's'],
 ['n',
  'd',
  'a',
  'm',
  'c',
  'a',
  'l',
  'l',
  'e',
  'd',
  'p',
  'u',
  'm',
  'p',
  'k',
  'i',
  'n',
  's'],
 ['n',
  't',
  'h',
  'e',
  'l',
  'a',
  's',
  't',
  'o',
  'f',
  'c',
  't',
  'o',
  'b',
  'e',
  'r'],
 ['h', 'e', 'n', 'd', 'u', '

### Example 2

In [5]:
tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z0-9]+")

tokenized_lines_2 = apply_in_list(lambda x: tokenizer.tokenize(x), poems_lines)
tokenized_lines_2

[['Theme', 'in', 'Yellow', 'by', 'Carl', 'Sandburg'],
 ['I', 'spot', 'the', 'hills'],
 ['With', 'yellow', 'balls', 'in', 'autumn'],
 ['I', 'light', 'the', 'prairie', 'cornfields'],
 ['Orange', 'and', 'tawny', 'gold', 'clusters'],
 ['And', 'I', 'am', 'called', 'pumpkins'],
 ['On', 'the', 'last', 'of', 'October'],
 ['When', 'dusk', 'is', 'fallen'],
 ['Children', 'join', 'hands'],
 ['And', 'circle', 'round', 'me'],
 ['Singing', 'ghost', 'songs'],
 ['And', 'love', 'to', 'the', 'harvest', 'moon'],
 ['I', 'am'],
 [],
 ['October', 'by', 'May', 'Swenson'],
 ['A', 'smudge', 'for', 'the', 'horizon'],
 ['that', 'on', 'a', 'clear', 'day', 'shows'],
 ['the', 'hard', 'edge', 'of', 'hills', 'and'],
 ['buildings', 'on', 'the', 'other', 'coast'],
 ['Anchored', 'boats', 'all', 'head', 'one', 'way'],
 ['north', 'where', 'the', 'wind', 'comes', 'from'],
 ['You', 'can', 'see', 'the', 'storm', 'inflating'],
 ['out', 'of', 'the', 'west', 'A', 'dark', 'hole'],
 ['in']]

## Stemming

### Parsing Data

In [6]:
poems_lines = apply_in_list(lambda x: x.lower(), poems_lines)

tokenizer = nltk.tokenize.RegexpTokenizer("[a-zA-Z0-9]+")

tokenized_lines = apply_in_list(lambda x: tokenizer.tokenize(x), poems_lines)
tokenized_lines

[['theme', 'in', 'yellow', 'by', 'carl', 'sandburg'],
 ['i', 'spot', 'the', 'hills'],
 ['with', 'yellow', 'balls', 'in', 'autumn'],
 ['i', 'light', 'the', 'prairie', 'cornfields'],
 ['orange', 'and', 'tawny', 'gold', 'clusters'],
 ['and', 'i', 'am', 'called', 'pumpkins'],
 ['on', 'the', 'last', 'of', 'october'],
 ['when', 'dusk', 'is', 'fallen'],
 ['children', 'join', 'hands'],
 ['and', 'circle', 'round', 'me'],
 ['singing', 'ghost', 'songs'],
 ['and', 'love', 'to', 'the', 'harvest', 'moon'],
 ['i', 'am'],
 [],
 ['october', 'by', 'may', 'swenson'],
 ['a', 'smudge', 'for', 'the', 'horizon'],
 ['that', 'on', 'a', 'clear', 'day', 'shows'],
 ['the', 'hard', 'edge', 'of', 'hills', 'and'],
 ['buildings', 'on', 'the', 'other', 'coast'],
 ['anchored', 'boats', 'all', 'head', 'one', 'way'],
 ['north', 'where', 'the', 'wind', 'comes', 'from'],
 ['you', 'can', 'see', 'the', 'storm', 'inflating'],
 ['out', 'of', 'the', 'west', 'a', 'dark', 'hole'],
 ['in']]

### Porter Stemming

In [7]:
stemmer = nltk.stem.PorterStemmer()

In [11]:
stemmer.stem("has")

'ha'

In [12]:
for tokenized_line in tokenized_lines:
    print(apply_in_list(stemmer.stem, tokenized_line))

['theme', 'in', 'yellow', 'by', 'carl', 'sandburg']
['i', 'spot', 'the', 'hill']
['with', 'yellow', 'ball', 'in', 'autumn']
['i', 'light', 'the', 'prairi', 'cornfield']
['orang', 'and', 'tawni', 'gold', 'cluster']
['and', 'i', 'am', 'call', 'pumpkin']
['on', 'the', 'last', 'of', 'octob']
['when', 'dusk', 'is', 'fallen']
['children', 'join', 'hand']
['and', 'circl', 'round', 'me']
['sing', 'ghost', 'song']
['and', 'love', 'to', 'the', 'harvest', 'moon']
['i', 'am']
[]
['octob', 'by', 'may', 'swenson']
['a', 'smudg', 'for', 'the', 'horizon']
['that', 'on', 'a', 'clear', 'day', 'show']
['the', 'hard', 'edg', 'of', 'hill', 'and']
['build', 'on', 'the', 'other', 'coast']
['anchor', 'boat', 'all', 'head', 'one', 'way']
['north', 'where', 'the', 'wind', 'come', 'from']
['you', 'can', 'see', 'the', 'storm', 'inflat']
['out', 'of', 'the', 'west', 'a', 'dark', 'hole']
['in']


### Snowball Stemming

In [13]:
stemmer = nltk.stem.SnowballStemmer('english', ignore_stopwords=True)

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  Searched in:
    - '/home/miranda/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/home/miranda/Documents/mestrado/monitoria/python_class/demonstrations/venv/nltk_data'
    - '/home/miranda/Documents/mestrado/monitoria/python_class/demonstrations/venv/share/nltk_data'
    - '/home/miranda/Documents/mestrado/monitoria/python_class/demonstrations/venv/lib/nltk_data'
**********************************************************************


In [None]:
stemmer.stem("women")

In [None]:
for tokenized_line in tokenized_lines:
    print(apply_in_list(stemmer.stem, tokenized_line))

# Gensim Examples

## Imports

In [14]:
import gensim

In [15]:
new_york_corpus = load_file("../data/raw/new_york")
new_york_corpus = apply_in_list(lambda x: tokenizer.tokenize(x.lower()), new_york_corpus)
new_york_corpus

[['the',
  'city',
  'of',
  'new',
  'york',
  'often',
  'called',
  'new',
  'york',
  'city',
  'nyc',
  'or',
  'simply',
  'new',
  'york',
  'ny',
  'is',
  'the',
  'most',
  'populous',
  'city',
  'in',
  'the',
  'united',
  'states',
  'with',
  'an',
  'estimated',
  '2017',
  'population',
  'of',
  '8',
  '622',
  '698',
  'distributed',
  'over',
  'a',
  'land',
  'area',
  'of',
  'about',
  '302',
  '6',
  'square',
  'miles',
  '784',
  'km2',
  'new',
  'york',
  'city',
  'is',
  'also',
  'the',
  'most',
  'densely',
  'populated',
  'major',
  'city',
  'in',
  'the',
  'united',
  'states',
  'located',
  'at',
  'the',
  'southern',
  'tip',
  'of',
  'the',
  'state',
  'of',
  'new',
  'york',
  'the',
  'city',
  'is',
  'the',
  'center',
  'of',
  'the',
  'new',
  'york',
  'metropolitan',
  'area',
  'the',
  'largest',
  'metropolitan',
  'area',
  'in',
  'the',
  'world',
  'by',
  'urban',
  'landmass',
  'and',
  'one',
  'of',
  'the',
  'world',

## Phrases and Phraser

In [16]:
bigram_new_york_phrases = gensim.models.phrases.Phrases(new_york_corpus)
bigram_new_york_phrases

<gensim.models.phrases.Phrases at 0x7f101a599358>

In [26]:
bigram_new_york_phrases[['the', 'city', 'of', 'new', 'york']]

['the', 'city', 'of', 'new_york']

In [20]:
bigram_new_york_phrases[new_york_corpus]

<gensim.interfaces.TransformedCorpus at 0x7f101a5adb00>

## Word2Vec

In [21]:
bigram_new_york_corpus = bigram_new_york_phrases[new_york_corpus]

In [22]:
new_york_word_2_vec = gensim.models.word2vec.Word2Vec(bigram_new_york_corpus, workers=4, sg=1)

In [24]:
new_york_word_2_vec.wv.similar_by_vector("new_york")

[('the', 0.8539616465568542),
 ('in', 0.7426537275314331),
 ('city', 0.7309365272521973),
 ('world', 0.7116335034370422),
 ('and', 0.6912716627120972),
 ('of', 0.6751328110694885),
 ('a', 0.6435642242431641),
 ('area', 0.6405653357505798),
 ('metropolitan', 0.6210491061210632),
 ('s', 0.6185368299484253)]

In [25]:
new_york_word_2_vec.wv['new_york']

array([ 0.00072319,  0.00551608, -0.0093502 ,  0.00344164, -0.0035011 ,
       -0.00381537,  0.00027762,  0.0038802 ,  0.00937217,  0.00313875,
        0.00657089,  0.0069883 , -0.00163839,  0.01002916,  0.0048383 ,
        0.00850541, -0.00472094,  0.00202631,  0.00888059, -0.00606367,
        0.00592949,  0.00377793,  0.0090571 ,  0.00723883, -0.00571491,
        0.01253568,  0.0057293 , -0.00930411, -0.00547404, -0.00225993,
        0.00558491,  0.00621172, -0.00781286, -0.00046917, -0.00426969,
        0.00259661,  0.00475539, -0.00086822,  0.00091551, -0.00057263,
        0.00797326,  0.00142464, -0.00390889,  0.00677817, -0.00465158,
        0.00238811,  0.00083496, -0.00154922, -0.00367749, -0.0071643 ,
       -0.01009379, -0.00652849,  0.00706138,  0.00173057,  0.00513783,
       -0.00229316, -0.0013963 , -0.00347413, -0.00186482, -0.00654995,
        0.00410566,  0.01207789, -0.00821588,  0.00100448,  0.00273469,
        0.00863728, -0.00143037,  0.00056936, -0.00221569, -0.00