# Natural Language Processing

In [64]:
import pandas as pd
import nltk
# Fetch the NLTK resources the cells below rely on: the 'punkt' tokenizer
# models (needed by nltk.word_tokenize) and the averaged-perceptron tagger
# model (needed by nltk.pos_tag).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /Users/huino01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/huino01/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

## Basic language processing

### Simple title

In [65]:
# Tokenize a short title that contains a symbol ('&') alongside ordinary words.
culture_and_media_tokens = nltk.word_tokenize("Culture & the Media")

In [66]:
# The ampersand comes out as its own token, next to the three words.
print(culture_and_media_tokens)

['Culture', '&', 'the', 'Media']


### Part-of-speech tagging

The tagger used below labels tokens with the [Penn Treebank tagset](https://www.sketchengine.eu/penn-treebank-tagset/). The tags that appear in this example are:

| POS Tag  | Description              |
|----------|--------------------------|
| CC       | coordinating conjunction |
| NNP      | proper noun, singular    |
| DT       | determiner               |

In [67]:
# Tag every token with its part of speech; the result is a list of
# (token, tag) pairs in the original token order.
culture_and_media_tokens_tagged = nltk.pos_tag(culture_and_media_tokens)
print(culture_and_media_tokens_tagged)

[('Culture', 'NNP'), ('&', 'CC'), ('the', 'DT'), ('Media', 'NNP')]


### Parsing programming-language identifiers

In [68]:
# The word tokenizer does not split a camelCase identifier: it comes back as
# a single token, which is why a dedicated splitter is defined further down.
getWordDesc = nltk.word_tokenize("getWordDescription")
print(getWordDesc)

['getWordDescription']


In [69]:
def java_name_split(text: str, keep_acronyms: bool = False) -> list:
    """Split a camelCase / PascalCase identifier into its component words.

    By default a new fragment starts at every upper-case character, so
    ``"getWordDescription"`` becomes ``['get', 'Word', 'Description']`` and
    ``"ABTest"`` becomes ``['A', 'B', 'Test']``.

    Args:
        text: Identifier to split.
        keep_acronyms: When True, a run of consecutive upper-case characters
            is kept together as one fragment; a capital inside such a run
            starts a new fragment only if a lower-case character follows it
            (``"ABTest"`` -> ``['AB', 'Test']``,
            ``"parseHTTPResponse"`` -> ``['parse', 'HTTP', 'Response']``).
            Defaults to False, i.e. split at every capital.

    Returns:
        The fragments in their original order; concatenating them gives back
        ``text``. An empty string yields an empty list.
    """

    def starts_fragment(i: int) -> bool:
        # Only an upper-case character can open a new fragment.
        if not text[i].isupper():
            return False
        if not keep_acronyms:
            return True
        # Inside a run of capitals, break only where a new word begins:
        # either the run itself starts here, or the next char is lower-case.
        follows_non_upper = not text[i - 1].isupper()
        precedes_lower = i + 1 < len(text) and text[i + 1].islower()
        return follows_non_upper or precedes_lower

    # 0 and len(text) are always boundaries; interior indices start at 1 so
    # a leading capital does not produce a duplicate boundary at 0.
    boundaries = [0]
    boundaries.extend(i for i in range(1, len(text)) if starts_fragment(i))
    boundaries.append(len(text))

    # The start < end guard drops the single empty slice produced for "".
    return [
        text[start:end]
        for start, end in zip(boundaries, boundaries[1:])
        if start < end
    ]

# camelCase: the lower-case prefix is kept as the first fragment.
print(java_name_split("getWordDescription"))

['get', 'Word', 'Description']


In [70]:
# PascalCase: a leading capital does not produce an empty first fragment.
print(java_name_split("NewsArticle"))

['News', 'Article']


In [71]:
# Limitation: every capital starts a new fragment, so the acronym "AB" is
# broken into single letters.
print(java_name_split("ABTest"))

['A', 'B', 'Test']
