In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tag import StanfordNERTagger
import os
import pandas as pd

In [30]:
# Sample headline that serves as the running example for the cells below.
header = "Google announces takeover of Microsoft"

# Break the headline into word-level tokens and display them.
tokens = word_tokenize(header)
tokens

['Google', 'announces', 'takeover', 'of', 'Microsoft']

In [31]:
# Attach a part-of-speech tag to every token; the result is a list of
# (token, tag) pairs, displayed as the cell's output.
tagged = nltk.pos_tag(tokens)
tagged

[('Google', 'NNP'),
 ('announces', 'VBZ'),
 ('takeover', 'NN'),
 ('of', 'IN'),
 ('Microsoft', 'NNP')]

In [32]:
# Tokenise the headline at two granularities: individual words and sentences.
words = word_tokenize(header)
phrases = sent_tokenize(header)

# Show the sentence list first, then the word list, one per line.
print(phrases, words, sep='\n')

['Google announces takeover of Microsoft']
['Google', 'announces', 'takeover', 'of', 'Microsoft']


In [33]:
# Remove English stop words from the tokenised headline.
#
# NLTK's English stop-word list is all lower-case, so each token is compared
# in lower-case; otherwise capitalised stop words (e.g. "The" or "Of" in a
# headline) would slip through the filter. The original casing of the tokens
# that are kept is preserved.
stopWords = set(stopwords.words('english'))
filtered_words = [w for w in words if w.lower() not in stopWords]

print(filtered_words)

['Google', 'announces', 'takeover', 'Microsoft']


In [34]:
# Reduce every token to its Porter stem and print one stem per line.
ps = PorterStemmer()

for stemmed in map(ps.stem, words):
    print(stemmed)

googl
announc
takeov
of
microsoft


In [35]:
# Set this to your own name; it must be one of the keys of `java_paths` below.
user = 'Conor'

# Location of Java on each contributor's machine, keyed by user name.
java_paths = dict(
    Conor='/usr/lib/jvm/java-8-openjdk-amd64',
    Francesco='/Users/macbookpro/Downloads/jdk-13.0.1.jdk/Contents/Home/bin/java',
    Cecilia='C:/Program Files/Java/jdk-13.0.1/bin/java.exe',
)

# Expose the selected Java location through the JAVAHOME environment variable,
# which NLTK consults when it needs to launch Java for the Stanford tagger.
os.environ['JAVAHOME'] = java_paths[user]

# The classifier model and the jar are referenced relative to the notebook
# (the stanford-ner-2014-06-16 folder as laid out in the GitHub repo) rather
# than via a personal Downloads path, so this should run for everyone.
st = StanfordNERTagger(
    'stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    path_to_jar='stanford-ner-2014-06-16/stanford-ner.jar',
    encoding='utf-8',
)

In [36]:
# Tokenise the headline again and run the Stanford NER tagger over the tokens;
# the tagger returns one (token, label) pair per token.
tokenized_text = word_tokenize(header)
classified_text = st.tag(tokenized_text)

# Print a heading followed by one item per line for each list.
print('Tokenized Text:', *tokenized_text, sep='\n')
print('\nClassified Text:', *classified_text, sep='\n')

Tokenized Text:
Google
announces
takeover
of
Microsoft

Classified Text:
('Google', 'ORGANIZATION')
('announces', 'O')
('takeover', 'O')
('of', 'O')
('Microsoft', 'ORGANIZATION')


In [37]:
# A bit of Python magic: unzip the list of (word, label) tuples into two
# parallel tuples. Note that this rebinds `words`, which previously held the
# list produced by word_tokenize, to a tuple with the same tokens.
words, classifications = zip(*classified_text)

# Print a heading followed by one item per line for each sequence.
print('Words:', *words, sep='\n')
print('\nClassifications:', *classifications, sep='\n')

Words:
Google
announces
takeover
of
Microsoft

Classifications:
ORGANIZATION
O
O
O
ORGANIZATION


In [38]:
# Tabulate the NER labels with the words as the row index; the single
# (unnamed) column gets pandas' default label 0.
data = pd.DataFrame(data=classifications, index=words)
data

Unnamed: 0,0
Google,ORGANIZATION
announces,O
takeover,O
of,O
Microsoft,ORGANIZATION
