In [1]:
# pip install nltk 
# from the pop up window install: punkt, Averaged perceptron Tagger
# nltk.download()

In [2]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tag import StanfordNERTagger
import os
import pandas as pd

In [3]:
sentence = "Barclays employees are haunted by a U.K. doctor’s alleged sexual assaults"

tokens = nltk.word_tokenize(sentence)
tokens

['Barclays',
 'employees',
 'are',
 'haunted',
 'by',
 'a',
 'U.K.',
 'doctor',
 '’',
 's',
 'alleged',
 'sexual',
 'assaults']

In [4]:
tagged = nltk.pos_tag(tokens)
tagged

[('Barclays', 'NNS'),
 ('employees', 'NNS'),
 ('are', 'VBP'),
 ('haunted', 'VBN'),
 ('by', 'IN'),
 ('a', 'DT'),
 ('U.K.', 'NNP'),
 ('doctor', 'NN'),
 ('’', 'NNP'),
 ('s', 'NN'),
 ('alleged', 'VBD'),
 ('sexual', 'JJ'),
 ('assaults', 'NNS')]

In [5]:
phrases = sent_tokenize(sentence)
words = word_tokenize(sentence)
 
print(phrases)
print(words)

['Barclays employees are haunted by a U.K. doctor’s alleged sexual assaults']
['Barclays', 'employees', 'are', 'haunted', 'by', 'a', 'U.K.', 'doctor', '’', 's', 'alleged', 'sexual', 'assaults']


In [13]:
stopWords = set(stopwords.words('english'))
filtered_words = [w for w in words if w not in stopWords]
 
print(filtered_words)

['Goldman', 'Sachs', 'employees', 'haunted', 'UK', 'doctors', 'alleged', 'sexual', 'assaults']


In [14]:
ps = PorterStemmer()
 
for word in words:
    print(ps.stem(word))

goldman
sach
employe
are
haunt
by
a
UK
doctor
alleg
sexual
assault


In [15]:
# Francesco's settings
# java_path = "/Users/macbookpro/Downloads/jdk-13.0.1.jdk/Contents/Home/bin/java"

# Enter your username in here. I've put the paths you wrote as an attribute in the dictionary below
user = 'Conor'

# Dictionary containing the Java paths for each user
java_paths = {
    'Conor': '/usr/lib/jvm/java-8-openjdk-amd64',
    'Francesco': '/Users/macbookpro/Downloads/jdk-13.0.1.jdk/Contents/Home/bin/java',
    'Cecilia': 'C:/Program Files/Java/jdk-13.0.1/bin/java.exe'
}

os.environ['JAVAHOME'] = java_paths[user]

st = StanfordNERTagger(
    'stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
    'stanford-ner-2014-06-16/stanford-ner.jar',
    encoding='utf-8'
)

# Previous settings - updated to refer to version as in the GitHub repo so this should run for everyone
# st = StanfordNERTagger('/Users/macbookpro/Downloads/stanford-ner-2014-06-16/classifiers/english.all.3class.distsim.crf.ser.gz',
#                           '/Users/macbookpro/Downloads/stanford-ner-2014-06-16/stanford-ner.jar',
#                         encoding='utf-8')

In [17]:
# -*- coding: utf-8 -*-
text = 'While in France, Christine Lagarde discussed short-term stimulus efforts in a recent interview with the WALL STREET Journal.'
sentence = "Goldman Sachs employees are haunted by a UK doctors alleged sexual assaults"
tokenized_text = word_tokenize(sentence)
classified_text = st.tag(tokenized_text)

print('Tokenized Text:')
for el in tokenized_text:
    print(el)

print('\nClassified Text:')
for el in classified_text:
    print(el)

Tokenized Text:
Goldman
Sachs
employees
are
haunted
by
a
UK
doctors
alleged
sexual
assaults

Classified Text:
('Goldman', 'ORGANIZATION')
('Sachs', 'ORGANIZATION')
('employees', 'O')
('are', 'O')
('haunted', 'O')
('by', 'O')
('a', 'O')
('UK', 'LOCATION')
('doctors', 'O')
('alleged', 'O')
('sexual', 'O')
('assaults', 'O')


In [25]:
# Bit python magic to separate the list of tuples into two separate lists
words, classifications = zip(*classified_text)

print('Words:')
for el in words:
    print(el)
    
print('\nClassifications:')
for el in classifications:
    print(el)

Words:
Goldman
Sachs
employees
are
haunted
by
a
UK
doctors
alleged
sexual
assaults

Classifications:
ORGANIZATION
ORGANIZATION
O
O
O
O
O
LOCATION
O
O
O
O


In [27]:
# Tabulate the NER result, one row per token.
# The second positional argument of pd.DataFrame is the index, so the tokens become
# the row index and the labels end up in a single column named 0 (see the header of
# the output below).
# TODO(review): consider named columns instead, e.g.
#   pd.DataFrame({'token': words, 'ner_tag': classifications}), and re-run.
data = pd.DataFrame(classifications, words)
data

Unnamed: 0,0
Goldman,ORGANIZATION
Sachs,ORGANIZATION
employees,O
are,O
haunted,O
by,O
a,O
UK,LOCATION
doctors,O
alleged,O
