# Informationen aus einem Text extrahieren

## Einteilung in Sätze und Wörter

In [2]:
import nltk
# Sample input: two short English sentences.
text = "This is an example. The example is really simple."

# Split the whole text into word and punctuation tokens.
word_tokens = nltk.word_tokenize(text)

# Wrap the token list in an outer list, so `tokenized` is a list of token lists.
tokenized = [word_tokens]
print(tokenized)

[['This', 'is', 'an', 'example', '.', 'The', 'example', 'is', 'really', 'simple', '.']]


## Erkennung von Wortarten
- DT -> Determinativ (z.B. Artikel)

- VB* -> Verben in versch. Formen, z.B. VBZ, VBN

- NN -> Nomen

- RB* -> Adverbien

- JJ -> Adjektiv

In [29]:
# Print all available part-of-speech tags of the Penn Treebank tagset.
# upenn_tagset() writes the listing to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [4]:
# Tag every token list with part-of-speech tags. Each entry of `speech_tags`
# is a list of (token, tag) pairs for the corresponding entry of `tokenized`.
speech_tags = list(map(nltk.pos_tag, tokenized))
print(speech_tags)

[[('This', 'DT'), ('is', 'VBZ'), ('an', 'DT'), ('example', 'NN'), ('.', '.'), ('The', 'DT'), ('example', 'NN'), ('is', 'VBZ'), ('really', 'RB'), ('simple', 'JJ'), ('.', '.')]]


## Chunking / Einheiten gliedern

In [5]:
# Chunk grammar: a noun phrase (NP) is an optional determiner, followed by any
# number of adjectives, followed by a noun.
grammar = "NP: {<DT>?<JJ>*<NN>}"

# Build a chunk parser from the grammar.
cp = nltk.RegexpParser(grammar)

# Chunk the tagged example (the first and only token list).
tagged_example = speech_tags[0]
result = cp.parse(tagged_example)

# Print the resulting chunk tree and draw it.
print(result)
result.draw()

(S
  This/DT
  is/VBZ
  (NP an/DT example/NN)
  ./.
  (NP The/DT example/NN)
  is/VBZ
  really/RB
  simple/JJ
  ./.)


In [31]:
# A different chunk pattern: a determiner immediately followed by an adjective.
cp = nltk.RegexpParser('CHUNK: {<DT> <JJ>}')
brown = nltk.corpus.brown

# Parse every tagged sentence of the Brown corpus and print each subtree that
# the parser labelled as CHUNK.
for sentence in brown.tagged_sents():
    tree = cp.parse(sentence)
    for subtree in tree.subtrees(filter=lambda t: t.label() == 'CHUNK'):
        print(subtree)

(CHUNK each/DT medical/JJ)
(CHUNK this/DT disturbing/JJ)
(CHUNK another/DT major/JJ)
(CHUNK each/DT additional/JJ)
(CHUNK this/DT historic/JJ)
(CHUNK this/DT historic/JJ)
(CHUNK this/DT two-year/JJ)
(CHUNK another/DT splendid/JJ)
(CHUNK another/DT all-out/JJ)
(CHUNK another/DT 5-run/JJ)
(CHUNK this/DT early/JJ)
(CHUNK That/DT darlin'/JJ)
(CHUNK This/DT mad/JJ)
(CHUNK that/DT crazy/JJ)
(CHUNK this/DT vast/JJ)
(CHUNK that/DT bleak/JJ)
(CHUNK that/DT magic/JJ)
(CHUNK this/DT fine/JJ)
(CHUNK this/DT little/JJ)
(CHUNK this/DT regrettable/JJ)
(CHUNK that/DT historic/JJ)
(CHUNK This/DT year-to-year/JJ)
(CHUNK this/DT tremendous/JJ)
(CHUNK That/DT 60-day/JJ)
(CHUNK this/DT cultural/JJ)
(CHUNK that/DT Mexican/JJ)
(CHUNK another/DT real/JJ)
(CHUNK that/DT old/JJ)
(CHUNK this/DT three-hour/JJ)
(CHUNK This/DT excellent/JJ)
(CHUNK this/DT broad/JJ)
(CHUNK that/DT oft-repeated/JJ)
(CHUNK this/DT untrammeled/JJ)
(CHUNK that/DT congressional/JJ)
(CHUNK each/DT new/JJ)
(CHUNK this/DT remarkable/JJ)
(CH

(CHUNK This/DT new/JJ)
(CHUNK that/DT senior/JJ)
(CHUNK this/DT strong/JJ)
(CHUNK this/DT new/JJ)
(CHUNK this/DT new/JJ)
(CHUNK This/DT bold/JJ)
(CHUNK this/DT crass/JJ)
(CHUNK this/DT phenomenal/JJ)
(CHUNK this/DT vital/JJ)
(CHUNK this/DT fine/JJ)
(CHUNK this/DT life-death/JJ)
(CHUNK this/DT brief/JJ)
(CHUNK this/DT sexual/JJ)
(CHUNK this/DT disaffiliated/JJ)
(CHUNK This/DT strange/JJ)
(CHUNK this/DT little/JJ)
(CHUNK this/DT unknown/JJ)
(CHUNK this/DT terrible/JJ)
(CHUNK that/DT great/JJ)
(CHUNK that/DT fair/JJ)
(CHUNK this/DT final/JJ)
(CHUNK That/DT little/JJ)
(CHUNK That/DT little/JJ)
(CHUNK This/DT restless/JJ)
(CHUNK that/DT desperate/JJ)
(CHUNK this/DT unpromising/JJ)
(CHUNK this/DT curious/JJ)
(CHUNK this/DT idyllic/JJ)
(CHUNK this/DT fatal/JJ)
(CHUNK this/DT fatal/JJ)
(CHUNK This/DT organizational/JJ)
(CHUNK this/DT early/JJ)
(CHUNK that/DT utopian/JJ)
(CHUNK This/DT favorable/JJ)
(CHUNK this/DT new/JJ)
(CHUNK this/DT human/JJ)
(CHUNK this/DT natural/JJ)
(CHUNK this/DT questi

(CHUNK that/DT dear/JJ)
(CHUNK this/DT lonely/JJ)
(CHUNK Each/DT successive/JJ)
(CHUNK that/DT short/JJ)
(CHUNK that/DT dry/JJ)
(CHUNK this/DT broken-nosed/JJ)
(CHUNK this/DT half-grown/JJ)
(CHUNK another/DT important/JJ)
(CHUNK this/DT big/JJ)
(CHUNK that/DT fool/JJ)
(CHUNK that/DT dull/JJ)
(CHUNK another/DT sidelong/JJ)
(CHUNK that/DT threatening/JJ)
(CHUNK another/DT dim/JJ)
(CHUNK this/DT underground/JJ)
(CHUNK another/DT hot/JJ)
(CHUNK that/DT Yankee/JJ)
(CHUNK another/DT curious/JJ)
(CHUNK this/DT early/JJ)
(CHUNK this/DT startling/JJ)
(CHUNK this/DT unaccountable/JJ)
(CHUNK that/DT ridiculous/JJ)
(CHUNK that/DT childish/JJ)
(CHUNK that/DT crazy/JJ)
(CHUNK that/DT young/JJ)
(CHUNK that/DT old/JJ)
(CHUNK that/DT simple/JJ)
(CHUNK that/DT tall/JJ)
(CHUNK That/DT damn/JJ)
(CHUNK this/DT past/JJ)
(CHUNK this/DT alien/JJ)
(CHUNK this/DT big/JJ)
(CHUNK this/DT typical/JJ)
(CHUNK that/DT particular/JJ)
(CHUNK that/DT theatrical/JJ)
(CHUNK that/DT curious/JJ)
(CHUNK this/DT sad/JJ)
(CHUN

### Chinking - das Gegenteil von Chunking 

In [40]:
# Define a grammar that first chunks everything as NP and then excludes
# ("chinks") certain tag sequences from being part of an NP:
#   {<.*>+}             chunk rule: put every token into one NP chunk
#   }<DT> <VBZ>{        chink rule: remove determiner + VBZ verb
#   }<VBZ> <RB> <JJ>{   chink rule: remove VBZ verb + adverb + adjective
#   }<.>{               chink rule: remove the sentence-final "." tokens
# NOTE(review): in NLTK tag patterns "." appears to act as a wildcard, so <.>
# would match any one-character tag, not only "." -- confirm whether <\.> was
# intended.
grammar = r"""
            NP: 
                {<.*>+} 
                }<DT> <VBZ>{
                }<VBZ> <RB> <JJ>{
                }<.>{
        """

# Build a chunk parser from the grammar defined above
cp = nltk.RegexpParser(grammar)

# Parse the tagged example (first entry of speech_tags)
result = cp.parse(speech_tags[0])

# Print the result and draw it as a tree
print(result)
result.draw()

(S
  This/DT
  is/VBZ
  (NP an/DT example/NN)
  ./.
  (NP The/DT example/NN)
  is/VBZ
  really/RB
  simple/JJ
  ./.)


### Entwickeln von Chunkers

In [44]:
from nltk.corpus import conll2000
# Training data set: load the sentences with only NP chunks marked and show
# the one at index 99 as an example.
np_chunked_train = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
print(np_chunked_train[99])

(S
  Over/IN
  (NP a/DT cup/NN)
  of/IN
  (NP coffee/NN)
  ,/,
  (NP Mr./NNP Stone/NNP)
  told/VBD
  (NP his/PRP$ story/NN)
  ./.)


#### Keine Chunking-Regeln

In [51]:
# CoNLL-2000 training and test sentences, restricted to NP chunks.
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
test_sents = conll2000.chunked_sents('test.txt', chunk_types=['NP'])

# Baseline: a parser with an empty grammar (no chunking rules at all),
# evaluated on the test sentences.
cp = nltk.RegexpParser("")
baseline_score = cp.evaluate(test_sents)
print(baseline_score)

ChunkParse score:
    IOB Accuracy:  43.4%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


#### Einfache Grammatik

In [47]:
# Evaluate a simple NP grammar (optional determiner, any number of adjectives,
# then a noun) on the test sentences.
simple_np_grammar = "NP: {<DT>?<JJ>*<NN>}"
cp = nltk.RegexpParser(simple_np_grammar)
simple_grammar_score = cp.evaluate(test_sents)
print(simple_grammar_score)

ChunkParse score:
    IOB Accuracy:  59.7%%
    Precision:     45.3%%
    Recall:        24.2%%
    F-Measure:     31.6%%


#### Unigram Tagger
--> Statistische Analyse von Trainingsdaten

In [73]:
class UnigramChunker(nltk.ChunkParserI):
    """Chunker that assigns a chunk tag to each token based on its POS tag.

    A unigram tagger is trained on (POS tag, chunk tag) pairs taken from
    chunked training sentences and is then used to predict a chunk tag for
    every POS tag of a new sentence.
    """

    def __init__(self, train_sents):
        """Train the underlying unigram tagger.

        train_sents: iterable of chunk trees; each one is converted with
            nltk.chunk.tree2conlltags into (word, POS tag, chunk tag) triples.
        """
        # Keep only the (POS tag, chunk tag) part of every triple; the word
        # itself is not used for training.
        train_data = [
            [(pos, chunk) for _word, pos, chunk in nltk.chunk.tree2conlltags(tree)]
            for tree in train_sents
        ]

        # Hand the training data over to the UnigramTagger.
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        sentence: sequence of (word, POS tag) pairs.
        Returns the tree built by nltk.chunk.conlltags2tree from the
        (word, POS tag, chunk tag) triples.
        """
        # Only the POS tags are fed to the trained tagger.
        pos_tags = [pos for _word, pos in sentence]

        # The tagger returns (POS tag, chunk tag) pairs; keep the chunk tags.
        chunktags = [chunk for _pos, chunk in self.tagger.tag(pos_tags)]

        # Re-attach the predicted chunk tags to the original (word, POS) pairs.
        conlltags = [
            (word, pos, chunk)
            for (word, pos), chunk in zip(sentence, chunktags)
        ]

        return nltk.chunk.conlltags2tree(conlltags)

# Train the unigram chunker on the training sentences.
unigram_chunker = UnigramChunker(train_sents)

# Evaluate it on the test sentences and show the score.
unigram_score = unigram_chunker.evaluate(test_sents)
print(unigram_score)

ChunkParse score:
    IOB Accuracy:  92.9%%
    Precision:     79.9%%
    Recall:        86.8%%
    F-Measure:     83.2%%


## Named entities

In [18]:
# Take one POS-tagged sentence (index 22) from the treebank corpus.
sent = nltk.corpus.treebank.tagged_sents()[22]

# Run the named-entity chunker on it; with binary=False the printed tree labels
# entities by type (e.g. PERSON, ORGANIZATION, GPE).
ne_tree = nltk.ne_chunk(sent, binary=False)
print(ne_tree)

(S
  The/DT
  (GPE U.S./NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  few/JJ
  industrialized/VBN
  nations/NNS
  that/WDT
  *T*-7/-NONE-
  does/VBZ
  n't/RB
  have/VB
  a/DT
  higher/JJR
  standard/NN
  of/IN
  regulation/NN
  for/IN
  the/DT
  smooth/JJ
  ,/,
  needle-like/JJ
  fibers/NNS
  such/JJ
  as/IN
  crocidolite/NN
  that/WDT
  *T*-1/-NONE-
  are/VBP
  classified/VBN
  *-5/-NONE-
  as/IN
  amphobiles/NNS
  ,/,
  according/VBG
  to/TO
  (PERSON Brooke/NNP T./NNP Mossman/NNP)
  ,/,
  a/DT
  professor/NN
  of/IN
  pathlogy/NN
  at/IN
  the/DT
  (ORGANIZATION University/NNP)
  of/IN
  (PERSON Vermont/NNP College/NNP)
  of/IN
  (GPE Medicine/NNP)
  ./.)


## Beziehungen erkennen 
Beziehungen zwischen benannten Entitäten in Texten erkennen, hier zwischen Organisationen (ORG) und Orten (LOC).

In [5]:
import re
# Regular expression for the text between the two entities: any text followed
# by the standalone word "in". The negative lookahead rejects the match when
# the text after "in" contains "ing" (presumably to skip gerund constructions
# -- confirm intent).
IN = re.compile(r'.*\bin\b(?!\b.+ing)')

# Go through all parsed documents of the IEER file NYT_19980315.
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    # Collect the relations between an ORG and a LOC that follow the pattern IN,
    # i.e. [ORG] ... in [LOC]
    org_loc_relations = nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN)
    for rel in org_loc_relations:
        print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']
